## Association rule mining simulation

### Dependencies

In [1]:
from drug_couse.arm.build import (
    simulated_patient_drug_combination,
    DrugAssociationAnalyzer,
    compare_file_sizes,
    load_parquet_data,
)
import os
from pathlib import Path

# Directory the notebook switches into before running; later cells refer to
# their output files by relative name, so those paths resolve against it.
# The location can be overridden through the DRUG_COUSE_WORKDIR environment
# variable; the original hard-coded path remains the default so existing
# setups keep working unchanged.
working_dir = Path(
    os.environ.get("DRUG_COUSE_WORKDIR", "C:/my_disk/____tmp/drug_couse")
)
# working_dir is already a Path, so it is used directly (no re-wrapping).
working_dir.mkdir(parents=True, exist_ok=True)
os.chdir(working_dir)

### Generate simulated data with `simulated_patient_drug_combination`

In [2]:
# Build the simulated data set used by the rest of the notebook.
# NOTE(review): the argument is presumably the number of simulated patients
# (the preprocessing output later reports 1000 patients) - confirm against
# the function's signature.
n_patients = 1000
df = simulated_patient_drug_combination(n_patients)

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,pat_id,drg_combination,mode_of_dispensation
0,6437,"Drug A, Drug B, Drug 2","capsule, capsule, capsule"
1,3480,"Drug X, Drug Y, Drug W","cream, cream, cream"
2,9984,"Drug X, Drug Y, Drug W","cream, cream, ointment"
3,8033,"Drug 1, Drug 2","pill, tablet"
4,5610,"Drug A, Drug B, Drug D","tablet, capsule, capsule"


### Automatically analyze ALL drug pairs with enhanced export

In [3]:
# Section banner: a title line followed by a 60-character rule, emitted with
# a single print call (one argument per output line).
print("\nüöÄ Creating Complete Drug Relationships Matrix:", "=" * 60, sep="\n")


üöÄ Creating Complete Drug Relationships Matrix:


### Initialize the analyzer and run the full analysis pipeline

In [4]:
# Wrap the simulated data frame in the association-rule analyzer.
analyzer = DrugAssociationAnalyzer(df)

# Step 1: Preprocess data
# The printed sample below shows one boolean column per drug and one row per
# record of the input frame.
transactions = analyzer.preprocess_data()

# The `f` prefix is required here: without it the placeholder
# "{transactions.shape}" is printed literally instead of the actual shape.
print(f"\nüìã Transaction Matrix Shape: {transactions.shape}")
print("Sample of transaction matrix:")
print(transactions.head())

# Step 2: report basic usage statistics (the analyzer prints them itself).
analyzer.get_drug_statistics()

# Step 3: mine frequent itemsets.
# The support threshold is kept low because the data set is small.
frequent_itemsets = analyzer.find_frequent_itemsets(min_support=0.2)

# The miner may return None, in which case there is nothing to show.
if frequent_itemsets is not None:
    print("\nüìä Top 10 Frequent Itemsets:")
    top_itemsets = frequent_itemsets.nlargest(10, "support")
    print(top_itemsets)

# Step 4: derive association rules, filtered on confidence.
rules = analyzer.generate_association_rules(
    metric="confidence",
    min_threshold=0.5,
)

# Build the relationship matrix for all drug pairs and export it to Excel
# under the given file name.
matrix_export_options = {
    "export_to_excel": True,
    "filename": "complete_drug_relationships_matrix.xlsx",
}
all_pairs_matrix = analyzer.create_all_pairs_relationship_matrix(
    **matrix_export_options
)


# Print headline counts and describe the generated files, but only when the
# pair matrix was actually produced.
if all_pairs_matrix is not None:
    print("\nüìà Complete Analysis Summary:")
    print(f"   Total drug pairs analyzed: {len(all_pairs_matrix):,}")

    # Each subset is built right before its line is printed so the order of
    # evaluation matches the order of the report.
    high_confidence = all_pairs_matrix[all_pairs_matrix["Confidence"] >= 0.6]
    print(f"   High confidence relationships (60%+): {len(high_confidence):,}")

    strong_lift = all_pairs_matrix[all_pairs_matrix["Lift"] >= 2.0]
    print(f"   Strong associations (Lift 2.0+): {len(strong_lift):,}")

    high_priority = all_pairs_matrix[all_pairs_matrix["Clinical_Priority"] == "High"]
    print(f"   High priority relationships: {len(high_priority):,}")

    # Compare file sizes for the complete matrix
    compare_file_sizes("complete_drug_relationships_matrix")

    # Static description of the exported artefacts, one entry per output line.
    output_notes = (
        "\nüìÅ Output Files Generated:",
        "   üìä Excel File: complete_drug_relationships_matrix.xlsx",
        "      ‚Ä¢ Multiple sheets with filtered views",
        "      ‚Ä¢ Pivot tables for easy analysis",
        "      ‚Ä¢ Summary statistics",
        "   üì¶ Parquet Files: Multiple .parquet files for different views",
        "      ‚Ä¢ Faster loading for large datasets",
        "      ‚Ä¢ Better compression than CSV",
        "      ‚Ä¢ Preserves data types",
    )
    print(*output_notes, sep="\n")

üîÑ Preprocessing data...
‚úÖ Data preprocessed: 1000 patients, 12 unique drugs

üìã Transaction Matrix Shape: {transactions.shape}
Sample of transaction matrix:
   Drug 1  Drug 2  Drug 3  Drug 4  Drug A  Drug B  Drug C  Drug D  Drug W  \
0   False    True   False   False    True    True   False   False   False   
1   False   False   False   False   False   False   False   False    True   
2   False   False   False   False   False   False   False   False    True   
3    True    True   False   False   False   False   False   False   False   
4   False   False   False   False    True    True   False    True   False   

   Drug X  Drug Y  Drug Z  
0   False   False   False  
1    True    True   False  
2    True    True   False  
3   False   False   False  
4   False   False   False  

üìà Drug Usage Statistics:
Total patients: 1000
Total unique drugs: 12

Top 10 most prescribed drugs:
  Drug X: 374 patients (37.4%)
  Drug Y: 358 patients (35.8%)
  Drug 2: 356 patients (35.6%)
  Drug 1

### Demonstration of loading parquet data

In [5]:
# Reload the exported matrix from its Parquet file and preview a few columns.
print("\nüîÑ Demonstration: Loading Parquet Data")
sample_parquet = load_parquet_data("complete_drug_relationships_matrix.parquet")

# The loader may return None; only preview when a frame came back.
if sample_parquet is not None:
    print("   Sample of loaded data:")
    preview_columns = ["Drug_A", "Drug_B", "Confidence_%", "Lift", "Rule_Strength"]
    preview = sample_parquet.head(3)[preview_columns].to_string(index=False)
    # to_string() returns a multi-line table.  Prefixing the whole string
    # indents only its first line and shifts the header against the rows, so
    # the three-space indent is applied to every line instead.
    for table_line in preview.splitlines():
        print(f"   {table_line}")

# Closing banner: a 60-character rule above and below a fixed feature list.
banner_rule = "=" * 60

print("\n" + banner_rule)
print(
    "üéâ Enhanced Analysis Complete with Dual Export!",
    "üìã Key Features:",
    "   ‚úÖ Excel export with multiple sheets",
    "   ‚úÖ Parquet export for large datasets",
    "   ‚úÖ Automatic handling of Excel row limits",
    "   ‚úÖ File size comparisons",
    "   ‚úÖ Easy data loading utilities",
    sep="\n",
)
print(banner_rule)


üîÑ Demonstration: Loading Parquet Data
‚úÖ Loaded parquet file: complete_drug_relationships_matrix.parquet
   Shape: (96, 14)
   Columns: ['Drug_A', 'Drug_B', 'Patients_A', 'Patients_B', 'Patients_Both', 'Support', 'Confidence', 'Confidence_%', 'Lift', 'Leverage', 'Conviction', 'Rule_Strength', 'Clinical_Priority', 'Relationship_Type']
   Sample of loaded data:
   Drug_A Drug_B  Confidence_%  Lift Rule_Strength
Drug D Drug B         100.0 2.890   Very Strong
Drug C Drug B         100.0 2.890   Very Strong
Drug D Drug A         100.0 2.874   Very Strong

üéâ Enhanced Analysis Complete with Dual Export!
üìã Key Features:
   ‚úÖ Excel export with multiple sheets
   ‚úÖ Parquet export for large datasets
   ‚úÖ Automatic handling of Excel row limits
   ‚úÖ File size comparisons
   ‚úÖ Easy data loading utilities


### Self run: execute the test script from a terminal

In [None]:
"""
cd .\tests\
uv run .\arm_test.py
"""