In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [None]:

# Announce the load, then read the raw CSV into `df`; every later cell
# derives its data from this frame.
print("Loading data...")
df = pd.read_csv(filepath_or_buffer='../data/dataset.csv')

In [None]:
# Turn every row of `df` into one transaction: the list of its non-missing
# cell values from the second column onwards, with surrounding whitespace
# stripped and underscores replaced by spaces.
# NOTE(review): column 0 is skipped — presumably an identifier/label column;
# confirm against the dataset.
item_cells = df.iloc[:, 1:].to_numpy(dtype=object)  # one slice instead of one .iloc call per row
transactions = [
    # str() keeps a non-string cell (e.g. from a numeric column) from raising
    # AttributeError on .strip(); string cells are unaffected.
    [str(cell).strip().replace('_', ' ') for cell in row if pd.notna(cell)]
    for row in item_cells
]

print(f"Success! Processed {len(transactions)} patient cases.")

In [None]:
# One-hot encode the transactions: the encoder first learns the set of
# distinct items, then maps each transaction onto that item vocabulary.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)

# Wrap the encoded array in a DataFrame whose columns are the learned item labels.
df_encoded = pd.DataFrame(data=te_ary, columns=te.columns_)

In [None]:
print("Running Apriori with stricter limits...")

# Mine frequent itemsets from the one-hot encoded frame.
frequent_itemsets = apriori(df_encoded,
                            min_support=0.05,    # minimum support threshold for an itemset
                            use_colnames=True,   # report item labels rather than column indices
                            max_len=3,           # cap itemsets at three items
                            low_memory=True)     # see mlxtend docs: lower memory use, slower search

print(f"Found {len(frequent_itemsets)} frequent itemsets.")

# Derive association rules only when there is something to derive them from.
if len(frequent_itemsets) > 0:
    # Keep rules whose lift is at least 1.2, strongest first.
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
    rules = rules.sort_values(by='lift', ascending=False)
    print(f"Generated {len(rules)} rules.")
else:
    # Always bind `rules`: otherwise the cell that saves it raises NameError on
    # a fresh kernel, or silently writes a stale `rules` left over from an
    # earlier run. The columns match the ones the save cell selects.
    rules = pd.DataFrame(
        columns=['antecedents', 'consequents', 'support', 'confidence', 'lift']
    )
    print("No itemsets found! Try lowering min_support to 0.03.")

In [None]:

# Write the headline rule columns to CSV (no index column).
rules_path = Path('../results/rules.csv')
# Create the output directory first so a fresh checkout without a results/
# folder does not fail on the write.
rules_path.parent.mkdir(parents=True, exist_ok=True)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].to_csv(rules_path, index=False)
print("Saved rules to results/rules.csv")

In [None]:

# Horizontal bar chart of the 15 items that appear in the most transactions,
# saved as a PNG and then displayed.
chart_path = Path('../results/symptom_chart.png')
# Create the output directory first so saving works on a fresh checkout.
chart_path.parent.mkdir(parents=True, exist_ok=True)

# Column sums of the one-hot frame count the transactions containing each item.
count = df_encoded.sum().sort_values(ascending=False).head(15)

fig, ax = plt.subplots(figsize=(10, 6))
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; confirm the installed version before changing this call.
sns.barplot(x=count.values, y=count.index, palette='viridis', ax=ax)
ax.set_title('Top 15 Most Frequent Symptoms')
ax.set_xlabel('Number of Patients')

# Save before show(); bbox_inches='tight' keeps long y-axis labels from being
# clipped in the saved file.
fig.savefig(chart_path, bbox_inches='tight')
print("Saved chart to results/symptom_chart.png")
plt.show()