In [52]:
# imports
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [53]:
# Load the malaria dataset from CSV; the path is resolved relative to the
# notebook's working directory.
csv_path = 'data/malaria_dataset.csv'
df = pd.read_csv(csv_path)

In [54]:
# Sanity-check the loaded frame: dimensions, per-column null counts, and a
# preview of the first rows (the trailing expression is the cell's output).
null_counts = df.isnull().sum()
print(f"Shape: {df.shape}")
print(f"Missing values: {null_counts}")
df.head()

Shape: (1622, 21)
Missing values: IP_Number               0
Age                     0
Sex                     0
Residence_Area          0
DOA                     0
Discharge_Date          0
Fever                   0
Headache                0
Abdominal_Pain          0
General_Body_Malaise    0
Dizziness               0
Vomiting                0
Confusion               0
Backache                0
Chest_Pain              0
Coughing                0
Joint_Pain              0
Primary_Code            0
Diagnosis_Type          0
Target                  0
Risk_Score              0
dtype: int64


Unnamed: 0,IP_Number,Age,Sex,Residence_Area,DOA,Discharge_Date,Fever,Headache,Abdominal_Pain,General_Body_Malaise,...,Vomiting,Confusion,Backache,Chest_Pain,Coughing,Joint_Pain,Primary_Code,Diagnosis_Type,Target,Risk_Score
0,14xxxx31,52,Female,Mangalore,31-10-2015 20:42,05-11-2015 05:16,0,0,0,1,...,0,0,1,0,0,0,B50.9,Mixed Malaria Infection,0,3
1,28xxxx34,75,Female,Shimoga,03-02-2015 23:28,13-02-2015 19:27,1,0,1,1,...,0,1,0,1,1,1,B50.9,Mixed Malaria Infection,1,11
2,96xxxx43,30,Female,Mangalore,15-11-2019 12:31,19-11-2019 14:31,1,1,1,1,...,0,1,1,1,0,1,B50.9,Mixed Malaria Infection,1,13
3,49xxxx87,89,Female,Mangalore,17-05-2017 17:50,23-05-2017 13:22,0,0,0,0,...,1,1,1,1,0,1,B54,Plasmodium vivax Malaria without complication,0,5
4,48xxxx10,62,Male,Shimoga,26-06-2015 15:29,27-06-2015 23:35,0,1,0,1,...,1,1,0,0,0,0,B51.0,Plasmodium falciparum Malaria without complica...,1,8


In [None]:
# Name of the label column; later cells refer to it through this variable.
target = 'Target'

# Class balance: number of rows for each label value.
label_counts = df[target].value_counts()
print(label_counts)


In [None]:
# Show the label balance side by side: absolute counts (bar chart) and
# percentage shares (pie chart).
target_counts = df[target].value_counts()
fig, (ax_bar, ax_pie) = plt.subplots(1, 2, figsize=(10, 4))
target_counts.plot(kind='bar', ax=ax_bar, title='Target Distribution')
target_counts.plot(kind='pie', ax=ax_pie, autopct='%1.1f%%')
plt.tight_layout()
plt.show()

# Feature Analysis

In [None]:
# Partition the column names by dtype: numeric columns vs. object-dtype columns.
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)

In [None]:
# Display the list of numeric column names as the cell output.
numeric_cols

In [None]:
# Display the list of object-dtype column names as the cell output.
categorical_cols

In [None]:
# Symptom indicator columns we expect in the dataset. Filtering against
# df.columns keeps only the ones actually present, in the frame's column order.
symptom_names = [
    'Fever', 'Headache', 'Abdominal_Pain', 'General_Body_Malaise',
    'Dizziness', 'Vomiting', 'Confusion', 'Backache',
    'Chest_Pain', 'Coughing', 'Joint_Pain',
]
symptom_cols = [col for col in df.columns if col in symptom_names]

In [None]:
# Column-wise sum of each symptom column, largest first. With the 0/1 flags
# seen in the preview this is the number of patients reporting each symptom.
symptom_prev = df[symptom_cols].sum().sort_values(ascending=False)

# Horizontal bar chart of the symptom totals.
freq_fig, freq_ax = plt.subplots(figsize=(10, 5))
symptom_prev.plot.barh(ax=freq_ax)
freq_ax.set_title('Symptom Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of each symptom with the label (pandas' default method for
# DataFrame.corr is Pearson).
symptom_corr = df[symptom_cols + [target]].corr()

# Keep only the label's column of the matrix, drop its self-correlation, and
# rank the symptoms from most to least positively correlated.
corr_with_target = symptom_corr[target].drop(target).sort_values(ascending=False)

print("Symptoms most correlated with malaria:")
print(corr_with_target.head(5))

In [None]:
# Print a heading followed by a summary for each demographic column:
# descriptive statistics for Age, category counts for Sex.
demographic_summaries = (
    ("\nAge distribution:", df['Age'].describe()),
    ("\nSex distribution:", df['Sex'].value_counts()),
)
for heading, summary in demographic_summaries:
    print(heading)
    print(summary)


In [None]:
# Drop columns that should not be fed to the models.
print("Original columns:", df.columns.tolist())
target = 'Target'

# NOTE(review): the reasons below are inferred from the column names and the
# preview rows — confirm against the dataset documentation.
columns_to_drop = [
    'IP_Number',       # masked patient identifier, not a predictive feature
    'DOA',             # raw admission timestamp string
    'Discharge_Date',  # raw discharge timestamp string
    'Primary_Code',    # diagnosis code — presumably recorded after diagnosis (label leakage)
    'Diagnosis_Type',  # diagnosis text — presumably recorded after diagnosis (label leakage)
    'Risk_Score',      # presumably derived from the symptoms/label — TODO confirm
]

# DataFrame.drop returns a new frame, so the raw `df` stays untouched and this
# cell can be re-run safely.
df_clean = df.drop(columns=columns_to_drop)
print(f"\n Dropped {len(columns_to_drop)} columns")
print(f"Remaining columns: {df_clean.columns.tolist()}")

In [None]:
# Encode categorical (Sex, Residence_Area, etc.)
# Every object-dtype column except the label is replaced in place by integer
# codes; the fitted encoder is kept per column so the same mapping can be
# reused later (the encoders are pickled at the end of the notebook).
# NOTE(review): this cell mutates df_clean, so re-running it without first
# re-running the drop cell finds no object columns and leaves `encoders` empty.
cat_cols = df_clean.select_dtypes(include=['object']).columns
encoders = {}
for col in (name for name in cat_cols if name != target):
    encoder = LabelEncoder()
    values_as_text = df_clean[col].astype(str)
    df_clean[col] = encoder.fit_transform(values_as_text)
    encoders[col] = encoder

print(f"\n Clean dataset: {df_clean.shape}")

In [None]:
# Train / validation / test split.
y = df_clean[target]
X = df_clean.drop(columns=[target])
feature_names = X.columns.tolist()

# Step 1: hold out 20% of all rows as the final test set, stratified on the label.
X_full, X_test, y_full, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 2: take 25% of the remaining 80% as validation (20% of the total),
# giving an overall 60/20/20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full,
    test_size=0.25,
    stratify=y_full,
    random_state=42,
)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# Training

In [None]:
# LOGISTIC REGRESSION
# Fit on the training split and score on the validation split.
# `fit` returns the estimator itself, so construction and fitting are chained.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard class predictions, and the probability of the positive class (column 1).
y_pred_lr = lr.predict(X_val)
y_proba_lr = lr.predict_proba(X_val)[:, 1]

lr_accuracy = accuracy_score(y_val, y_pred_lr)
lr_auc = roc_auc_score(y_val, y_proba_lr)

print("LOGISTIC REGRESSION")
print(f"Accuracy: {lr_accuracy:.4f}")
print(f"ROC-AUC: {lr_auc:.4f}")
print(classification_report(y_val, y_pred_lr))

#  DECISION TREE
# Depth-limited tree fitted on the training split and scored on the
# validation split. DecisionTreeClassifier is imported in the notebook's
# import cell rather than in the middle of this cell.
dt = DecisionTreeClassifier(max_depth=10, random_state=42)
dt.fit(X_train, y_train)

# Hard class predictions, and the probability of the positive class (column 1).
y_pred_dt = dt.predict(X_val)
y_proba_dt = dt.predict_proba(X_val)[:, 1]

print("DECISION TREE")
print(f"Accuracy: {accuracy_score(y_val, y_pred_dt):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_val, y_proba_dt):.4f}")

# RANDOM FOREST
# 100 depth-limited trees, fitted on the training split and scored on the
# validation split. `fit` returns the estimator itself.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Hard class predictions, and the probability of the positive class (column 1).
y_pred_rf = rf.predict(X_val)
y_proba_rf = rf.predict_proba(X_val)[:, 1]

rf_accuracy = accuracy_score(y_val, y_pred_rf)
rf_auc = roc_auc_score(y_val, y_proba_rf)

print("RANDOM FOREST")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"ROC-AUC: {rf_auc:.4f}")

#  XGBOOST
# Gradient-boosted trees fitted on the training split and scored on the
# validation split. Hyperparameters are gathered in one dict for readability.
xgb_params = {
    'n_estimators': 100,
    'max_depth': 6,
    'learning_rate': 0.1,
    'random_state': 42,
    'eval_metric': 'auc',
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Hard class predictions, and the probability of the positive class (column 1).
y_pred_xgb = xgb_model.predict(X_val)
y_proba_xgb = xgb_model.predict_proba(X_val)[:, 1]

xgb_accuracy = accuracy_score(y_val, y_pred_xgb)
xgb_auc = roc_auc_score(y_val, y_proba_xgb)

print("XGBOOST")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"ROC-AUC: {xgb_auc:.4f}")

#  MODEL COMPARISON
# Validation outputs per model: (hard predictions, positive-class probabilities).
val_outputs = {
    'Logistic Regression': (y_pred_lr, y_proba_lr),
    'Decision Tree': (y_pred_dt, y_proba_dt),
    'Random Forest': (y_pred_rf, y_proba_rf),
    'XGBoost': (y_pred_xgb, y_proba_xgb),
}

# One row per model, ranked by validation ROC-AUC (best first).
results = pd.DataFrame({
    'Model': list(val_outputs),
    'Accuracy': [accuracy_score(y_val, pred) for pred, _ in val_outputs.values()],
    'ROC-AUC': [roc_auc_score(y_val, proba) for _, proba in val_outputs.values()],
}).sort_values('ROC-AUC', ascending=False)

print("\n MODEL COMPARISON")
print(results)

# Visualize
results.plot(kind='bar', x='Model', y=['Accuracy', 'ROC-AUC'], figsize=(10, 5))
plt.title('Model Performance Comparison')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# SELECT BEST MODEL
# `results` is sorted by validation ROC-AUC in descending order, so the first
# row is the winner.
best_row = results.iloc[0]
best_model_name = best_row['Model']
best_auc = best_row['ROC-AUC']

# Map every name that appears in `results` to its fitted estimator. The old
# if/elif chain had no branch for 'Decision Tree', so a winning tree fell
# through to logistic regression while the printout still named the tree.
# An unknown name now raises KeyError instead of silently picking another model.
trained_models = {
    'Logistic Regression': lr,
    'Decision Tree': dt,
    'Random Forest': rf,
    'XGBoost': xgb_model,
}
final_model = trained_models[best_model_name]

print(f"\n Best Model: {best_model_name} (AUC: {best_auc:.4f})")

#  TEST SET EVALUATION
# Score the selected model once on the held-out test split, which was not
# used for fitting or for model selection.
y_test_pred = final_model.predict(X_test)
y_test_proba = final_model.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred)
test_auc = roc_auc_score(y_test, y_test_proba)
test_report = classification_report(y_test, y_test_pred)

print("FINAL TEST SET PERFORMANCE")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"ROC-AUC: {test_auc:.4f}")
print("\n" + test_report)

In [None]:
# SAVE MODEL
# Persist everything needed to score new records later: the selected model,
# the fitted per-column label encoders, and the feature column order.
# `pickle` is imported in the notebook's import cell.
# NOTE: only unpickle these files from a trusted location — pickle.load can
# execute arbitrary code.
artifacts = {
    'model.pkl': final_model,
    'encoders.pkl': encoders,
    'features.pkl': feature_names,
}
for filename, obj in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

print("\n Model saved!")