In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier

In [2]:
df=pd.read_csv('novagen_dataset.csv')

In [3]:
df.head()

Unnamed: 0,Age,BMI,Blood_Pressure,Cholesterol,Glucose_Level,Heart_Rate,Sleep_Hours,Exercise_Hours,Water_Intake,Stress_Level,...,Diet,MentalHealth,PhysicalActivity,MedicalHistory,Allergies,Diet_Type__Vegan,Diet_Type__Vegetarian,Blood_Group_AB,Blood_Group_B,Blood_Group_O
0,2.0,26.0,111.0,198.0,99.0,72.0,4.0,1.0,5.0,5.0,...,1,2,1,0,1,False,True,True,False,False
1,8.0,24.0,121.0,199.0,103.0,75.0,2.0,1.0,2.0,9.0,...,1,2,1,2,2,False,False,True,False,False
2,81.0,27.0,147.0,203.0,100.0,74.0,10.0,-0.0,5.0,1.0,...,2,0,0,1,0,True,False,False,False,False
3,25.0,21.0,150.0,199.0,102.0,70.0,7.0,3.0,3.0,3.0,...,1,2,1,2,0,True,False,False,True,False
4,24.0,26.0,146.0,202.0,99.0,76.0,10.0,2.0,5.0,1.0,...,2,0,2,0,2,False,True,False,True,False


In [9]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 9549 entries, 0 to 9548
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    9549 non-null   float64
 1   BMI                    9549 non-null   float64
 2   Blood_Pressure         9549 non-null   float64
 3   Cholesterol            9549 non-null   float64
 4   Glucose_Level          9549 non-null   float64
 5   Heart_Rate             9549 non-null   float64
 6   Sleep_Hours            9549 non-null   float64
 7   Exercise_Hours         9549 non-null   float64
 8   Water_Intake           9549 non-null   float64
 9   Stress_Level           9549 non-null   float64
 10  Target                 9549 non-null   int64  
 11  Smoking                9549 non-null   int64  
 12  Alcohol                9549 non-null   int64  
 13  Diet                   9549 non-null   int64  
 14  MentalHealth           9549 non-null   int64  
 15  PhysicalActivit

In [10]:
df['Target']

0       1
1       1
2       0
3       0
4       0
       ..
9544    0
9545    0
9546    0
9547    0
9548    0
Name: Target, Length: 9549, dtype: int64

In [11]:
df.duplicated().sum()


0

In [12]:

# Split features and target
X = df.drop("Target", axis=1)
y = df["Target"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [13]:
scaler=StandardScaler()
X_trained_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

model-1 LogisticRegression

In [23]:
lor=LogisticRegression(
    penalty='elasticnet',solver='saga',max_iter=5000)
lor.fit(X_trained_scaled,y_train)
y_pred=lor.predict(X_test_scaled)
# In Model Evaluation, Recall is more important than accuracy 
# because missing a high-risk patient is dangerous

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Logistic Regression Recall:", recall_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.8141361256544503
Logistic Regression Recall: 0.8283132530120482
              precision    recall  f1-score   support

           0       0.81      0.80      0.80       914
           1       0.82      0.83      0.82       996

    accuracy                           0.81      1910
   macro avg       0.81      0.81      0.81      1910
weighted avg       0.81      0.81      0.81      1910



model-2 KNN

In [26]:
knn=KNeighborsClassifier(
    n_neighbors=5,
    metric='euclidean'
)

knn.fit(X_trained_scaled, y_train)

y_pred_knn = knn.predict(X_test_scaled)

print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("KNN Recall:", recall_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn))

KNN Accuracy: 0.8832460732984293
KNN Recall: 0.8835341365461847
              precision    recall  f1-score   support

           0       0.87      0.88      0.88       914
           1       0.89      0.88      0.89       996

    accuracy                           0.88      1910
   macro avg       0.88      0.88      0.88      1910
weighted avg       0.88      0.88      0.88      1910



model-3 Radndom Forest

In [28]:
rf=RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42
)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Recall:", recall_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9382198952879581
Random Forest Recall: 0.9588353413654619
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       914
           1       0.93      0.96      0.94       996

    accuracy                           0.94      1910
   macro avg       0.94      0.94      0.94      1910
weighted avg       0.94      0.94      0.94      1910



model-4 Gradient Boosting

In [30]:
gb=GradientBoostingClassifier(n_estimators=500, learning_rate=0.5,
    max_depth=3, random_state=0)
gb.fit(X_trained_scaled,y_train)
y_pred=gb.predict(X_test_scaled)

print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred))
print("Gradient Boosting Recall:", recall_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Gradient Boosting Accuracy: 0.9476439790575916
Gradient Boosting Recall: 0.9518072289156626
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       914
           1       0.95      0.95      0.95       996

    accuracy                           0.95      1910
   macro avg       0.95      0.95      0.95      1910
weighted avg       0.95      0.95      0.95      1910



model-5 VotingClassifier

In [36]:
voting_clf = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=1000, solver="liblinear")),
        ("knn", KNeighborsClassifier(n_neighbors=5)),
        ("rf", RandomForestClassifier(n_estimators=200 ,random_state=42))
    ],
    voting="soft"
)

voting_clf.fit(X_trained_scaled, y_train)

y_pred_vote = voting_clf.predict(X_test_scaled)

print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred_vote))
print("Voting Classifier Recall:", recall_score(y_test, y_pred_vote))
print(classification_report(y_test, y_pred_vote))


Voting Classifier Accuracy: 0.9157068062827225
Voting Classifier Recall: 0.929718875502008
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       914
           1       0.91      0.93      0.92       996

    accuracy                           0.92      1910
   macro avg       0.92      0.92      0.92      1910
weighted avg       0.92      0.92      0.92      1910



model-6 Decision tree


In [50]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier(max_depth=None)
dt.fit(X_trained_scaled,y_train)
y_pred=dt.predict(X_test_scaled)
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred))
print("Voting Classifier Recall:", recall_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Voting Classifier Accuracy: 0.8890052356020942
Voting Classifier Recall: 0.9036144578313253
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       914
           1       0.89      0.90      0.89       996

    accuracy                           0.89      1910
   macro avg       0.89      0.89      0.89      1910
weighted avg       0.89      0.89      0.89      1910



model-7 Bagging Classifier

In [51]:
from sklearn.ensemble import BaggingClassifier
bag=BaggingClassifier(
    estimator=dt,
    n_estimators=150
)
bag.fit(X_trained_scaled,y_train)
y_pred=bag.predict(X_test_scaled)
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred))
print("Voting Classifier Recall:", recall_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Voting Classifier Accuracy: 0.9403141361256544
Voting Classifier Recall: 0.9608433734939759
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       914
           1       0.93      0.96      0.94       996

    accuracy                           0.94      1910
   macro avg       0.94      0.94      0.94      1910
weighted avg       0.94      0.94      0.94      1910



In [54]:
from sklearn.ensemble import BaggingClassifier
bag=BaggingClassifier(
    estimator=knn,
    n_estimators=400
)
bag.fit(X_trained_scaled,y_train)
y_pred=bag.predict(X_test_scaled)
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred))
print("Voting Classifier Recall:", recall_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Voting Classifier Accuracy: 0.8942408376963351
Voting Classifier Recall: 0.8975903614457831
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       914
           1       0.90      0.90      0.90       996

    accuracy                           0.89      1910
   macro avg       0.89      0.89      0.89      1910
weighted avg       0.89      0.89      0.89      1910



In [59]:
data={
    "Model":[
        'LogisticRegression','KNN','RandomForest','Gradient Boosting','Voting Classifier',
        'Decision Tree','Bagging Classifier'
    ],
    "Recall":['82.8%','88.3%','95.8%','94.9%','93.07%','90.3%','96.08%']
}

In [60]:
df=pd.DataFrame(data)

In [62]:
df

Unnamed: 0,Model,Recall
0,LogisticRegression,82.8%
1,KNN,88.3%
2,RandomForest,95.8%
3,Gradient Boosting,94.9%
4,Voting Classifier,93.07%
5,Decision Tree,90.3%
6,Bagging Classifier,96.08%


Best Classifier that we should use for NovaGen(based on Recall) is- Bagging Classifier with base model Decision tree with accuracy