# Split the data into training and testing

In [2]:
# Import the libraries
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the data
path = r"..\EDA\engineered_COPD_data.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Age,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,BMI,Air_Pollution_Level,Respiratory_Infections_Childhood,COPD_Diagnosis,Pollution_Risk_Score,Smoking_Status_Encoded,...,Smoking_Pollution_Interaction,Location_Biratnagar,Location_Butwal,Location_Chitwan,Location_Dharan,Location_Hetauda,Location_Kathmandu,Location_Lalitpur,Location_Nepalgunj,Location_Pokhara
0,31,1,1,1,27.56,84,0,0,0,0.5,...,42.0,False,False,False,False,False,False,True,False,False
1,60,1,0,0,30.3,131,1,0,0,0.0,...,0.0,False,False,False,False,False,False,False,False,True
2,33,0,0,1,28.45,123,1,0,0,0.5,...,61.5,False,False,False,False,False,False,False,False,True
3,36,1,0,0,27.49,253,0,1,1,1.0,...,253.0,False,False,False,False,False,True,False,False,False
4,58,0,0,0,25.49,117,1,0,0,0.0,...,0.0,False,False,False,False,False,False,False,False,True


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               1000 non-null   int64  
 1   Biomass_Fuel_Exposure             1000 non-null   int64  
 2   Occupational_Exposure             1000 non-null   int64  
 3   Family_History_COPD               1000 non-null   int64  
 4   BMI                               1000 non-null   float64
 5   Air_Pollution_Level               1000 non-null   int64  
 6   Respiratory_Infections_Childhood  1000 non-null   int64  
 7   COPD_Diagnosis                    1000 non-null   int64  
 8   Pollution_Risk_Score              1000 non-null   int64  
 9   Smoking_Status_Encoded            1000 non-null   float64
 10  Gender_Encoded                    1000 non-null   int64  
 11  Smoking_Pollution_Interaction     1000 non-null   float64
 12  Locatio

In [17]:
# Define the features(all other categories) and the target variables(COPD_Diagnosis)
X = df.drop(columns=['COPD_Diagnosis']) #Features
y = df['COPD_Diagnosis'] #Target variable

In [18]:
#Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#Test size 20% of the whole data

# Model Training
- Based on the data, we will be using the following models for predicting, as the target variable i.e. COPD_Diagnosis is a binary classification(someone either has COPD/1 or doesnt/0 )
    - Logistic Regression
    - Decision Trees
    - Random Forest

In [19]:
# Import the nescessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

#library to save the models
import pickle

In [20]:
#Initialize the models
models = {
    'Logistic Regression' : LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

In [21]:
#Train the models
for name, model in models.items():
    model.fit(X_train, y_train)

    #Save the models
    with open(f'{name.replace(" ", "_")}.pkl', 'wb') as f:
        pickle.dump(model,f)
    print(f"{name} model trained and saved")

print("Model training Completed!")

Logistic Regression model trained and saved
Decision Tree model trained and saved
Random Forest model trained and saved
Model training Completed!


# Evaluate the models
- Accuracy, Precision, Recall, F1_score

In [22]:
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} Evaluation: ")
    print(classification_report(y_test, y_pred))


Logistic Regression Evaluation: 
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       134
           1       0.95      0.94      0.95        66

    accuracy                           0.96       200
   macro avg       0.96      0.96      0.96       200
weighted avg       0.96      0.96      0.96       200


Decision Tree Evaluation: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200


Random Forest Evaluation: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00     

### Note:
As we can see that both **Decision Trees and Random Forest** has **1.00** socre for accuracy, precision and F1 Score, so both of them would be best models. But, even if the score is **1.00** there are few things to consider such as **Overfitting, Data Imbalance, Test Set Size, etc.** before concluding it being perfect so we choose any of the two for refinement. In this case, Random Forest.

# Model Refinement

### Random Forest Model Refinement

In [23]:
#Import the libraries
from sklearn.model_selection import GridSearchCV

In [30]:
#Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth' : [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

In [31]:
#Initialize the GridSearchCV
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid= param_grid, cv=5, n_jobs=-1, scoring='accuracy')

In [32]:
#Fit the Grid Search CV
grid_search_rf.fit(X_train, y_train)

In [33]:
#Best Parameters
print(f"Best Parameteres: {grid_search_rf.best_params_}")
best_model = grid_search_rf.best_estimator_

Best Parameteres: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}


In [34]:
#Save the model
with open('Best_Random_Forest_Model.pkl','wb') as f:
    pickle.dump(best_model, f)

print("Model refinement completed and best modelsaved!")

Model refinement completed and best modelsaved!
