In [7]:
import json
import pandas as pd 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib

In [8]:
x_features=pd.read_csv('../processed/x_features.csv')
with open("../processed/feature_selection.json", "r") as f:
    feature_names = json.load(f)

x=x_features[feature_names]   
y=pd.read_csv("../processed/target.csv").values.ravel()


X_train, X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

 Hyperparameter Tuning by using GridSearchCV

In [9]:
#to enhanced performance of the models 

models={
    'SVC': {
        'pipeline': make_pipeline(MinMaxScaler(), SVC()),
        'params':{
            'svc__C': [0.01, 0.1,1,10],  #svc__ : for telling grid search this is a 'svc' parameter not 'minmax' param
            'svc__kernel':['linear','rbf'],
            'svc__gamma':['scale','auto']
        }
    },
    
    'Random_forest':{
        'pipeline':RandomForestClassifier(),
        'params':{
            'n_estimators': [50, 100],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5]

        }
    },
    
    'logistic_regression':{
        'pipeline': make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=1000)),
        'params':{
            'logisticregression__C': [0.01, 0.1, 1, 10],
            'logisticregression__penalty': ['l2'],
            'logisticregression__solver': ['lbfgs', 'liblinear']
        }
    },
    
    'Decision_tree':{
        'pipeline':DecisionTreeClassifier(),
        'params':{
            'max_depth': [3,5,10,None],
            'criterion': ['gini', 'entropy']
        }
    }
}


compare between baseline accuracy and tuned accuracy 

In [10]:
results=[]

for model_name, model_param in models.items():
    
    base_model=model_param['pipeline']
    base_model.fit(X_train,y_train)
    y_pred_base=base_model.predict(X_test)
    base_acc=accuracy_score(y_test,y_pred_base)
    
    
    gs=GridSearchCV(model_param['pipeline'],model_param['params'],cv=5,scoring='accuracy',n_jobs=-1)
    gs.fit(X_train,y_train)
    best_model=gs.best_estimator_
    y_pred_tuned=best_model.predict(X_test)
    tuned_acc=accuracy_score(y_test,y_pred_tuned)
    
    results.append({
        'Model':model_name,
        'Baseline Accuracy':base_acc,
        'Tuned Accuracy':tuned_acc,
        'Best Parameters':gs.best_params_
    })
    
    
res_df=pd.DataFrame(results)
res_df

Unnamed: 0,Model,Baseline Accuracy,Tuned Accuracy,Best Parameters
0,SVC,0.824561,0.912281,"{'svc__C': 1, 'svc__gamma': 'scale', 'svc__ker..."
1,Random_forest,0.842105,0.859649,"{'max_depth': None, 'min_samples_split': 5, 'n..."
2,logistic_regression,0.912281,0.877193,"{'logisticregression__C': 0.1, 'logisticregres..."
3,Decision_tree,0.789474,0.824561,"{'criterion': 'gini', 'max_depth': 5}"


save best model 

In [11]:
joblib.dump(best_model,'../models/final_model.pkl')

['../models/final_model.pkl']

In [12]:

# Load the saved model
model = joblib.load('../models/final_model.pkl')
print(type(model))


<class 'sklearn.tree._classes.DecisionTreeClassifier'>
