# Model Training

In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
import xgboost 
from sklearn.metrics import roc_auc_score
from colorama import Fore
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from dotenv import load_dotenv
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost 

# Loading Data

In [2]:
# load_dotenv()

# secure_bundle = os.getenv('secure_bundle')
# client_id = os.getenv('client_id')
# client_secret = os.getenv('client_secret')

# cloud_config= {
#         'secure_connect_bundle': secure_bundle  
# }

# auth_provider = PlainTextAuthProvider(client_id, client_secret)
# cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
# session = cluster.connect()

# train_df = pd.DataFrame(list(session.execute('SELECT * FROM stroke.data;')))
# train_df.drop(columns=["id"],inplace=True)
# train_df.tail(1)

In [3]:
train_df = pd.read_csv(r"D:\jypyter notebooks\kaggle\datastax\data\raw_data.csv")
train_df.drop(columns=['id'],inplace=True)

In [4]:
def change_dtype(df,col):
    df[col] = df[col].astype(float)

In [5]:
change_dtype(train_df,"cond")
change_dtype(train_df,"calc")
change_dtype(train_df,"gravity")
change_dtype(train_df,"ph")
change_dtype(train_df,"ph")

In [6]:
train_df.dtypes

gravity    float64
ph         float64
osmo         int64
cond       float64
urea         int64
calc       float64
target       int64
dtype: object

# Model Training

In [8]:
X = train_df.drop(columns=["target"])
X.head()

Unnamed: 0,gravity,ph,osmo,cond,urea,calc
0,1.013,6.19,443,14.8,124,1.45
1,1.025,5.4,703,23.6,394,4.18
2,1.009,6.13,371,24.5,159,9.04
3,1.021,4.91,442,20.8,398,6.63
4,1.021,5.53,874,17.8,385,2.21


In [9]:
from scipy.stats.mstats import winsorize

outlier_columns = ["gravity", "ph", "calc"]
for col in outlier_columns:
    X.loc[:,col] = winsorize(X[col], (0.05, 0.05))

In [10]:
y=  train_df['target']
y

0      0
1      0
2      0
3      1
4      1
      ..
409    0
410    0
411    1
412    1
413    0
Name: target, Length: 414, dtype: int64

In [11]:
data_pipline = Pipeline([
    ('scaler',StandardScaler())
])

X = data_pipline.fit_transform(X)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((331, 6), (83, 6))

In [13]:
def evaluate_model(true, predict_proba,predicted):
    roc = roc_auc_score(true,predict_proba)
    precision = precision_score(true,predicted)
    recall = recall_score(true,predicted)
    f1 = f1_score(true,predicted)
    return roc, precision, recall, f1

In [14]:
models = {
    "Logistic Regression": LogisticRegression(),
    "SVC": SVC(probability=True),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "xgboost Classifier": xgboost.XGBClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}
model_list = []
test_roc_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_train_pred_proba = model.predict_proba(X_train)[:,1]
    y_test_pred = model.predict(X_test)
    y_test_pred_proba = model.predict_proba(X_test)[:,1]
    
    # Evaluate Train and Test dataset
    train_roc, train_precision, train_recall, train_f1 = evaluate_model(y_train, y_train_pred_proba,y_train_pred)

    test_roc, test_precision, test_recall, test_f1 = evaluate_model(y_test, y_test_pred_proba,y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Train Precision Score: {:.4f}".format(train_precision))
    print("- Train Recall Score: {:.4f}".format(train_recall))
    print("- Train f1 Score: {:.4f}".format(train_f1))
    print("- Train Roc auc Score: {:.4f}".format(train_roc))

    print('----------------------------------')

    print('Model performance for test set')
    print("- Test Precision Score: {:.4f}".format(test_precision))
    print("- Test Recall Score: {:.4f}".format(test_recall))
    print("- Test f1 Score: {:.4f}".format(test_f1))
    print("- Test Roc auc Score: {:.4f}".format(test_roc))

    test_roc_list.append(test_roc)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
- Train Precision Score: 0.7565
- Train Recall Score: 0.5959
- Train f1 Score: 0.6667
- Train Roc auc Score: 0.7930
----------------------------------
Model performance for test set
- Test Precision Score: 0.7714
- Test Recall Score: 0.7105
- Test f1 Score: 0.7397
- Test Roc auc Score: 0.8269


SVC
Model performance for Training set
- Train Precision Score: 0.7983
- Train Recall Score: 0.6507
- Train f1 Score: 0.7170
- Train Roc auc Score: 0.8484
----------------------------------
Model performance for test set
- Test Precision Score: 0.7667
- Test Recall Score: 0.6053
- Test f1 Score: 0.6765
- Test Roc auc Score: 0.8322




Random Forest Classifier
Model performance for Training set
- Train Precision Score: 1.0000
- Train Recall Score: 1.0000
- Train f1 Score: 1.0000
- Train Roc auc Score: 1.0000
----------------------------------
Model performance for test set
- Test Precision Score: 0.7353
- Test Recall Score: 0.6579
- Test f1 Score: 0.6944
- Test Roc auc Score: 0.8164


AdaBoost Classifier
Model performance for Training set
- Train Precision Score: 0.7820
- Train Recall Score: 0.7123
- Train f1 Score: 0.7455
- Train Roc auc Score: 0.8929
----------------------------------
Model performance for test set
- Test Precision Score: 0.6923
- Test Recall Score: 0.7105
- Test f1 Score: 0.7013
- Test Roc auc Score: 0.8000


xgboost Classifier
Model performance for Training set
- Train Precision Score: 1.0000
- Train Recall Score: 1.0000
- Train f1 Score: 1.0000
- Train Roc auc Score: 1.0000
----------------------------------
Model performance for test set
- Test Precision Score: 0.6757
- Test Recall Score: 0.657

In [15]:
pd.DataFrame(list(zip(model_list, test_roc_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
1,SVC,0.832164
0,Logistic Regression,0.826901
2,Random Forest Classifier,0.816374
3,AdaBoost Classifier,0.8
4,xgboost Classifier,0.757895
5,Decision Tree,0.54386


In [46]:
svc = SVC(probability=True)
svc.fit(X_train, y_train)
y_pred = svc.predict_proba(X_test)[:,1]
score = roc_auc_score(y_test,y_pred)
print(" ROC Score of the model is %.4f" %score)

 ROC Score of the model is 0.8322


In [54]:
pred = svc.predict(X_test)
print("classification report : \n",classification_report(y_test,pred))

classification report : 
               precision    recall  f1-score   support

           0       0.72      0.84      0.78        45
           1       0.77      0.61      0.68        38

    accuracy                           0.73        83
   macro avg       0.74      0.72      0.73        83
weighted avg       0.74      0.73      0.73        83

