# Import Libraries

In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split
from  sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC ,LinearSVC
from sklearn.ensemble import  RandomForestClassifier ,AdaBoostClassifier ,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import  GaussianNB
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.metrics import classification_report ,accuracy_score , f1_score ,precision_score , recall_score
import warnings
warnings.filterwarnings('ignore')

In [41]:
DATA_PATH       = '../data/clean_data.pkl'
Clean_Data      = '../data/prep_data.pkl'
x_train_path    = '../data/x_train.pkl'
x_test_path     = '../data/x_test.pkl'
y_train_path    = '../data/y_train.pkl'
y_test_path     = '../data/y_test.pkl'

In [61]:
# READ DATA
df = pd.read_pickle(Clean_Data)
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
298,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
299,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
300,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       276 non-null    int64   
 1   sex       276 non-null    category
 2   cp        276 non-null    category
 3   trtbps    276 non-null    int64   
 4   chol      276 non-null    int64   
 5   fbs       276 non-null    category
 6   restecg   276 non-null    category
 7   thalachh  276 non-null    int64   
 8   exng      276 non-null    category
 9   oldpeak   276 non-null    float64 
 10  slp       276 non-null    category
 11  caa       276 non-null    category
 12  thall     276 non-null    category
 13  output    276 non-null    int64   
dtypes: category(8), float64(1), int64(5)
memory usage: 18.5 KB


# Train & EVAL Model

In [57]:
def train_eval(models):
    '''
    Function To Train Data on a List of Models
    return:
    List contains measure of multi accuracy method of every model in the list
    '''
    models_acc = []
    # Load Spliting DATA
    x_train = pd.read_pickle(x_train_path)
    x_test  = pd.read_pickle(x_test_path)
    y_train = pd.read_pickle(y_train_path)
    y_test  = pd.read_pickle(y_test_path)
    for model in models:
        model.fit(x_train,y_train)
        y_h = model.predict(x_test)
        per = precision_score(y_test,y_h)
        rec = recall_score(y_test,y_h)
        f1 = f1_score(y_test,y_h)
        acc = accuracy_score(y_test,y_h)
        models_acc.append([model,acc,f1,per,rec])
    del x_train ,x_test ,y_train ,y_test ,per , y_h ,acc ,f1,rec
    return models_acc

In [58]:
# Create Model Instances
lr = LogisticRegression(random_state=42)
svm = SVC(random_state=42)
rf  = RandomForestClassifier(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
gus = GaussianNB()
knn = KNeighborsClassifier()
lin_svc = LinearSVC(random_state=42)
ada = AdaBoostClassifier(random_state=42)
grd  = GradientBoostingClassifier(random_state=42)

# Create List Of All  Instances
models = [lr,svm,rf,dt,gus,knn,lin_svc,ada,grd]

# TRAIN & EVAL
df_acc = train_eval(models)

# PRINT MODELS ACCURACY
pd.DataFrame(df_acc,columns=['name','acc','f1','perscion','recall'])

Unnamed: 0,name,acc,f1,perscion,recall
0,LogisticRegression(random_state=42),0.891566,0.912621,0.886792,0.94
1,SVC(random_state=42),0.746988,0.810811,0.737705,0.9
2,"(DecisionTreeClassifier(max_features='auto', r...",0.843373,0.865979,0.893617,0.84
3,DecisionTreeClassifier(random_state=42),0.831325,0.865385,0.833333,0.9
4,GaussianNB(),0.86747,0.888889,0.897959,0.88
5,KNeighborsClassifier(),0.638554,0.693878,0.708333,0.68
6,LinearSVC(random_state=42),0.819277,0.831461,0.948718,0.74
7,"(DecisionTreeClassifier(max_depth=1, random_st...",0.795181,0.831683,0.823529,0.84
8,([DecisionTreeRegressor(criterion='friedman_ms...,0.807229,0.843137,0.826923,0.86


In [56]:
# Load Spliting DATA
x_train = pd.read_pickle(x_train_path)
x_test  = pd.read_pickle(x_test_path)
y_train = pd.read_pickle(y_train_path)
y_test  = pd.read_pickle(y_test_path)

# Create Model
model  = LogisticRegression(random_state=42)

# Train Model
model.fit(x_train,y_train)

# EVAL Model
y_h = model.predict(x_test)
print(classification_report(y_test,y_h))
print(f1_score(y_test,y_h))
print(precision_score(y_test,y_h))
print(recall_score(y_test,y_h))
print(accuracy_score(y_test,y_h))

              precision    recall  f1-score   support

           0       0.90      0.82      0.86        33
           1       0.89      0.94      0.91        50

    accuracy                           0.89        83
   macro avg       0.89      0.88      0.88        83
weighted avg       0.89      0.89      0.89        83

0.912621359223301
0.8867924528301887
0.94
0.891566265060241


In [37]:
model.score(x_test,y_test)

0.891566265060241

# Save Model

In [55]:
# model_path = '../models/89_acc.sav'
# pickle.dump(model , open(model_path,'wb'))

# Fine Tuning Hyperparameters

In [92]:
params = {
    'penalty':['l2','l1','elasticnet','none'],
    'dual':[True,False],
    'tol':[0.0001,0.01,1.0,50.0],
    'C':[1.0,0.1,2.0,5.0,0.001],
    'fit_intercept':[True,False],
    'solver':['newton-cg','lbfgs','liblinear','sag','saga'],
    'max_iter':[1000,100,1000000],
    'multi_class':['auto', 'ovr', 'multinomial'],
    'warm_start':[True,False],
}
lr = LogisticRegression()

In [93]:
x   = df.drop('output',axis=1).copy()
y   = df['output'].copy()
clf = GridSearchCV(lr,params,cv=3)
clf.fit(x,y)

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 0.1, 2.0, 5.0, 0.001],
                         'dual': [True, False], 'fit_intercept': [True, False],
                         'max_iter': [1000, 100, 1000000],
                         'multi_class': ['auto', 'ovr', 'multinomial'],
                         'penalty': ['l2', 'l1', 'elasticnet', 'none'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga'],
                         'tol': [0.0001, 0.01, 1.0, 50.0],
                         'warm_start': [True, False]})

In [95]:
# print(sorted(clf.cv_results_.keys()))

In [94]:
print(clf.best_score_)
print()
print(clf.best_params_)

0.8442028985507246

{'C': 0.1, 'dual': True, 'fit_intercept': False, 'max_iter': 1000000, 'multi_class': 'auto', 'penalty': 'l2', 'solver': 'liblinear', 'tol': 1.0, 'warm_start': False}


# Feature Selection

In [66]:
def get_best_cols(model,x,y):
    model = LogisticRegression()
    RF = RFECV(model,step=1,cv=2,n_jobs=-1)
    RF.fit(x,y)
    best_cols = RF.support_
    # print(RF.support_)
    # print('------------------------------------------')
    # print(RF.ranking_)
    # print('------------------------------------------')
    # print(x.columns[best_cols])
    # print('------------------------------------------')
    print('No . best Columns',len(x.columns[best_cols]))
    # print('------------------------------------------')
    # print(x.columns[best_cols])
    # print('------------------------------------------')
    best_cols_name = x.columns[best_cols]
    x = x[best_cols_name]
    x_train ,x_test , y_train , y_test =train_test_split(x,y,test_size=.3,random_state=42)
    model.fit(x_train,y_train)
    y_h = model.predict(x_test)
    print(classification_report(y_test,y_h))
    print(precision_score(y_test,y_h))
    print(f1_score(y_test,y_h))
    print(accuracy_score(y_test,y_h))

In [67]:
x = df.drop('output',axis=1).copy()
y = df['output'].copy()
lr = LogisticRegression(random_state=42)
get_best_cols(lr,x,y)

------------------------------------------
------------------------------------------
------------------------------------------
12
------------------------------------------
------------------------------------------
              precision    recall  f1-score   support

           0       0.90      0.82      0.86        33
           1       0.89      0.94      0.91        50

    accuracy                           0.89        83
   macro avg       0.89      0.88      0.88        83
weighted avg       0.89      0.89      0.89        83

0.8867924528301887
0.912621359223301
0.891566265060241
