# Initial Steps

## Importing Libraries

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

### setting plot size

In [2]:
matplotlib.rcParams['figure.figsize'] = (15, 8)

## importing dataset

In [3]:
df = pd.read_csv("heart_2020_cleaned.csv")

In [4]:
df.head(7)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
6,No,21.63,No,No,No,15.0,0.0,No,Female,70-74,White,No,Yes,Fair,4.0,Yes,No,Yes


In [5]:
df.HeartDisease.value_counts()

No     292422
Yes     27373
Name: HeartDisease, dtype: int64

In [6]:
df.AgeCategory.value_counts()

65-69          34151
60-64          33686
70-74          31065
55-59          29757
50-54          25382
80 or older    24153
45-49          21791
75-79          21482
18-24          21064
40-44          21006
35-39          20550
30-34          18753
25-29          16955
Name: AgeCategory, dtype: int64

In [7]:
def condition(s): 
    x = 'Young'
    y = 'Adult'
    z = 'Old'
    if(s["AgeCategory"] == "18-24") | (s["AgeCategory"] == "25-29"):
        return x
    elif(s["AgeCategory"] == "30-34") | (s["AgeCategory"] == "35-39") | (s["AgeCategory"] == "40-44")| (s["AgeCategory"] == "45-49") \
        | (s["AgeCategory"] == "50-54")| (s["AgeCategory"] == "55-59"):

        return y
    else:
        return z

df['AgeCategory'] = df.apply(condition, axis=1)

In [8]:
pd.DataFrame(df.Diabetic.value_counts())

Unnamed: 0,Diabetic
No,269653
Yes,40802
"No, borderline diabetes",6781
Yes (during pregnancy),2559


In [9]:
df.Diabetic = df.Diabetic.replace(["No, borderline diabetes", "Yes (during pregnancy)"], ["No", "No"])

In [10]:
# df = df.drop("Race", axis=1)

# Modelling

In [11]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score

<a id="29"></a>

<font color='DeepSkyBlue'>


## PREPROCESSING

In [12]:
df["id"]=range(1,319796)
df=df.set_index("id")

In [13]:
df

Unnamed: 0_level_0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,No,16.60,Yes,No,No,3.0,30.0,No,Female,Adult,White,Yes,Yes,Very good,5.0,Yes,No,Yes
2,No,20.34,No,No,Yes,0.0,0.0,No,Female,Old,White,No,Yes,Very good,7.0,No,No,No
3,No,26.58,Yes,No,No,20.0,30.0,No,Male,Old,White,Yes,Yes,Fair,8.0,Yes,No,No
4,No,24.21,No,No,No,0.0,0.0,No,Female,Old,White,No,No,Good,6.0,No,No,Yes
5,No,23.71,No,No,No,28.0,0.0,Yes,Female,Adult,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319791,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,Old,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319792,No,29.84,Yes,No,No,0.0,0.0,No,Male,Adult,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319793,No,24.24,No,No,No,0.0,0.0,No,Female,Adult,Hispanic,No,Yes,Good,6.0,No,No,No
319794,No,32.81,No,No,No,0.0,0.0,No,Female,Young,Hispanic,No,No,Good,12.0,No,No,No


In [14]:
df_cat=df.select_dtypes(include=["object"])

In [15]:
df_categorical=df_cat.columns
df_categorical

Index(['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
       'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity',
       'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [16]:
for var in df_categorical:
    Dummy=pd.get_dummies(df[var], drop_first= True).add_prefix(var)
    df=df.merge(Dummy,on="id")
    df=df.drop([var],axis=1)

In [17]:
df.head(5)

Unnamed: 0_level_0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDiseaseYes,SmokingYes,AlcoholDrinkingYes,StrokeYes,DiffWalkingYes,SexMale,...,RaceWhite,DiabeticYes,PhysicalActivityYes,GenHealthFair,GenHealthGood,GenHealthPoor,GenHealthVery good,AsthmaYes,KidneyDiseaseYes,SkinCancerYes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,16.6,3.0,30.0,5.0,0,1,0,0,0,0,...,1,1,1,0,0,0,1,1,0,1
2,20.34,0.0,0.0,7.0,0,0,0,1,0,0,...,1,0,1,0,0,0,1,0,0,0
3,26.58,20.0,30.0,8.0,0,1,0,0,0,1,...,1,1,1,1,0,0,0,1,0,0
4,24.21,0.0,0.0,6.0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,1
5,23.71,28.0,0.0,8.0,0,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0


In [18]:
y=df.HeartDiseaseYes

In [19]:
(df.describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BMI,319795.0,28.325399,6.3561,12.02,24.03,27.34,31.42,94.85
PhysicalHealth,319795.0,3.37171,7.95085,0.0,0.0,0.0,2.0,30.0
MentalHealth,319795.0,3.898366,7.955235,0.0,0.0,0.0,3.0,30.0
SleepTime,319795.0,7.097075,1.436007,1.0,6.0,7.0,8.0,24.0
HeartDiseaseYes,319795.0,0.085595,0.279766,0.0,0.0,0.0,0.0,1.0
SmokingYes,319795.0,0.412477,0.492281,0.0,0.0,0.0,1.0,1.0
AlcoholDrinkingYes,319795.0,0.068097,0.251912,0.0,0.0,0.0,0.0,1.0
StrokeYes,319795.0,0.03774,0.190567,0.0,0.0,0.0,0.0,1.0
DiffWalkingYes,319795.0,0.13887,0.345812,0.0,0.0,0.0,0.0,1.0
SexMale,319795.0,0.475273,0.499389,0.0,0.0,0.0,1.0,1.0


In [20]:
df['Severity'] = df["DiabeticYes"] + df['AsthmaYes'] + df['KidneyDiseaseYes'] + df["SkinCancerYes"]
df['Severity'] = df['Severity'].astype("object")

In [21]:
df = pd.get_dummies(df, drop_first=True)

In [22]:
df.head()

Unnamed: 0_level_0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDiseaseYes,SmokingYes,AlcoholDrinkingYes,StrokeYes,DiffWalkingYes,SexMale,...,GenHealthGood,GenHealthPoor,GenHealthVery good,AsthmaYes,KidneyDiseaseYes,SkinCancerYes,Severity_1,Severity_2,Severity_3,Severity_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,16.6,3.0,30.0,5.0,0,1,0,0,0,0,...,0,0,1,1,0,1,0,0,1,0
2,20.34,0.0,0.0,7.0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,26.58,20.0,30.0,8.0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
4,24.21,0.0,0.0,6.0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,0,0
5,23.71,28.0,0.0,8.0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [23]:
df = df.drop(["DiabeticYes",'AsthmaYes','KidneyDiseaseYes',"SkinCancerYes"], axis=1)

In [24]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BMI,319795.0,28.325399,6.3561,12.02,24.03,27.34,31.42,94.85
PhysicalHealth,319795.0,3.37171,7.95085,0.0,0.0,0.0,2.0,30.0
MentalHealth,319795.0,3.898366,7.955235,0.0,0.0,0.0,3.0,30.0
SleepTime,319795.0,7.097075,1.436007,1.0,6.0,7.0,8.0,24.0
HeartDiseaseYes,319795.0,0.085595,0.279766,0.0,0.0,0.0,0.0,1.0
SmokingYes,319795.0,0.412477,0.492281,0.0,0.0,0.0,1.0,1.0
AlcoholDrinkingYes,319795.0,0.068097,0.251912,0.0,0.0,0.0,0.0,1.0
StrokeYes,319795.0,0.03774,0.190567,0.0,0.0,0.0,0.0,1.0
DiffWalkingYes,319795.0,0.13887,0.345812,0.0,0.0,0.0,0.0,1.0
SexMale,319795.0,0.475273,0.499389,0.0,0.0,0.0,1.0,1.0


In [25]:
X=df.drop(["HeartDiseaseYes"],axis=1)
y= df.HeartDiseaseYes

In [26]:
X.head(5)

Unnamed: 0_level_0,BMI,PhysicalHealth,MentalHealth,SleepTime,SmokingYes,AlcoholDrinkingYes,StrokeYes,DiffWalkingYes,SexMale,AgeCategoryOld,...,RaceWhite,PhysicalActivityYes,GenHealthFair,GenHealthGood,GenHealthPoor,GenHealthVery good,Severity_1,Severity_2,Severity_3,Severity_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,16.6,3.0,30.0,5.0,1,0,0,0,0,0,...,1,1,0,0,0,1,0,0,1,0
2,20.34,0.0,0.0,7.0,0,0,1,0,0,1,...,1,1,0,0,0,1,0,0,0,0
3,26.58,20.0,30.0,8.0,1,0,0,0,1,1,...,1,1,1,0,0,0,0,1,0,0
4,24.21,0.0,0.0,6.0,0,0,0,0,0,1,...,1,0,0,1,0,0,1,0,0,0
5,23.71,28.0,0.0,8.0,0,0,0,1,0,0,...,1,1,0,0,0,1,0,0,0,0


<a id="30"></a>

<font color='DeepSkyBlue'>


## NORMALIZATION

In [27]:
from sklearn.preprocessing import MinMaxScaler

In [28]:
# scaler=MinMaxScaler()
# columns=X.columns
# s=scaler.fit_transform(X)

# X_scaled_data=pd.DataFrame(s,columns=columns)
# X_scaled_data.head(5)

In [29]:
# X_scaled_data.columns

In [30]:
# X_scaled_data.columns

In [31]:
y.value_counts()

0    292422
1     27373
Name: HeartDiseaseYes, dtype: int64

In [32]:
X_train,X_test_,y_train,y_test_=train_test_split(X,
                                               y,
                                               test_size=0.10,
                                               random_state=1)

In [33]:
y_test_.value_counts()

0    29250
1     2730
Name: HeartDiseaseYes, dtype: int64

In [34]:
stratified_sample = df.groupby('HeartDiseaseYes', group_keys=False).apply(lambda x: x.sample(15000))
X_test = stratified_sample.drop("HeartDiseaseYes", axis=1)
y_test= stratified_sample.HeartDiseaseYes

In [35]:
y_test.value_counts()

0    15000
1    15000
Name: HeartDiseaseYes, dtype: int64

<a id="31"></a>

<font color='DeepSkyBlue'>


## OverSampling


In [36]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_train_os, y_train_os = oversample.fit_resample(X_train, y_train)

In [39]:
y_train_os.value_counts()

0    263172
1    263172
Name: HeartDiseaseYes, dtype: int64

In [40]:
models=[DecisionTreeClassifier(),XGBClassifier(),RandomForestClassifier(), GaussianNB()]
scores=dict()

performance_table = ['Model', 'Accuracy', 'Precision', 'Percentage_mislabbled'
                     'Total_Ones', 'Mislabble_Ones', 'Percent_Mislabbled_Ones', 'Recall_Score', 'F1_Score']

Model = []
Accuracy = []
Precision = []
Percentage_mislabbled = []
Percent_Mislabbled_zeroes =[]
Percent_Mislabbled_Ones =[]
Recall_Score =[]
F1_Score = []


for m in models:
    m.fit(X_train_os,y_train_os)
    y_pred=m.predict(X_test)
    percent_mislabbled = (((y_test != y_pred).sum())/X_test.shape[0])*100
    

    y_pred_df = pd.DataFrame(np.array(y_pred), columns= ['ypred'])
    y_test_df = pd.DataFrame(y_test.values, columns = ['y_test'])
    df_test = pd.concat([y_pred_df, y_test_df], axis=1)
    df_test1 = df_test[df_test['y_test'] == 1]
    df_test0 = df_test[df_test['y_test'] == 0]
    percent_how_many_ones_mislabbled = ((df_test1.y_test != df_test1.ypred).sum()) / (len(df_test[df_test['y_test'] == 1])) * 100
    percent_how_many_zeroes_mislabbled = ((df_test0.y_test != df_test0.ypred).sum()) / (len(df_test[df_test['y_test'] == 0])) * 100
    
    Model.append(m)    
    Accuracy.append(accuracy_score(y_test,y_pred))
    Precision.append(precision_score(y_test,y_pred))
    Percentage_mislabbled.append(percent_mislabbled)
    Percent_Mislabbled_zeroes.append(percent_how_many_zeroes_mislabbled)    
    Percent_Mislabbled_Ones.append(percent_how_many_ones_mislabbled)
    Recall_Score.append(recall_score(y_test,y_pred))
    F1_Score.append(f1_score(y_test,y_pred))

performance_table = {'Model':Model , 'Accuracy':Accuracy, 'Precision': Precision, 'Percentage_mislabbled':Percentage_mislabbled, 'Percent_Mislabbled_zeroes': Percent_Mislabbled_zeroes,
                                            'Percent_Mislabbled_Ones': Percent_Mislabbled_Ones , 'Recall_Score': Recall_Score, 
                                            'F1_Score' :F1_Score}

performance_table = pd.DataFrame(performance_table)
performance_table

Unnamed: 0,Model,Accuracy,Precision,Percentage_mislabbled,Percent_Mislabbled_zeroes,Percent_Mislabbled_Ones,Recall_Score,F1_Score
0,DecisionTreeClassifier(),0.915333,0.976883,8.466667,2.013333,14.92,0.8508,0.909493
1,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.665633,0.770908,33.436667,14.006667,52.866667,0.471333,0.584999
2,"(DecisionTreeClassifier(max_features='sqrt', r...",0.9297,0.980613,7.03,1.733333,12.326667,0.876733,0.925768
3,GaussianNB(),0.629367,0.591167,37.063333,58.013333,16.113333,0.838867,0.693565


In [41]:
# performance_table.to_csv("oversample_performance.csv")

In [42]:
from sklearn.metrics import classification_report
rfc = RandomForestClassifier()
rfc.fit(X_train_os,y_train_os)
y_pred=rfc.predict(X_test)

In [43]:
pd.DataFrame(classification_report(y_test, y_pred, output_dict= True))

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.888628,0.980397,0.929667,0.934512,0.934512
recall,0.982467,0.876867,0.929667,0.929667,0.929667
f1-score,0.933194,0.925746,0.929667,0.92947,0.92947
support,15000.0,15000.0,0.929667,30000.0,30000.0


# Class Weightage

In [44]:
classweight = {0: 1, 1: 100}

In [45]:
from sklearn.tree import DecisionTreeClassifier
models=[DecisionTreeClassifier(class_weight=classweight), XGBClassifier(class_weight=classweight),RandomForestClassifier(class_weight=classweight)]
scores=dict()

performance_table = ['Model', 'Accuracy', 'Precision', 'Percentage_mislabbled'
                                            'Total_Ones', 'Mislabble_Ones', 'Percent_Mislabbled_Ones', 'Recall_Score', 'F1_Score']

Model = []
Accuracy = []
Precision = []
Percentage_mislabbled = []
Percent_Mislabbled_zeroes =[]
Percent_Mislabbled_Ones =[]
Recall_Score =[]
F1_Score = []


for m in models:
    m.fit(X_train,y_train)
    y_pred=m.predict(X_test)
    percent_mislabbled = (((y_test != y_pred).sum())/X_test.shape[0])*100
    

    y_pred_df = pd.DataFrame(np.array(y_pred), columns= ['ypred'])
    y_test_df = pd.DataFrame(y_test.values, columns = ['y_test'])
    df_test = pd.concat([y_pred_df, y_test_df], axis=1)
    df_test1 = df_test[df_test['y_test'] == 1]
    df_test0 = df_test[df_test['y_test'] == 0]
    percent_how_many_ones_mislabbled = ((df_test1.y_test != df_test1.ypred).sum()) / (len(df_test[df_test['y_test'] == 1])) * 100
    percent_how_many_zeroes_mislabbled = ((df_test0.y_test != df_test0.ypred).sum()) / (len(df_test[df_test['y_test'] == 0])) * 100
    
    

    Model.append(m)    
    Accuracy.append(accuracy_score(y_test,y_pred))
    Precision.append(precision_score(y_test,y_pred))
    Percentage_mislabbled.append(percent_mislabbled)
    Percent_Mislabbled_zeroes.append(percent_how_many_zeroes_mislabbled)    
    Percent_Mislabbled_Ones.append(percent_how_many_ones_mislabbled)
    Recall_Score.append(recall_score(y_test,y_pred))
    F1_Score.append(f1_score(y_test,y_pred))

performance_table = {'Model':Model , 'Accuracy':Accuracy, 'Precision': Precision, 'Percentage_mislabbled':Percentage_mislabbled, 'Percent_Mislabbled_zeroes': Percent_Mislabbled_zeroes,
                                            'Percent_Mislabbled_Ones': Percent_Mislabbled_Ones , 'Recall_Score': Recall_Score, 
                                            'F1_Score' :F1_Score}

performance_table = pd.DataFrame(performance_table)
performance_table

Parameters: { "class_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Unnamed: 0,Model,Accuracy,Precision,Percentage_mislabbled,Percent_Mislabbled_zeroes,Percent_Mislabbled_Ones,Recall_Score,F1_Score
0,"DecisionTreeClassifier(class_weight={0: 1, 1: ...",0.944433,0.963757,5.556667,3.473333,7.64,0.9236,0.943251
1,"XGBClassifier(base_score=0.5, booster='gbtree'...",0.5543,0.962521,44.57,0.44,88.7,0.113,0.202255
2,"(DecisionTreeClassifier(max_features='sqrt', r...",0.942467,0.967592,5.753333,3.066667,8.44,0.9156,0.940878


In [46]:
performance_table.to_csv("classweightage_performance.csv")

# SMOTETOMEK

In [47]:
from imblearn.combine import SMOTETomek
st = SMOTETomek(0.75)
X_train_st, y_train_st = st.fit_resample(X_train, y_train)

In [48]:
# models=[KNeighborsClassifier(),LogisticRegression(),XGBClassifier(),RandomForestClassifier(), GaussianNB()]
# scores=dict()

# performance_table = ['Model', 'Accuracy', 'Precision', 'Percentage_mislabbled'
#                                             'Total_Ones', 'Mislabble_Ones', 'Percent_Mislabbled_Ones', 'Recall_Score', 'F1_Score']

# Model = []
# Accuracy = []
# Precision = []
# Percentage_mislabbled = []
# Percent_Mislabbled_zeroes =[]
# Percent_Mislabbled_Ones =[]
# Recall_Score =[]
# F1_Score = []


# for m in models:
#     m.fit(X_train_st,y_train_st)
#     y_pred=m.predict(X_test)
#     percent_mislabbled = (((y_test != y_pred).sum())/X_test.shape[0])*100
    

#     y_pred_df = pd.DataFrame(np.array(y_pred), columns= ['ypred'])
#     y_test_df = pd.DataFrame(y_test.values, columns = ['y_test'])
#     df_test = pd.concat([y_pred_df, y_test_df], axis=1)
#     df_test1 = df_test[df_test['y_test'] == 1]
#     df_test0 = df_test[df_test['y_test'] == 0]
#     percent_how_many_ones_mislabbled = ((df_test1.y_test != df_test1.ypred).sum()) / (len(df_test[df_test['y_test'] == 1])) * 100
#     percent_how_many_zeroes_mislabbled = ((df_test0.y_test != df_test0.ypred).sum()) / (len(df_test[df_test['y_test'] == 0])) * 100
    
    

#     Model.append(m)    
#     Accuracy.append(accuracy_score(y_test,y_pred))
#     Precision.append(precision_score(y_test,y_pred))
#     Percentage_mislabbled.append(percent_mislabbled)
#     Percent_Mislabbled_zeroes.append(percent_how_many_zeroes_mislabbled)    
#     Percent_Mislabbled_Ones.append(percent_how_many_ones_mislabbled)
#     Recall_Score.append(recall_score(y_test,y_pred))
#     F1_Score.append(f1_score(y_test,y_pred))

# performance_table = {'Model':Model , 'Accuracy':Accuracy, 'Precision': Precision, 'Percentage_mislabbled':Percentage_mislabbled, 
#                      'Percent_Mislabbled_zeroes': Percent_Mislabbled_zeroes,
#                                             'Percent_Mislabbled_Ones': Percent_Mislabbled_Ones , 'Recall_Score': Recall_Score, 
#                                             'F1_Score' :F1_Score}

# performance_table = pd.DataFrame(performance_table)
# performance_table

In [49]:
rfc = RandomForestClassifier()
rfc.fit(X_train_st,y_train_st)
y_pred=rfc.predict(X_test)
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.88      0.97      0.92     15000\n           1       0.97      0.86      0.91     15000\n\n    accuracy                           0.92     30000\n   macro avg       0.92      0.92      0.92     30000\nweighted avg       0.92      0.92      0.92     30000\n'