In [4]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.metrics import precision_score, accuracy_score, recall_score, \
average_precision_score, precision_recall_curve, confusion_matrix
import seaborn as sns
from subprocess import call
from IPython.display import Image
import warnings
warnings.filterwarnings('ignore')

In [5]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from pandas.plotting import scatter_matrix

In [6]:
original_data = pd.read_csv('training_dataset.psv', sep ='|')
missing_data = original_data.isnull().sum()
missing_percent = (missing_data/original_data.shape[0])*100
refined_columns = list(missing_percent[missing_percent < 92].index)
sepsis_data = original_data[refined_columns]

In [7]:
refined_columns

['HR',
 'O2Sat',
 'Temp',
 'SBP',
 'MAP',
 'DBP',
 'Resp',
 'FiO2',
 'Glucose',
 'Potassium',
 'Hct',
 'Age',
 'Gender',
 'Unit1',
 'Unit2',
 'HospAdmTime',
 'ICULOS',
 'SepsisLabel']

In [8]:
sepsis_data.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,FiO2,Glucose,Potassium,Hct,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,,,,,,,,,,,,83.14,0,,,-0.03,1,0
1,97.0,95.0,,98.0,75.33,,19.0,,,,,83.14,0,,,-0.03,2,0
2,89.0,99.0,,122.0,86.0,,22.0,,,,,83.14,0,,,-0.03,3,0
3,90.0,95.0,,,,,30.0,,,,,83.14,0,,,-0.03,4,0
4,103.0,88.5,,122.0,91.33,,24.5,0.28,,,,83.14,0,,,-0.03,5,0


In [9]:
sepsis_data.describe()


Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,FiO2,Glucose,Potassium,Hct,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
count,1231539.0,1185996.0,459082.0,1161992.0,1196538.0,901712.0,1163629.0,124902.0,222899.0,129727.0,126322.0,1361672.0,1361672.0,806409.0,806409.0,1361664.0,1361672.0,1361672.0
mean,84.63807,97.20677,36.981409,123.3876,81.87102,63.398324,18.73328,0.555864,136.638112,4.13913,30.767546,62.15126,0.5624284,0.498618,0.501382,-55.2279,27.0765,0.0183708
std,17.28354,2.941111,0.770547,23.02735,16.20716,13.85323,5.149755,11.320101,51.442553,0.639772,5.381143,16.35856,0.4960876,0.499998,0.499998,163.47,29.13055,0.1342882
min,20.0,20.0,20.9,20.0,20.0,20.0,1.0,-50.0,10.0,1.0,5.5,14.0,0.0,0.0,0.0,-5366.86,1.0,0.0
25%,72.0,96.0,36.5,106.5,70.5,54.0,15.0,0.4,106.0,3.7,27.1,52.0,0.0,0.0,0.0,-44.89,11.0,0.0
50%,83.5,98.0,37.0,121.0,80.0,62.0,18.0,0.5,126.0,4.1,30.25,64.0,1.0,0.0,1.0,-5.5,21.0,0.0
75%,96.0,99.5,37.5,138.0,91.0,71.0,22.0,0.6,153.0,4.4,34.0,74.34,1.0,1.0,1.0,-0.03,34.0,0.0
max,280.0,100.0,50.0,300.0,300.0,300.0,100.0,4000.0,988.0,27.5,71.7,100.0,1.0,1.0,1.0,23.99,336.0,1.0


In [10]:
sepsis_data.isnull().sum()
sepsis_data = sepsis_data.dropna(subset=['HospAdmTime'])

In [11]:
sepsis_data['O2Sat'].min()

20.0

In [12]:
sepsis_data_test = pd.read_csv('testing_dataset.psv', sep='|')

In [13]:
sepsis_data_test.describe()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
count,167343.0,163544.0,66161.0,164017.0,162466.0,163966.0,150313.0,15165.0,478.0,359.0,...,9988.0,1075.0,10149.0,190615.0,190615.0,133841.0,133841.0,190615.0,190615.0,190615.0
mean,84.166628,97.099233,36.948108,126.324189,86.299441,66.208423,18.675607,33.113848,-2.303556,23.9039,...,10.836399,284.346512,189.602572,61.004008,0.536595,0.484239,0.515761,-62.517086,26.411096,0.015219
std,17.625333,2.906194,0.765649,24.475479,16.795645,14.275852,4.680407,7.782494,4.336779,3.171466,...,9.168976,138.763003,95.043849,16.5487,0.49866,0.499753,0.499753,153.128765,28.084256,0.122424
min,21.0,20.0,30.4,22.0,30.0,20.0,1.0,10.0,-18.25,12.6,...,0.1,35.0,1.0,14.0,0.0,0.0,0.0,-3397.64,1.0,0.0
25%,71.5,96.0,36.5,108.0,74.0,56.0,16.0,29.0,-4.4,22.0,...,7.1,187.0,126.0,50.0,0.0,0.0,0.0,-66.67,10.0,0.0
50%,83.0,98.0,36.9,124.0,84.0,64.0,18.0,33.5,-1.975,23.8,...,9.6,260.0,178.0,62.0,1.0,0.0,1.0,-9.43,20.0,0.0
75%,95.0,99.0,37.4,142.0,96.0,74.0,21.0,38.0,0.3,25.55,...,12.9,348.0,237.0,73.0,1.0,1.0,1.0,-3.23,34.0,0.0
max,191.0,100.0,41.8,296.0,300.0,296.0,99.0,100.0,11.1,36.4,...,387.0,1000.0,1140.0,100.0,1.0,1.0,1.0,0.0,336.0,1.0


In [14]:
def fe_new_age(data):
    data.loc[data['Age'] >=60, 'new_age'] = 'old'
    data.loc[data['Age'] <10, 'new_age'] = 'infant'
    data.loc[(data['Age'] >=10) & (data['Age'] <60), 
            'new_age'] = 'adult'
    return data

In [15]:
sepsis_data = fe_new_age(sepsis_data)

In [16]:
sepsis_data_test = fe_new_age(sepsis_data_test)

In [17]:
sepsis_data['new_age'].isnull() 

0          False
1          False
2          False
3          False
4          False
           ...  
1361667    False
1361668    False
1361669    False
1361670    False
1361671    False
Name: new_age, Length: 1361664, dtype: bool

In [18]:
def fe_new_hr(data):
    data.loc[(data['HR'] >= 70) & (data['HR'] < 110 ) & (data['Age'] < 10), 'new_hr'] = 'normal'
    data.loc[(data['HR'] > 60) & (data['HR'] < 100) & data['Age'] >= 10, 'new_hr'] = 'normal'
    data.loc[((data['HR'] < 70) | (data['Age'] >= 110)) & (data['Age']<10), 'new_hr'] = 'abnormal'
    data.loc[(data['HR'] >= 100) & (data['Age'] >= 10), 'new_hr'] = 'abnormal'
    data['new_hr'].fillna('Missing', inplace=True)
    return data


In [19]:
sepsis_data = fe_new_hr(sepsis_data)
sepsis_data_test = fe_new_hr(sepsis_data_test)

In [20]:
def fe_new_o2sat(data):
    data.loc[(data['O2Sat'] >= 95) & (data['O2Sat'] < 100), 'new_o2sat'] = 'normal'
    data.loc[(data['O2Sat'] < 95) & (data['O2Sat'] >= 0), 'new_o2sat'] = 'abnormal'
    data['new_o2sat'].fillna('missing', inplace=True)
    return data

In [21]:
sepsis_data = fe_new_o2sat(sepsis_data)
sepsis_data_test = fe_new_o2sat(sepsis_data_test)

In [22]:
def fe_new_temp(data):
    data.loc[(data['Temp'] >= 36) & (data['Temp'] < 38),'new_temp'] = 'normal'
    data.loc[(data['Temp'] < 36) | (data['Temp'] >= 38),'new_temp'] = 'abnormal'
    data['new_temp'].fillna('Missing', inplace=True)
    return data

In [23]:
sepsis_data = fe_new_temp(sepsis_data)
sepsis_data_test = fe_new_temp(sepsis_data_test)

In [24]:
def fe_new_bp(data):
    data.loc[(data['SBP'] < 90) & (data['DBP'] < 60), 'new_bp'] = 'low'
    data.loc[(data['SBP'].between(90,120, inclusive=True)) & (data['DBP'].between(60,80, inclusive=True)), 'new_bp'] = 'normal'
    data.loc[(data['SBP'].between(120,140, inclusive=True)) & (data['DBP'].between(80,90, inclusive=True)),'new_bp'] = 'elevated'
    data.loc[(data['SBP'] > 140 ) & (data['DBP'] > 90 ), 'new_bp'] = 'high'
    data['new_bp'].fillna('Missing', inplace=True)
    return data

In [25]:
sepsis_data = fe_new_bp(sepsis_data)
sepsis_data_test = fe_new_bp(sepsis_data_test)

In [26]:
def fe_new_resp(data):
    data.loc[(data['Resp'].between(30, 60)) & (data['Age'] < 1), 'new_resp'] = 'normal'
    data.loc[(data['Resp'].between(24, 40)) & (data['Age'].between(1, 3)), 'new_resp'] = 'normal'
    data.loc[(data['Resp'].between(22, 34)) & (data['Age'].between(3, 6)), 'new_resp'] = 'normal'
    data.loc[(data['Resp'].between(18, 30)) & (data['Age'].between(6, 12)), 'new_resp'] = 'normal'
    data.loc[(data['Resp'].between(12, 16)) & (data['Age'].between(12, 18)), 'new_resp'] = 'normal'  
    data.loc[(data['Resp'].between(12, 20)) & (data['Age'] > 18), 'new_resp'] = 'normal'  
    
    data.loc[((data['Resp'] < 30) | (data['Resp'] > 60)) & (data['Age'] <1) ,'new_resp'] = 'abnormal'   
    data.loc[((data['Resp'] < 24) | (data['Resp'] > 40)) & (data['Age'].between(1, 3)) ,'new_resp'] = 'abnormal'   
    data.loc[((data['Resp'] < 22) | (data['Resp'] > 34)) & (data['Age'].between(3, 6)) ,'new_resp'] = 'abnormal'
    data.loc[((data['Resp'] < 18) | (data['Resp'] > 30)) & (data['Age'].between(6, 12)) ,'new_resp'] = 'abnormal'
    data.loc[((data['Resp'] < 12) | (data['Resp'] > 16)) & (data['Age'].between(12, 18)) ,'new_resp'] = 'abnormal'
    data.loc[((data['Resp'] < 12) | (data['Resp'] > 20)) & (data['Age'] > 18) ,'new_resp'] = 'abnormal'
    
    
    data['new_resp'].fillna('missing', inplace = True)
                                                            
    return data

In [27]:
sepsis_data = fe_new_resp(sepsis_data)
sepsis_data_test = fe_new_resp(sepsis_data_test)

In [28]:
def fe_new_map(data):
    data.loc[(data['MAP'] >= 70) & (data['MAP'] < 100),'new_map'] = 'normal'
    data.loc[(data['MAP'] < 70) | (data['MAP'] >= 100),'new_map'] = 'abnormal'
    data['new_map'].fillna('Missing', inplace=True)
    return data

In [29]:
sepsis_data = fe_new_map(sepsis_data)
sepsis_data_test = fe_new_map(sepsis_data_test)

In [30]:
def fe_new_fio2(data):
    data.loc[(data['FiO2'] < 0.8 ) ,'new_fio2'] = 'normal'
    data.loc[(data['FiO2'] >= 0.8 ),'new_fio2'] = 'abnormal'
    data['new_fio2'].fillna('Missing', inplace=True)
    return data

In [31]:
sepsis_data = fe_new_fio2(sepsis_data)
sepsis_data_test = fe_new_fio2(sepsis_data_test)

In [32]:
sepsis_data.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,FiO2,Glucose,Potassium,...,ICULOS,SepsisLabel,new_age,new_hr,new_o2sat,new_temp,new_bp,new_resp,new_map,new_fio2
0,,,,,,,,,,,...,1,0,old,Missing,missing,Missing,Missing,missing,Missing,Missing
1,97.0,95.0,,98.0,75.33,,19.0,,,,...,2,0,old,Missing,normal,Missing,Missing,normal,normal,Missing
2,89.0,99.0,,122.0,86.0,,22.0,,,,...,3,0,old,Missing,normal,Missing,Missing,abnormal,normal,Missing
3,90.0,95.0,,,,,30.0,,,,...,4,0,old,Missing,normal,Missing,Missing,abnormal,Missing,Missing
4,103.0,88.5,,122.0,91.33,,24.5,0.28,,,...,5,0,old,abnormal,abnormal,Missing,Missing,abnormal,normal,normal


In [33]:
sepsis_data_test.head()

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,HCO3,...,ICULOS,SepsisLabel,new_age,new_hr,new_o2sat,new_temp,new_bp,new_resp,new_map,new_fio2
0,,,,,,,,,,,...,1,0,old,Missing,missing,Missing,Missing,missing,Missing,Missing
1,67.0,95.0,,165.5,106.0,76.0,,,,,...,2,0,old,Missing,normal,Missing,Missing,missing,abnormal,Missing
2,62.0,95.0,36.0,161.0,104.0,75.0,,,,,...,3,0,old,Missing,normal,normal,Missing,missing,abnormal,Missing
3,68.0,96.0,,155.5,101.5,73.5,,,,,...,4,0,old,Missing,normal,Missing,Missing,missing,abnormal,Missing
4,65.0,95.0,36.1,142.0,87.0,61.0,,,,,...,5,0,old,Missing,normal,normal,Missing,missing,normal,Missing


In [34]:
columns_new = ['Gender', 'new_age', 'new_o2sat', 'new_temp', 'new_bp', 'new_resp', 'new_map', 'new_fio2', 'new_hr', 'HospAdmTime', 'ICULOS']


In [35]:
target_col = ['SepsisLabel']

test_cols = columns_new + target_col


In [36]:
all_data_train = sepsis_data[test_cols]
all_data_test = sepsis_data_test[test_cols]

In [37]:
print(all_data_train.shape)
print(all_data_test.shape)

(1361664, 12)
(190615, 12)


In [38]:
all_data_train_df = all_data_train.to_csv ('fe_train.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path
all_data_test_df = all_data_test.to_csv ('fe_test.csv', index = None, header=True) #Don't forget to add '.csv' at the end of the path


In [39]:
all_data_test.head()

Unnamed: 0,Gender,new_age,new_o2sat,new_temp,new_bp,new_resp,new_map,new_fio2,new_hr,HospAdmTime,ICULOS,SepsisLabel
0,1,old,missing,Missing,Missing,missing,Missing,Missing,Missing,-43.69,1,0
1,1,old,normal,Missing,Missing,missing,abnormal,Missing,Missing,-43.69,2,0
2,1,old,normal,normal,Missing,missing,abnormal,Missing,Missing,-43.69,3,0
3,1,old,normal,Missing,Missing,missing,abnormal,Missing,Missing,-43.69,4,0
4,1,old,normal,normal,Missing,missing,normal,Missing,Missing,-43.69,5,0


In [40]:
X_train = sepsis_data[columns_new]
y_train = sepsis_data['SepsisLabel']

X_test = sepsis_data_test[columns_new]
y_test = sepsis_data_test['SepsisLabel']

In [41]:
X_train.head()

Unnamed: 0,Gender,new_age,new_o2sat,new_temp,new_bp,new_resp,new_map,new_fio2,new_hr,HospAdmTime,ICULOS
0,0,old,missing,Missing,Missing,missing,Missing,Missing,Missing,-0.03,1
1,0,old,normal,Missing,Missing,normal,normal,Missing,Missing,-0.03,2
2,0,old,normal,Missing,Missing,abnormal,normal,Missing,Missing,-0.03,3
3,0,old,normal,Missing,Missing,abnormal,Missing,Missing,Missing,-0.03,4
4,0,old,abnormal,Missing,Missing,abnormal,normal,normal,abnormal,-0.03,5


In [42]:
# Problem - Make sure all the items in the columns are oneHotencoded
#_ = scatter_matrix(X_train, figsize = [8,8], s=150, marker = 'D')

In [43]:
"""plt.figure()
sns.countplot(x = 'Gender', hue = 'SepsisLabel', data = sepsis_data)
plt.xticks([0,1], ['Female', 'Male'])
plt.show()"""

"plt.figure()\nsns.countplot(x = 'Gender', hue = 'SepsisLabel', data = sepsis_data)\nplt.xticks([0,1], ['Female', 'Male'])\nplt.show()"

In [44]:
sepsis_data['SepsisLabel'].value_counts()

0    1336649
1      25015
Name: SepsisLabel, dtype: int64

In [45]:
# Func 1
values = X_train.dtypes == object
values['Gender'] = True

In [46]:
# Func 2 - cat_pipeline
one_hot = OneHotEncoder( categories=None, drop=None, handle_unknown='ignore', sparse=True)
steps = [('onehotencoder', one_hot)]
one_hot_pipe = Pipeline(steps, memory=None, verbose=False)

one_hot_pipe

Pipeline(steps=[('onehotencoder',
                 OneHotEncoder(categories=None, handle_unknown='ignore'))])

In [47]:
# Func 3 - cont_scale_pipeline

steps_simp_scal =[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=np.nan, strategy='median',
                               verbose=0)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True))]
simp_scal_pipe = Pipeline(steps_simp_scal, memory=None, verbose=False)

In [48]:
# Func 4 - keep same

transform_col_pipe = make_column_transformer((simp_scal_pipe, 
                                                  ~values), 
                                                 (one_hot_pipe, values))
transform_col_pipe

ColumnTransformer(transformers=[('pipeline-1',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardscaler',
                                                  StandardScaler())]),
                                 Gender         False
new_age        False
new_o2sat      False
new_temp       False
new_bp         False
new_resp       False
new_map        False
new_fio2       False
new_hr         False
HospAdmTime     True
ICULOS          True
dtype: bool),
                                ('pipeline-2',
                                 Pipeline(steps=[('onehotencoder',
                                                  OneHotEncoder(categories=None,
                                                                handle_unknown='ignore'))]),
                                 Gender          True
new_age         True
new_o2sat       True


In [49]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1361664 entries, 0 to 1361671
Data columns (total 11 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Gender       1361664 non-null  int64  
 1   new_age      1361664 non-null  object 
 2   new_o2sat    1361664 non-null  object 
 3   new_temp     1361664 non-null  object 
 4   new_bp       1361664 non-null  object 
 5   new_resp     1361664 non-null  object 
 6   new_map      1361664 non-null  object 
 7   new_fio2     1361664 non-null  object 
 8   new_hr       1361664 non-null  object 
 9   HospAdmTime  1361664 non-null  float64
 10  ICULOS       1361664 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 124.7+ MB


In [51]:
svm_scale = make_pipeline(transform_col_pipe, SVC())

In [52]:
svm_scale

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  Gender         False
new_age        False
new_o2sat      False
new_temp       False
new_bp         False
new_resp       False
new_map        False
new_fio2       False
new_hr         False
HospAdmTime     True
ICULOS          True
dtype: bool),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(categories=None

In [53]:
parameteres = {'SVM__C':[0.001,0.1,10,100,10e5], 'SVM__gamma':[0.1,0.01]}

In [54]:
from sklearn.model_selection import GridSearchCV

In [55]:
grid = GridSearchCV(svm_scale, param_grid=parameteres, cv=5)

In [76]:
le=LabelEncoder()

In [77]:
for col in X_test.columns.values:
    # Encoding only categorical variables
    if X_test[col].dtypes=='object':
        # Using whole data to form an exhaustive list of levels
        data=X_train[col].append(X_test[col])
        le.fit(data.values)
        X_train[col]=le.transform(X_train[col])
        X_test[col]=le.transform(X_test[col])

In [78]:
X_train.head()

Unnamed: 0,Gender,new_age,new_o2sat,new_temp,new_bp,new_resp,new_map,new_fio2,new_hr,HospAdmTime,ICULOS
0,0,1,1,0,0,1,0,0,0,-0.03,1
1,0,1,2,0,0,2,2,0,0,-0.03,2
2,0,1,2,0,0,0,2,0,0,-0.03,3
3,0,1,2,0,0,0,0,0,0,-0.03,4
4,0,1,0,0,0,0,2,2,1,-0.03,5


In [79]:
X_train.tail()

Unnamed: 0,Gender,new_age,new_o2sat,new_temp,new_bp,new_resp,new_map,new_fio2,new_hr,HospAdmTime,ICULOS
1361667,0,1,1,0,0,2,2,0,0,-1.89,23
1361668,0,1,1,2,0,2,1,0,0,-1.89,24
1361669,0,1,2,0,0,2,1,0,0,-1.89,25
1361670,0,1,2,0,0,2,1,0,0,-1.89,26
1361671,0,1,2,0,4,2,2,0,0,-1.89,27


In [80]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190615 entries, 0 to 190614
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Gender       190615 non-null  int64  
 1   new_age      190615 non-null  int32  
 2   new_o2sat    190615 non-null  int32  
 3   new_temp     190615 non-null  int32  
 4   new_bp       190615 non-null  int32  
 5   new_resp     190615 non-null  int32  
 6   new_map      190615 non-null  int32  
 7   new_fio2     190615 non-null  int32  
 8   new_hr       190615 non-null  int32  
 9   HospAdmTime  190615 non-null  float64
 10  ICULOS       190615 non-null  int64  
dtypes: float64(1), int32(8), int64(2)
memory usage: 10.2 MB


In [81]:
log=LogisticRegression(penalty='l2',C=.01)

In [82]:
X_train.isnull().sum()

Gender         0
new_age        0
new_o2sat      0
new_temp       0
new_bp         0
new_resp       0
new_map        0
new_fio2       0
new_hr         0
HospAdmTime    0
ICULOS         0
dtype: int64

In [83]:
X_train = X_train.dropna(subset=['HospAdmTime'])

In [84]:
y_train.isnull().sum()

0

In [85]:
log.fit(X_train,y_train)

LogisticRegression(C=0.01)

In [86]:
accuracy_score(y_test,log.predict(X_test))

0.9848123180232405

In [87]:
knn=KNeighborsClassifier(n_neighbors=5)

In [88]:
knn.fit(X_train,y_train)

KNeighborsClassifier()

In [89]:
accuracy_score(y_test,knn.predict(X_test))

0.9802376518112426

In [90]:

dt = DecisionTreeClassifier()

In [91]:

dt.fit(X_train,y_train)

DecisionTreeClassifier()

In [92]:
accuracy_score(y_test,dt.predict(X_test))

0.9658159116543819

In [93]:

predicted_log = log.predict(X_test)
conf_matrix_log = confusion_matrix(y_test, predicted_log)

In [94]:

conf_matrix_log

array([[187713,      1],
       [  2894,      7]], dtype=int64)