In [1]:
# Data Management
import pandas as pd
import numpy as np

# Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid")


from sklearn.model_selection import train_test_split

# Hyperparameter selection
from sklearn.model_selection import GridSearchCV

# Feature Selection
from sklearn.feature_selection import SelectKBest, chi2, RFE

# Modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Model Evaluation
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix, roc_auc_score
import itertools


# Cleaning

In [2]:
data = pd.read_csv('Surgery_Data.csv')

In [3]:
def data_cleaning(data):

    # converting into binary variables
    data[['Risk1Yr', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32']] = \
    (data[[ 'Risk1Yr', 'PRE7', 'PRE8', 'PRE9', 'PRE10', 'PRE11', 'PRE17', 'PRE19', 'PRE25', 'PRE30', 'PRE32']] == 'T').astype(int)

    # converting into categorical variables
    data['DGN'] = data.DGN.str[-1:].astype(int)
    data['PRE6'] = data.PRE6.str[-1:].astype(int)
    data['PRE14'] = data.PRE14.str[-1:].astype(int)

    # renaming data variables
    col_names = {'Risk1Yr': 'Death_1yr', 'DGN': 'Diagnosis', 'PRE4': 'FVC', 'PRE5': 'FEV1', 'PRE6': 'Performance', 
                'PRE7': 'Pain', 'PRE8': 'Haemoptysis', 'PRE9': 'Dyspnoea', 'PRE10': 'Cough', 'PRE11': 'Weakness',
                'PRE14': 'Tumor_Size', 'PRE17': 'Diabetes_Mellitus', 'PRE19': 'MI_6mo', 'PRE25': 'PAD',
                'PRE30': 'Smoking', 'PRE32': 'Asthma', 'AGE': 'Age'}
    data = data.rename(index=str, columns=col_names)

    df = data[data.Age > 30]
    df = data[data.FEV1 < 8]

    print('The original data contains \033[1m' + str(len(data)) + '\033[0m observations, while after the applied changes there are \033[1m' + str(len(df)) + '\033[0m left.')
    
    return df

In [4]:
data = data_cleaning(data)

The original data contains [1m471[0m observations, while after the applied changes there are [1m456[0m left.


In [5]:
data.head()

Unnamed: 0,Death_1yr,Diagnosis,FVC,FEV1,Performance,Pain,Haemoptysis,Dyspnoea,Cough,Weakness,Tumor_Size,Diabetes_Mellitus,MI_6mo,PAD,Smoking,Asthma,Age
0,0,2,2.88,2.16,1,0,0,0,1,1,4,0,0,0,1,0,60
1,0,3,3.4,1.88,0,0,0,0,0,0,2,0,0,0,1,0,51
2,0,3,2.76,2.08,1,0,0,0,1,0,1,0,0,0,1,0,59
3,0,3,3.68,3.04,0,0,0,0,0,0,1,0,0,0,0,0,54
4,1,3,2.44,0.96,2,0,1,0,1,1,1,0,0,0,1,0,73


# X, y, train_test_split

In [6]:
X, y = data.drop(['Death_1yr', 'MI_6mo', 'Asthma'], axis=1), data['Death_1yr']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

# Initial models to be used

In [7]:
lr = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()

# Hyperparameters

In [8]:
def hyperpars(model, parameters):
    
    param_search = GridSearchCV(model, parameters)
    param_search.fit(X_train, y_train)
    
    return param_search.best_estimator_

**Hyperparameters for LogisticRegression()**

In [9]:
def logistic(model):
    
    reg = [0.1, 1, 10]
    weight = ['balanced', None]
    
    param_lr = dict(C=reg, class_weight=weight)
    lr_hyper = hyperpars(model, param_lr)
    
    print('\033[1m Selected hyperparameters for Logistic Regression are: \033[0m \n\n', lr_hyper)
    
    return lr_hyper

In [10]:
lr_hyper = logistic(lr)

[1m Selected hyperparameters for Logistic Regression are: [0m 

 LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


**Hyperparameters for DecisionTreeClassifier()**

In [11]:
def tree_forest(model):
    
    weight = ['balanced', None]
    criterion = ['gini', 'entropy']
    depth = [i for i in range(5,21,1)]
    samples = [i for i in range(20,500,10)]
    
    param_dt = dict(class_weight=weight, criterion=criterion, max_depth=depth, min_samples_leaf=samples)
    hyper = hyperpars(model, param_dt)
    
    print('\033[1m Selected hyperparameters for Logistic Regression are: \033[0m \n\n', hyper)
    
    return hyper

In [12]:
dt_hyper = tree_forest(dt)

[1m Selected hyperparameters for Logistic Regression are: [0m 

 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [13]:
rf_hyper = tree_forest(rf)

[1m Selected hyperparameters for Logistic Regression are: [0m 

 RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=100,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


# Feature Importance

In [14]:
def feature_selection(model):
    
    model.fit(X_train, y_train)
    
    if model==lr:
        print('\033[1m Default Logistic Regression \033[0m')
        relative_importances = pd.DataFrame(index=list(X), data=model.coef_[0], columns=["importance"])
        
    if model==lr_hyper:
        print('\033[1m Modified Logistic Regression \033[0m')
        relative_importances = pd.DataFrame(index=list(X), data=model.coef_[0], columns=["importance"])
        
    if model==dt:
        print('\033[1m Default Decision Tree \033[0m')
        relative_importances = pd.DataFrame(index=list(X), data=model.feature_importances_, columns=["importance"])
    
    if model==dt_hyper:
        print('\033[1m Modified Decision Tree \033[0m')
        relative_importances = pd.DataFrame(index=list(X), data=model.feature_importances_, columns=["importance"])
        
    if model==rf:
        print('\033[1m Default Random Forest \033[0m')
        relative_importances = pd.DataFrame(index=list(X), data=model.feature_importances_, columns=["importance"])
    
    if model==rf_hyper:
        print('\033[1m Modified Random Forest \033[0m')
        relative_importances = pd.DataFrame(index=list(X), data=model.feature_importances_, columns=["importance"])
    
    return relative_importances[relative_importances.importance>0.01]

In [15]:
feature_selection(lr)

[1m Default Logistic Regression [0m


Unnamed: 0,importance
Diagnosis,0.304212
FVC,0.082966
Performance,0.157403
Pain,0.466141
Haemoptysis,0.163557
Dyspnoea,0.655172
Cough,0.133426
Weakness,0.371484
Tumor_Size,0.527953
Diabetes_Mellitus,0.989599


In [16]:
feature_selection(lr_hyper)

[1m Modified Logistic Regression [0m


Unnamed: 0,importance
Diagnosis,0.304212
FVC,0.082966
Performance,0.157403
Pain,0.466141
Haemoptysis,0.163557
Dyspnoea,0.655172
Cough,0.133426
Weakness,0.371484
Tumor_Size,0.527953
Diabetes_Mellitus,0.989599


In [17]:
feature_selection(dt)

[1m Default Decision Tree [0m


Unnamed: 0,importance
Diagnosis,0.060218
FVC,0.173415
FEV1,0.275108
Performance,0.031587
Haemoptysis,0.036113
Dyspnoea,0.010917
Cough,0.045154
Tumor_Size,0.115499
Diabetes_Mellitus,0.024033
PAD,0.0137


In [18]:
feature_selection(dt_hyper)

[1m Modified Decision Tree [0m


Unnamed: 0,importance
Diagnosis,0.355261
FEV1,0.017528
Tumor_Size,0.350727
Diabetes_Mellitus,0.276484


In [19]:
feature_selection(rf)

[1m Default Random Forest [0m


Unnamed: 0,importance
Diagnosis,0.093818
FVC,0.209796
FEV1,0.172783
Performance,0.039512
Pain,0.026941
Haemoptysis,0.024254
Dyspnoea,0.02599
Cough,0.023649
Weakness,0.030346
Tumor_Size,0.086177


In [20]:
feature_selection(rf_hyper)

[1m Modified Random Forest [0m


Unnamed: 0,importance
FVC,0.2
FEV1,0.5
Age,0.1
