# Gaucher Disease Modeling

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from collections import Counter    
import re
from datetime import datetime

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import random
import xgboost as xgb

In [None]:
# Load the preprocessed export that every later cell works on.
# NOTE(review): the path is absolute and machine-specific, so the notebook
# only runs where this exact file exists.
data = pd.read_csv('/home/jovyan/amedes_challenge/data/interim/data_preprocessed.csv')

In [None]:
# One sub-frame per centre, selected by its ZENTRUM_ID code.
Berlin, Frankfurt, Hamburg, Stuttgart = (
    data[data.ZENTRUM_ID == centre_code]
    for centre_code in ('BER01', 'FRA01', 'HAM08', 'STR01')
)

# Processing of lab results

In [None]:
def lab_processing(center, ICD, min_count=2):
    """Reshape the free-text lab rows of the selected patients into one column per lab test.

    Only patients that have at least one row with ``SICHERHEIT == 'G'`` and
    ``ICD10 == ICD`` in ``center`` are kept.  Their ``TYP == 'Y'`` rows are
    split on ';' into single ``NAME=value`` entries (e.g.
    ``HKT=43.0 %; MCV=87.4 fl``), every value is mapped to a coarse flag
    ('Very low', 'Low', 'Normal', 'High', 'Very high', 'Negative',
    'Positive') and the flags are spread into one column per lab test.

    Parameters
    ----------
    center : pandas.DataFrame
        Must contain the columns PATIENT_HASH, ZENTRUM_ID, PATIENT_ID,
        PAT_GEBDATUM, PAT_GESCHLECHT, DATUM, TYP, TYP_EXT, TEXT, ICD10 and
        SICHERHEIT.
    ICD : str
        ICD-10 code that selects the patients of interest.
    min_count : int, optional
        A lab test is kept only if it occurs at least this many times among
        the parsed entries.  The default of 2 keeps tests seen more than once.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of the grouping keys (patient,
        centre, date, record type, original text) with the columns
        PATIENT_HASH, ZENTRUM_ID, DATUM, TYP, TEXT followed by one column per
        retained lab test holding its flag (NaN where the test is absent).
        An empty frame with only the five key columns is returned when there
        is nothing to parse.
    """
    key_columns = ['PATIENT_HASH', 'ZENTRUM_ID', 'DATUM', 'TYP', 'TEXT']

    def empty_result():
        return pd.DataFrame(columns=key_columns)

    def infer(result):
        # Cut off a trailing reference interval such as "(150-400)" so that
        # its '-' is not mistaken for a "low" marker.
        result = result.split('(', 1)[0]
        if '--' in result:
            return 'Very low'
        if '-' in result:
            return 'Low'
        if '++' in result:
            return 'Very high'
        if '+' in result:
            return 'High'
        lowered = result.lower()
        if 'negativ' in lowered:
            return 'Negative'
        if 'positiv' in lowered:
            return 'Positive'
        return 'Normal'

    selected = (center.SICHERHEIT == 'G') & (center.ICD10 == ICD)
    patients = center.loc[selected, 'PATIENT_HASH'].unique()
    center = center[center.PATIENT_HASH.isin(patients)]

    Y = center[center['TYP'] == 'Y'].dropna(subset=["TEXT"]).drop_duplicates()
    if Y.empty:
        return empty_result()

    # Split rows holding several tests separated by ';'.  The untouched text
    # is kept in the index (as 'text') so it can be used as a merge key later.
    Y = Y.assign(text=Y['TEXT'])
    Y = Y.set_index(['PATIENT_HASH', 'ZENTRUM_ID', 'PATIENT_ID', 'PAT_GEBDATUM',
                     'PAT_GESCHLECHT', 'DATUM', 'TYP', 'TYP_EXT', 'text', 'ICD10',
                     'SICHERHEIT'])
    Y = Y['TEXT'].str.split(';').explode().reset_index()
    # lstrip() does not depend on the pandas-version-specific `regex` default
    # of Series.str.replace.
    Y['TEXT'] = Y['TEXT'].str.lstrip()

    # Keep entries of the form LEUKO=4.5; copy so that new columns are not
    # written onto a slice of Y.
    Y_2 = Y[Y['TEXT'].str.contains(r'[a-zA-Z0-9\s]+=[a-zA-Z0-9\s]+', na=False)].copy()
    if Y_2.empty:
        return empty_result()
    parts = Y_2['TEXT'].str.split('=', n=1, expand=True)
    Y_2['Lab_test'] = parts[0]
    Y_2['Result'] = parts[1]
    Y_2['Inference'] = Y_2['Result'].map(infer)

    # Drop tests that occur fewer than `min_count` times.
    counts = Y_2['Lab_test'].value_counts()
    frequent = counts[counts >= min_count].index
    Y_new = Y_2[Y_2['Lab_test'].isin(frequent)]
    if Y_new.empty:
        return empty_result()

    # One output row per group, one column per lab test.  As with any pandas
    # groupby, rows with a missing value in one of the keys are left out.
    columns = ['PATIENT_HASH', 'ZENTRUM_ID', 'PATIENT_ID', 'PAT_GEBDATUM',
               'PAT_GESCHLECHT', 'DATUM', 'TYP', 'text']
    records = []
    for keys, group in Y_new.groupby(columns):
        record = dict(zip(columns, keys))
        # A test repeated inside one group keeps its last flag.
        record.update(zip(group['Lab_test'], group['Inference']))
        records.append(record)
    if not records:
        return empty_result()

    wide = pd.DataFrame(records)
    return wide.drop(columns=['PATIENT_ID', 'PAT_GEBDATUM', 'PAT_GESCHLECHT']) \
               .rename(columns={"text": "TEXT"})


# Gaucher disease 

In [None]:
# ICD-10 code under study.
# NOTE(review): a later cell defines a function that is also named ICD and
# overwrites this string once it has been run.
ICD = 'E75.22'
# Each centre frame is replaced by its processed lab table.  The processed
# table no longer has a SICHERHEIT column, so this cell cannot be re-run
# without first re-creating the raw centre frames.
Berlin = lab_processing(Berlin,ICD)
print("Berlin done")
Frankfurt = lab_processing(Frankfurt,ICD)
print("Frankfurt done")
Hamburg = lab_processing(Hamburg,ICD)
print('Hamburg done')
Stuttgart = lab_processing(Stuttgart,ICD)
print("Stuttgart done")

In [None]:
# Stack the processed lab tables of all centres; the original row index of
# each table is kept, so index values repeat across centres.
centers = [Berlin, Frankfurt, Hamburg, Stuttgart]
Y_new = pd.concat(centers)

In [None]:
# Patients having at least one row flagged SICHERHEIT == 'G' for the studied code.
patients = data.loc[(data.SICHERHEIT == 'G') & (data.ICD10 == ICD), 'PATIENT_HASH'].unique()
# All records of those patients, with the per-test lab flags attached and
# columns that are empty throughout removed.
diagnosed = (
    data[data.PATIENT_HASH.isin(patients)]
    .merge(Y_new[Y_new.PATIENT_HASH.isin(patients)],
           how='left', on=['PATIENT_HASH', 'ZENTRUM_ID', 'DATUM', 'TYP', 'TEXT'])
    .dropna(axis=1, how='all')
)

## Lab test results

In [None]:
# Per patient, concatenate the flags of every lab column ("Normal" and missing
# values become empty strings, so only abnormal flags remain).
# NOTE(review): the lab columns are taken by position (from column 19 on).
x = (
    diagnosed[diagnosed.TYP == "Y"]
    .replace("Normal", "")
    .fillna("")
    .groupby('PATIENT_HASH')[np.array(diagnosed.columns[19:])]
    .sum()
    .replace(0, np.nan)
    .dropna(how='all', axis=0)
    .reset_index()
)


def value(x):
    """Return the flag that occurs most often in the concatenated string ``x``.

    An empty string is returned unchanged; ties go to the flag listed first.
    """
    if x == "":
        return x
    labels = ['Low', 'High', 'Very low', 'Very high', 'Normal', 'Positive', 'Negative']
    return max(labels, key=x.count)


y = pd.concat([x.PATIENT_HASH, x.drop('PATIENT_HASH', axis=1).applymap(value)], axis=1)
# Concatenate the per-patient dominant flags of every lab test into one string per test.
y = pd.DataFrame(data=y.drop('PATIENT_HASH', axis=1).sum(axis=0), columns=['occurences'])

In [None]:
# How often each flag occurs in the concatenated string of a lab test.
y['High'] = y['occurences'].astype(str).str.count('High')
y['Low'] = y['occurences'].astype(str).str.count('Low')
y['Positive'] = y['occurences'].astype(str).str.count('Positive')
y['Negative'] = y['occurences'].astype(str).str.count('Negative')
# '#': number of flagged patients, '%': share of the most frequent flag,
# 'Dominant': that flag (the first one listed wins a tie).
y['#'] = y[['High', 'Low', 'Positive', 'Negative']].sum(axis=1)
y['%'] = y[['High', 'Low', 'Positive', 'Negative']].max(axis=1) / y["#"]
y['Dominant'] = y[['High', 'Low', 'Positive', 'Negative']].apply(
    lambda counts: counts.index[counts.argmax()], axis=1)

In [None]:
# Lab tests flagged for more than 5 patients where one flag accounts for more
# than 60 % of the flags, most frequent first.
y[(y['#']>5) & (y["%"]>0.6)].sort_values(by="#", ascending=False)

In [None]:
# Dominant flag of every lab test that passes the same filter as the table
# above, as a Series indexed by test name.  A Series has no `columns`, so it
# is labelled through its name.
relevant_tests = y[(y['#']>5) & (y["%"]>0.6)]['Dominant'].rename('value')

In [None]:
# Display the selected tests together with their dominant flag.
relevant_tests

In [None]:
# Names of the selected lab tests.
np.array(relevant_tests.index)

In [None]:
def relevant_test(row):
    """Count the selected lab tests whose value in ``row`` equals the dominant flag.

    The tests and their dominant flags are read from the notebook-level
    Series ``relevant_tests``.
    """
    return sum(
        1
        for test_name in relevant_tests.index
        if row[test_name] == relevant_tests.loc[test_name]
    )


# One count per record of the diagnosed patients.
diagnosed.loc[:, "relevant_tests"] = diagnosed.apply(relevant_test, axis=1)

In [None]:
# Total number of matching relevant-test results per patient.
diagnosed.groupby('PATIENT_HASH')['relevant_tests'].sum()

## Co-morbidity

In [None]:
# Fill in missing ICD-10 codes from the free text of diagnosis rows.
# NOTE(review): this function has the same name as the ICD string defined in
# an earlier cell and overwrites it once this cell has been run.
def ICD(row):
    """Return the ICD-10 code of a record, inferred from its text when missing.

    Rows whose ``TYP`` is neither '*' nor 'D', rows that already carry a code
    and rows without a usable text return ``row['ICD10']`` unchanged.
    Otherwise the first keyword rule matching ``row['TEXT']`` decides the
    code; if none matches, the (missing) code is returned as is.
    """
    code = row['ICD10']
    if row['TYP'] not in ('*', 'D') or not pd.isnull(code):
        return code

    text = row['TEXT']
    if not isinstance(text, str):
        # A missing text (NaN) cannot be searched.
        return code

    # Checked in order; every keyword of a rule must occur in the text.
    rules = (
        (('Hyperton', 'art'), 'I10.90'),
        (('Hepatitis C',), 'B18.2'),
        (('Hypothyreose nach medizinischen Maßnahmen',), 'E89.0'),
        (('Sterilität beim Mann',), 'N46'),
        (('3-Gefäß-KHK',), 'I25.13'),
        (('Vit', 'D', 'Mangel'), 'E55.9'),
        (('Anämie',), 'D64.9'),
        (('fatigue', 'yndrom'), 'G93.3'),
        (('Obstruktive Bronchitis',), 'J44.89'),
    )
    for keywords, inferred_code in rules:
        if all(keyword in text for keyword in keywords):
            return inferred_code
    return code
# Replace the ICD10 column by the row-wise result of ICD (existing codes are
# returned unchanged by that function).
diagnosed['ICD10'] = diagnosed.apply(ICD, axis=1)

In [None]:
# Number of diagnosis rows (TYP '*' or 'D') per ICD-10 code, most frequent first.
s = (
    diagnosed.loc[diagnosed.TYP.isin(["*", "D"]), ['PATIENT_HASH', 'ICD10']]
    .groupby('ICD10')['PATIENT_HASH']
    .count()
    .reset_index()
    .sort_values(by='PATIENT_HASH', ascending=False)
)
# Show the codes recorded more than three times.
s[s.PATIENT_HASH > 3]

- E55.9: Vitamin D deficiency
- I10.90: Hypertension
- D69.61: Thrombocytopenia (abnormally low level of platelets in the blood)
- G93.3: Fatigue syndrome
- R16.1: Splenomegaly

In [None]:
# ICD-10 codes treated as co-morbidities, each listed once.
coMorbidity = ['E55.9', 'I10.90', 'D69.61', 'G93.3', 'R16.1']
# Number of distinct co-morbidity codes recorded per patient.
diagnosed[diagnosed.ICD10.isin(coMorbidity)].groupby('PATIENT_HASH')['ICD10'].nunique().reset_index()

## Symptoms

In [None]:
symptoms = ["müde","Fatique","Erschöpfung", "fatigue", "Fatigue", # fatigue
            "Knochenschmerzen","Knochenstoffwechselstörung", # bone pain
            "Milzläsionen", "Splenomegalie","splenomegalie", "Splenektomie", "Hepatosplenomegalie", # splenomegaly
            "Thrombopenie","Thrombozytopenie", "Chololithiasis","Chitotriosidase",
            "Anämie","Leukopenie","Panzytopenie","Niereninsuffizienz","Nephrolithiasis"]
# Number of keywords found in each record's text.  Values that are not text
# (e.g. NaN) count as 0 instead of raising.
diagnosed['is_symptom'] = diagnosed.TEXT.apply(
    lambda text: sum(keyword in text for keyword in symptoms) if isinstance(text, str) else 0
)

In [None]:
# Records mentioning at least one symptom keyword, strongest matches first.
a = diagnosed[diagnosed.is_symptom > 0].sort_values(by="is_symptom", ascending=False)
# Total keyword hits per patient.
a.groupby('PATIENT_HASH')['is_symptom'].sum().reset_index()

## Modelling

In [None]:
def lab_results_processing(data, min_count=2):
    """Reshape the free-text lab rows of ``data`` into one column per lab test.

    The ``TYP == 'Y'`` rows are split on ';' into single ``NAME=value``
    entries (e.g. ``HKT=43.0 %; MCV=87.4 fl``), every value is mapped to a
    coarse flag ('Very low', 'Low', 'Normal', 'High', 'Very high',
    'Negative', 'Positive') and the flags are spread into one column per lab
    test.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns PATIENT_HASH, ZENTRUM_ID, PATIENT_ID,
        PAT_GEBDATUM, PAT_GESCHLECHT, DATUM, TYP, TYP_EXT, TEXT, ICD10 and
        SICHERHEIT.
    min_count : int, optional
        A lab test is kept only if it occurs at least this many times among
        the parsed entries.  The default of 2 keeps tests seen more than once.

    Returns
    -------
    pandas.DataFrame
        One row per distinct combination of the grouping keys (patient,
        centre, date, record type, original text) with the columns
        PATIENT_HASH, ZENTRUM_ID, DATUM, TYP, TEXT followed by one column per
        retained lab test holding its flag (NaN where the test is absent).
        An empty frame with only the five key columns is returned when there
        is nothing to parse.
    """
    key_columns = ['PATIENT_HASH', 'ZENTRUM_ID', 'DATUM', 'TYP', 'TEXT']

    def empty_result():
        return pd.DataFrame(columns=key_columns)

    def infer(result):
        # Cut off a trailing reference interval such as "(150-400)" so that
        # its '-' is not mistaken for a "low" marker.
        result = result.split('(', 1)[0]
        if '--' in result:
            return 'Very low'
        if '-' in result:
            return 'Low'
        if '++' in result:
            return 'Very high'
        if '+' in result:
            return 'High'
        lowered = result.lower()
        if 'negativ' in lowered:
            return 'Negative'
        if 'positiv' in lowered:
            return 'Positive'
        return 'Normal'

    Y = data[data['TYP'] == 'Y'].dropna(subset=["TEXT"]).drop_duplicates()
    if Y.empty:
        return empty_result()

    # Split rows holding several tests separated by ';'.  The untouched text
    # is kept in the index (as 'text') so it can be used as a merge key later.
    Y = Y.assign(text=Y['TEXT'])
    Y = Y.set_index(['PATIENT_HASH', 'ZENTRUM_ID', 'PATIENT_ID', 'PAT_GEBDATUM',
                     'PAT_GESCHLECHT', 'DATUM', 'TYP', 'TYP_EXT', 'text', 'ICD10',
                     'SICHERHEIT'])
    Y = Y['TEXT'].str.split(';').explode().reset_index()
    # lstrip() does not depend on the pandas-version-specific `regex` default
    # of Series.str.replace.
    Y['TEXT'] = Y['TEXT'].str.lstrip()

    # Keep entries of the form LEUKO=4.5; copy so that new columns are not
    # written onto a slice of Y.
    Y_2 = Y[Y['TEXT'].str.contains(r'[a-zA-Z0-9\s]+=[a-zA-Z0-9\s]+', na=False)].copy()
    if Y_2.empty:
        return empty_result()
    parts = Y_2['TEXT'].str.split('=', n=1, expand=True)
    Y_2['Lab_test'] = parts[0]
    Y_2['Result'] = parts[1]
    Y_2['Inference'] = Y_2['Result'].map(infer)

    # Drop tests that occur fewer than `min_count` times.
    counts = Y_2['Lab_test'].value_counts()
    frequent = counts[counts >= min_count].index
    Y_new = Y_2[Y_2['Lab_test'].isin(frequent)]
    if Y_new.empty:
        return empty_result()

    # One output row per group, one column per lab test.  As with any pandas
    # groupby, rows with a missing value in one of the keys are left out.
    columns = ['PATIENT_HASH', 'ZENTRUM_ID', 'PATIENT_ID', 'PAT_GEBDATUM',
               'PAT_GESCHLECHT', 'DATUM', 'TYP', 'text']
    records = []
    for keys, group in Y_new.groupby(columns):
        record = dict(zip(columns, keys))
        # A test repeated inside one group keeps its last flag.
        record.update(zip(group['Lab_test'], group['Inference']))
        records.append(record)
    if not records:
        return empty_result()

    wide = pd.DataFrame(records)
    return wide.drop(columns=['PATIENT_ID', 'PAT_GEBDATUM', 'PAT_GESCHLECHT']) \
               .rename(columns={"text": "TEXT"})


In [None]:
def modelling(data, k):
    """Build a per-patient feature table and cross-validate several classifiers.

    The positives are the patients with a row flagged ``SICHERHEIT == 'G'``
    and ``ICD10 == 'E75.22'``; ``k`` further patients are drawn at random from
    those that are not positives and have ``is_E75.22 == 0``.  For every
    patient the features are sex, mean age, one-hot encoded flags of a fixed
    list of lab tests, co-morbidity indicators and symptom keyword counts.
    Each classifier is evaluated with 10-fold cross-validation and its
    accuracy and misclassification counts are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns used by ``lab_results_processing`` plus
        ``is_E75.22`` and ``age``.
    k : int
        Number of non-Gaucher patients to sample.  If fewer candidates exist,
        all of them are used.

    Returns
    -------
    tuple
        ``(X, y)``: the feature frame and the 0/1 ``gaucher`` label Series.
    """
    from sklearn.preprocessing import OneHotEncoder

    patients = data[(data.SICHERHEIT == 'G') & (data.ICD10 == 'E75.22')].PATIENT_HASH.unique()

    candidates = data[(~data.PATIENT_HASH.isin(patients))
                      & (data['is_E75.22'] == 0)].PATIENT_HASH.unique()
    # random.sample draws WITHOUT replacement.  random.choices may return the
    # same patient several times, which leaves fewer than k distinct controls.
    controls = random.sample(list(candidates), min(k, len(candidates)))
    subset_patients = np.concatenate((np.array(controls, dtype=object), patients))
    subset = data[data.PATIENT_HASH.isin(subset_patients)].reset_index(drop=True)

    Y_new = lab_results_processing(subset)

    subset = subset.merge(Y_new, how='left', on=['PATIENT_HASH','ZENTRUM_ID','DATUM','TYP','TEXT'])
    subset = subset.dropna(axis=1, how='all')

    # relevant tests
    relevantTests = ['ERY', 'HB', 'HKT', 'MCH', 'THRO', 'AP', 'GGT', 'FERR', 'NEUT','FKAP', 'FKALAQ', 'CRP',
                     'INSU', 'OSTE', 'VD25', 'TFS', 'ALBUMA','A1GLOA', 'A2GLOA', 'DPD']
    # reindex creates an empty column for every listed test that does not
    # occur in this sample, instead of failing with a KeyError.
    x = subset.reindex(columns=['PATIENT_HASH', 'TYP'] + relevantTests)
    x = x[x.TYP == "Y"].fillna("") \
        .groupby('PATIENT_HASH')[relevantTests].sum() \
        .dropna(how='all', axis=0).reset_index()

    def value(flags):
        # `flags` is the concatenation of all flags of one patient and test;
        # ties go to the flag listed first.
        if flags == "":
            return "no_test"
        return max(['Low', 'High', 'Very low', 'Very high', 'Normal'], key=flags.count)

    test = pd.concat([x.PATIENT_HASH, x.drop('PATIENT_HASH', axis=1).applymap(value)], axis=1)

    enc = OneHotEncoder()
    encoded = enc.fit_transform(test.drop('PATIENT_HASH', axis=1)).toarray()
    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases; use whichever this installation provides.
    if hasattr(enc, 'get_feature_names_out'):
        feature_names = enc.get_feature_names_out(relevantTests)
    else:
        feature_names = enc.get_feature_names(relevantTests)
    tests = pd.concat([test.PATIENT_HASH, pd.DataFrame(data=encoded, columns=feature_names)], axis=1)

    # co-morbidity
    def infer_icd10(row):
        # Fill a missing code of a diagnosis row from keywords in its text.
        code = row['ICD10']
        if row['TYP'] not in ('*', 'D') or not pd.isnull(code):
            return code
        text = row['TEXT']
        if not isinstance(text, str):
            return code
        rules = (
            (('Hyperton', 'art'), 'I10.90'),
            (('Hepatitis C',), 'B18.2'),
            (('Hypothyreose nach medizinischen Maßnahmen',), 'E89.0'),
            (('Sterilität beim Mann',), 'N46'),
            (('3-Gefäß-KHK',), 'I25.13'),
            (('Vit', 'D', 'Mangel'), 'E55.9'),
            (('Anämie',), 'D64.9'),
            (('fatigue', 'yndrom'), 'G93.3'),
            (('Obstruktive Bronchitis',), 'J44.89'),
        )
        for keywords, inferred_code in rules:
            if all(keyword in text for keyword in keywords):
                return inferred_code
        return code

    subset['ICD10'] = subset.apply(infer_icd10, axis=1)
    codes = ['E55.9','I10.90','D69.61','G93.3','R16.1']

    # Indicators are built for ALL sampled patients (as for the symptoms
    # below), not only for those that have lab rows.
    coMorbidity = pd.DataFrame(data=subset.PATIENT_HASH.unique(), columns=['PATIENT_HASH'])
    for code in codes:
        with_code = subset.loc[subset.ICD10 == code, 'PATIENT_HASH'].unique()
        coMorbidity[code] = coMorbidity.PATIENT_HASH.isin(with_code).astype(float)

    # symptoms: each inner list is one symptom, named after its first keyword
    words = [["Fatigue", "müde","Fatique","Erschöpfung", "fatigue"], # fatigue
                ["Knochenschmerzen","Knochenstoffwechselstörung"], # bone pain
                ["Splenomegalie","Milzläsionen", "splenomegalie", "Splenektomie", "Hepatosplenomegalie"], # splenomegaly
                ["Thrombopenie","Thrombozytopenie"], ["Chololithiasis"],["Chitotriosidase"],
                ["Anämie"],["Leukopenie"],["Panzytopenie"],["Niereninsuffizienz"],["Nephrolithiasis"]]

    symptoms = pd.DataFrame(data = subset.PATIENT_HASH.unique(), columns = ['PATIENT_HASH'])

    for word in words:
        # Rows whose text mentions one of the keywords; non-text values never match.
        mentioned = subset.TEXT.apply(lambda text: isinstance(text, str) and any(t in text for t in word))
        hits = subset[mentioned].groupby('PATIENT_HASH')['PATIENT_ID'].count()
        symptoms[word[0]] = symptoms.PATIENT_HASH.map(hits).fillna(0)

    # age
    age = subset.groupby('PATIENT_HASH')['age'].mean().reset_index()

    dataset = subset[['PATIENT_HASH','PAT_GESCHLECHT']].drop_duplicates().replace("W",1).replace("M",0).reset_index(drop=True)
    dataset['gaucher'] = dataset.PATIENT_HASH.isin(patients)*1
    dataset = pd.merge(dataset, age , how="left", on='PATIENT_HASH')
    dataset = pd.merge(dataset, tests , how="left", on='PATIENT_HASH')
    dataset = pd.merge(dataset, coMorbidity , how="left", on='PATIENT_HASH')
    dataset = pd.merge(dataset, symptoms , how="left", on='PATIENT_HASH')
    dataset = dataset.fillna(0).drop('PATIENT_HASH', axis=1)

    X = dataset.drop('gaucher', axis=1)
    y = dataset.gaucher

    names = ["Nearest Neighbors", "Logistic Regression",
             "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
             "Naive Bayes"]

    classifiers = [
        KNeighborsClassifier(3),
        LogisticRegression(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        MLPClassifier(),
        AdaBoostClassifier(),
        GaussianNB()]

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        scores = cross_val_score(clf, X, y, cv=10)
        y_pred = cross_val_predict(clf, X, y, cv=10)
        # Fixed labels keep the matrix 2x2 even if one class is never seen.
        CM = confusion_matrix(y, y_pred, labels=[0, 1])
        (TN, FP), (FN, TP) = CM
        print(name,np.round(np.mean(scores),3))
        print("      Positives: ", 100*round(FN/(TP+FN),2), "% misclassifed     ", FN, '/',TP+FN)
        print("      Negatives: ", 100*round(FP/(TN+FP),2), "% misclassifed     ",FP, '/',TN+FP)

    return X,y

In [None]:
# Evaluate all classifiers on the Gaucher patients plus 32 random draws from
# the non-Gaucher patients (k=32); the returned (X, y) is only displayed.
modelling(data, 32)

In [None]:
# Same evaluation with 64 random draws from the non-Gaucher patients (k=64).
modelling(data, 64)

In [None]:
# Same evaluation with 96 random draws from the non-Gaucher patients (k=96).
modelling(data, 96)

In [None]:
# Same evaluation with 128 random draws from the non-Gaucher patients (k=128).
modelling(data, 128)

In [None]:
# Same evaluation with 160 random draws from the non-Gaucher patients (k=160).
modelling(data, 160)

In [None]:
# Re-create the raw per-centre frames from the full data set.
Berlin, Frankfurt, Hamburg, Stuttgart = (
    data[data.ZENTRUM_ID == centre_code]
    for centre_code in ('BER01', 'FRA01', 'HAM08', 'STR01')
)

# Share of each centre's patients that have a 'G'-flagged E75.22 row, in percent.
for name, center in (('Berlin', Berlin), ('Frankfurt', Frankfurt),
                     ('Hamburg', Hamburg), ('Stuttgart', Stuttgart)):
    print(name,": ",
          100 * round(center.loc[(center.SICHERHEIT == 'G') & (center.ICD10 == 'E75.22'),
                                 'PATIENT_HASH'].nunique()
                      / center.PATIENT_HASH.nunique(), 5),
         '% Gaucher')

In [None]:
# NOTE(review): presumably the cohort size at which 32 Gaucher patients match
# a prevalence of 0.017 % — confirm where 32 and 0.00017 come from.
32/0.00017

In [None]:
# Same evaluation with 1880 random draws from the non-Gaucher patients
# (k=1880); X and y are kept for the tuning cells below.
X,y = modelling(data, 1880)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified folds keep the rare positive class represented in every test
# fold; with plain KFold a fold can end up without any positive sample.
crossvalidation=StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
ada=AdaBoostClassifier()
search_grid={'n_estimators':[200,500,1000,2000],'learning_rate':[.001,.01,.1,.2]}
# 'recall' scores the positive class only.  'recall_weighted' is the
# support-weighted mean over both classes, which equals plain accuracy and is
# dominated by the many negatives.
search=GridSearchCV(estimator=ada,param_grid=search_grid,scoring='recall',n_jobs=1,cv=crossvalidation)
search.fit(X,y)
search.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination.
search.best_score_

recall_score: measures the ability of a classifier to find all the positive samples

In [None]:
# Cross-validated evaluation of AdaBoost with fixed hyper-parameters.
name = "AdaBoost"
clf = AdaBoostClassifier(learning_rate=0.1, n_estimators=200)
scores = cross_val_score(clf, X, y, cv=10)
y_pred = cross_val_predict(clf, X, y, cv=10)
CM = confusion_matrix(y, y_pred)
# Rows are the true class, columns the predicted class.
(TN, FP), (FN, TP) = CM
print(name,np.round(np.mean(scores),3))
print("      Positives: ", 100*round(FN/(TP+FN),2), "% misclassifed     ", FN, '/',TP+FN)
print("      Negatives: ", 100*round(FP/(TN+FP),2), "% misclassifed     ",FP, '/',TN+FP)

### Imbalanced dataset

In [None]:
import imblearn
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

In [None]:
# Baseline: recall of a default AdaBoost model, averaged over 3 repeats of
# stratified 10-fold cross-validation.
model = AdaBoostClassifier()
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
print("baseline", cross_val_score(model, X, y, scoring='recall', cv=cv, n_jobs=-1).mean())

In [None]:
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
# Out-of-fold predictions of the baseline model; this call uses cv=10, not
# the repeated splitter defined above.
y_pred = cross_val_predict(model, X, y, cv=10, n_jobs=-1)
CM = confusion_matrix(y, y_pred)
# Rows are the true class, columns the predicted class.
(TN, FP), (FN, TP) = CM
print("      Positives: ", 100*round(FN/(TP+FN),2), "% misclassifed     ", FN, '/',TP+FN)
print("      Negatives: ", 100*round(FP/(TN+FP),2), "% misclassifed     ",FP, '/',TN+FP)

In [None]:
# Resample with SMOTEENN inside the pipeline, i.e. as a step that is fitted
# together with the model.
resample = SMOTEENN()
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
# Recall over 3 repeats of stratified 10-fold cross-validation.
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='recall', cv=cv, n_jobs=-1)
print(f'Score: {np.mean(scores):.3f}')

In [None]:
# Out-of-fold predictions of the resampling pipeline.
y_pred = cross_val_predict(pipeline, X, y, cv=10, n_jobs=-1)
CM = confusion_matrix(y, y_pred)
# Rows are the true class, columns the predicted class.
(TN, FP), (FN, TP) = CM
# `name` and `scores` come from earlier cells.
print(name,np.round(np.mean(scores),3))
print("      Positives: ", 100*round(FN/(TP+FN),2), "% misclassifed     ", FN, '/',TP+FN)
print("      Negatives: ", 100*round(FP/(TN+FP),2), "% misclassifed     ",FP, '/',TN+FP)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Stratified folds keep the rare positive class represented in every test
# fold; recall is not meaningful on a fold without positive samples.
crossvalidation=StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
# The 'm__' prefix addresses the AdaBoost step ('m') of the pipeline.
search_grid={'m__n_estimators':[200,500,1000,2000],
             'm__learning_rate':[.001,.01,.1,.2]}
search=GridSearchCV(pipeline,param_grid=search_grid,scoring='recall',n_jobs=1,cv=crossvalidation)
search.fit(X,y)
search.best_params_

In [None]:
# Resampling pipeline with hand-picked AdaBoost hyper-parameters.
model = AdaBoostClassifier(learning_rate=0.001,n_estimators=200)
pipeline = Pipeline(steps=[('r', resample), ('m', model)])
y_pred = cross_val_predict(pipeline, X, y,  cv=10, n_jobs=-1)
CM = confusion_matrix(y, y_pred)
# Rows are the true class, columns the predicted class.
(TN, FP), (FN, TP) = CM
# Report the recall of THIS run.  `scores` still holds the values of an
# earlier pipeline and would misreport the tuned model.
print(name,np.round(TP/(TP+FN),3))
print("      Positives: ", 100*round(FN/(TP+FN),2), "% misclassifed     ", FN, '/',TP+FN)
print("      Negatives: ", 100*round(FP/(TN+FP),2), "% misclassifed     ",FP, '/',TN+FP)