In [2]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import seaborn as sns
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.inspection import permutation_importance



# Load the prenatal feature table. The export marks missing cells with the
# literal string '#NULL!'; turn those into NaN and keep only complete rows.
df = pd.read_csv('premature_birth/prenatal_features.csv')
df = df.replace('#NULL!', np.nan)
df = df.dropna(how='any')

# Plot the class balance of the target column.
plt.figure(figsize=(12, 6))
# Pass the series as `x=` explicitly: newer seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=df['premature_birth'], palette='Blues')
plt.title('Dependent variable distribution plot')
plt.xlabel('Premature Birth')
# A bare expression in the middle of a cell is never displayed, so print it.
print(df['premature_birth'].value_counts())
plt.show()

# Keep a handle on the frame as loaded (before the numeric cast) for the
# feature-subset selections further down.
df_all = df
# `np.float` was only an alias of the builtin and was removed in NumPy 1.24.
df = df.astype(float)
print("SHAPE")
print(df.shape)


# Pairwise correlation between every column of the numeric frame.
corr = df.corr()

# Boolean mask that hides the upper triangle (diagonal included) so each
# pair is drawn once. `np.bool` was removed in NumPy 1.24; the builtin
# `bool` is the equivalent dtype.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 15))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# vmax=.3 saturates the colour scale at |r| = 0.3 (centred on 0).
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

plt.show()

# Candidate hyper-parameter values for the randomized search.

# Forest sizes: 200, 400, ..., 2000 trees.
n_estimators = list(range(200, 2001, 200))
# Defined for reference; not part of `random_grid` below.
max_features = ['auto', 'sqrt']

# Tree depth limits: 10, 20, ..., 110, plus None for "grow until pure".
max_depth = list(range(10, 111, 10)) + [None]

# Minimum number of samples needed to split an internal node.
min_samples_split = [2, 5, 10]
# Minimum number of samples that must end up in a leaf.
min_samples_leaf = [1, 2, 4]

# Boosting-specific candidates; not part of `random_grid` below.
losses = ['deviance', 'exponential']
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]

# Search space handed to the randomized search. `class_weight` has a single
# candidate, so every sampled configuration uses it.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    class_weight=[{0: 1, 1: 9}],
)
print(random_grid)


# Build the column subsets used for the experiments.
#
# `df_all` is cast to float once, up front, so that every subset sliced from
# it is numeric (the original cast happened only after the slices were
# taken). `np.float` was removed in NumPy 1.24; the builtin `float` is the
# equivalent dtype.
df_all = df_all.astype(float)

# use just genetics
prenatal_features = [
    'TNFa1', 'TNFa2', 'TNFa3', 'TNFa4', 'TNFa5',
    'FOXP3.1', 'FOXP3.2', 'FOXP3.3', 'FOXP3.4', 'FOXP3.5', 'FOXP3.6', 'FOXP3.7',
    'OXTRmeth', 'premature_birth',
]
df_genetics = df[prenatal_features]

# just psychological
prenatal_features = [
    'total_score_phq', 'epds_score', 'idas_total', 'idas_gen_depression',
    'idas_dysphoria', 'idas_lassitude', 'idas_suicidality', 'idas_appet_gain',
    'idas_appet_loss', 'idas_well_being', 'idas_ill_temper', 'idas_mania',
    'idas_euphoria', 'idas_panic', 'idas_social_anxiety', 'idas_claustrohobia',
    'idas_traumatic_intrusions', 'idas_traumatic_avoidance', 'idas_checking',
    'idas_cleaning', 'idas_ordering', 'total_score_gad_7', 'bas_total',
    'acs_total', 'eds_total', 'ehm_total', 'premature_birth',
]
df_psychological = df[prenatal_features]

# stress genes
stress_genes = [
    'BDNF_CpG1', 'BDNF_CpG2', 'BDNF_CpG3', 'BDNF_CpG4', 'BDNF_CpG5', 'BDNF_CpG6', 'BDNF_CpG7',
    'FKBP5_CpG1', 'FKBP5_CpG2',
    'Nr3c1_CpG1', 'Nr3c1_CpG2', 'Nr3c1_CpG3', 'Nr3c1_CpG4',
    'premature_birth',
]
df_stress = df_all[stress_genes]

# Smaller exploratory selections (these do not include the target column).
analysis_features = ['acs_total', 'FOXP3.7', 'BDNF_CpG1', 'BDNF_CpG4', 'BDNF_CpG5', 'BDNF_CpG6',
                     'OXTRmeth', 'total_score_gad_7']
df_acculturation = df_all[analysis_features]
experiment_features = ['bas_total', 'TNFa1', 'TNFa2', 'TNFa3', 'TNFa4', 'TNFa5', 'idas_gen_depression']
df_bas = df_all[experiment_features]

# Wider experiment selection; rebinds `experiment_features` and `df_bas`.
experiment_features = [
    'BDNF_CpG1', 'BDNF_CpG2', 'BDNF_CpG3', 'BDNF_CpG4', 'BDNF_CpG5', 'BDNF_CpG6', 'BDNF_CpG7',
    'FKBP5_CpG1', 'FKBP5_CpG2', 'Nr3c1_CpG1', 'Nr3c1_CpG2', 'Nr3c1_CpG3', 'Nr3c1_CpG4',
    'TNFaAv', 'FOXP3.av', 'OXTRmeth', 'acs_total', 'bas_total', 'total_score_phq',
    'eds_total', 'ehm_total', 'total_score_gad_7', 'epds_score',
    'idas_gen_depression', 'premature_birth', 'idas_dysphoria', 'idas_lassitude',
    'idas_suicidality', 'idas_appet_gain', 'idas_appet_loss', 'idas_well_being',
    'idas_ill_temper', 'idas_mania', 'idas_euphoria', 'idas_panic', 'idas_social_anxiety',
    'idas_claustrohobia', 'idas_traumatic_intrusions', 'idas_traumatic_avoidance',
    'idas_checking', 'idas_cleaning',
]
df_bas = df_all[experiment_features]

# Columns of the frame that is actually modelled: individual CpG sites, the
# averaged TNFa / FOXP3 columns, the questionnaire totals and the target.
df_bas_final = [
    'acs_total', 'bas_total',
    'BDNF_CpG1', 'BDNF_CpG2', 'BDNF_CpG3', 'BDNF_CpG4', 'BDNF_CpG5', 'BDNF_CpG6', 'BDNF_CpG7',
    'FKBP5_CpG1', 'FKBP5_CpG2', 'Nr3c1_CpG1', 'Nr3c1_CpG2', 'Nr3c1_CpG3', 'Nr3c1_CpG4',
    'TNFaAv', 'FOXP3.av', 'OXTRmeth', 'idas_gen_depression',
    'total_score_phq', 'eds_total', 'ehm_total', 'total_score_gad_7', 'epds_score',
    'premature_birth',
]

df_bas_a = df_bas[df_bas_final]

# astype() returns a new frame, so df_final is independent of df_bas.
df_final = df_bas_a.astype(float)


# Frames to model. Other candidates (df_all, df_genetics, df_stress,
# df_psychological) can be added to this list to compare feature families.
all_df = [df_final]
for frame in all_df:
    # Everything except the target column is a predictor.
    X = frame.drop(['premature_birth'], axis=1)
    y = frame['premature_birth']

    # Hold out 20% of the rows for the final evaluation, preserving the class
    # ratio. The split is seeded like the search below so reruns match.
    X_all, X_test_all, y_all, y_test_all = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42)

    # NOTE: random_grid contains a 'class_weight' entry, so the search
    # replaces the class_weight given here in every sampled configuration.
    base_forest = RandomForestClassifier(class_weight={0: 1, 1: 15}, random_state=42)

    # Randomized search: 100 sampled configurations, each scored with
    # 10-fold cross-validation; the best one is refit on the training split.
    rf_random = RandomizedSearchCV(estimator=base_forest, param_distributions=random_grid,
                                   n_iter=100, cv=10, verbose=2, random_state=42,
                                   n_jobs=-1, return_train_score=True, refit=True)
    rf_random.fit(X_all, y_all)
    print('Best Parameters')
    print(rf_random.best_params_)
    print(rf_random.best_score_)

    best_random = rf_random.best_estimator_

    # Evaluate the refit model on the held-out rows.
    pred = best_random.predict(X_test_all)
    print('Accuracy of the random forest on test set: {:.3f}'.format(
        best_random.score(X_test_all, y_test_all)))
    print(classification_report(y_test_all, pred))
    # ROC AUC ranks samples by score, so feed it the probability of the
    # positive class (classes_[1]) instead of the thresholded labels.
    positive_proba = best_random.predict_proba(X_test_all)[:, 1]
    print('ROC', roc_auc_score(y_test_all, positive_proba))

    # Impurity-based importances of the ten strongest predictors.
    feat_importances = pd.Series(best_random.feature_importances_, index=X.columns)
    feat_importances.nlargest(10).plot(kind='barh')
    plt.title('Model')
    plt.show()

    # Permutation importances measured on the held-out rows.
    result = permutation_importance(best_random, X_test_all, y_test_all, n_repeats=10,
                                    random_state=42, n_jobs=2)
    sorted_idx = result.importances_mean.argsort()

    fig, ax = plt.subplots()
    ax.boxplot(result.importances[sorted_idx].T,
               vert=False, labels=X_test_all.columns[sorted_idx])
    ax.set_title("Permutation Importances (test set)")
    fig.tight_layout()
    plt.show()

ImportError: cannot import name 'permutation_importance' from 'sklearn.inspection' (/Users/amishajindal28/.pyenv/versions/3.7.4/lib/python3.7/site-packages/sklearn/inspection/__init__.py)