In [None]:
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import Dense

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
from pprint import pprint

from sklearn.metrics import confusion_matrix, classification_report, f1_score, auc, roc_curve, roc_auc_score, precision_score, recall_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.cluster import KMeans

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from collections import Counter

from tensorflow.python.keras.utils import losses_utils
from IPython.display import clear_output

%matplotlib inline
print('Libraries Imported')

In [None]:
#helper function to load the dataset and separate features from response, and exclude certain columns
def load_dataset(filename, response_col, exclude_cols, which, perc):
    """Load the trajectory CSV and split it into features and a 0-based label.

    Parameters
    ----------
    filename : str or path-like
        CSV file using ``;`` as separator and ``,`` as decimal mark.
    response_col : str
        Name of the class-label column.
    exclude_cols : list of str
        Columns removed right after loading.
    which : int
        Class handling mode:
        0 -> keep only rows whose label is below 3;
        1 -> relabel class 1 as class 2;
        2 -> relabel classes 1 and 2 as class 3;
        anything else -> labels are left untouched.
    perc : float
        Percentage used for the per-column missing-value filter: a column is
        kept only if it has at least ``int((100 - perc) / 100 * n_rows + 1)``
        non-missing values.

    Returns
    -------
    data : pandas.DataFrame
        The cleaned frame (features and response) without missing values.
    X : pandas.DataFrame
        All columns of ``data`` except ``response_col`` (alphabetically sorted,
        as produced by ``Index.difference``).
    y : pandas.Series
        The response shifted so that the lowest remaining class is 0.
    """
    # load the dataset as a pandas DataFrame
    data = pd.read_csv(filename, sep=';', decimal=',', encoding='unicode_escape', engine='python')
    # exclude columns that need to be excluded
    data = data.drop(exclude_cols, axis=1)

    # Convert every non-numeric column to numeric (per the data description);
    # unparsable entries become NaN.  Whole-column assignment is used instead
    # of ``data.loc[:, cols] = ...`` because the latter writes in place on
    # recent pandas and would leave the columns with object dtype.
    to_convert = [col for col, dtype in data.dtypes.items()
                  if not pd.api.types.is_numeric_dtype(dtype)]
    if to_convert:
        data[to_convert] = data[to_convert].apply(pd.to_numeric, downcast='float', errors='coerce')

    if which == 0:
        # eliminate class 3 (and anything above)
        data = data[data[response_col] < 3]
    elif which == 1:
        # merge class 1 into class 2 (boolean mask: independent of the index labels)
        data.loc[data[response_col] == 1, response_col] = 2
    elif which == 2:
        # merge classes 1 and 2 into class 3
        data.loc[data[response_col].isin([1, 2]), response_col] = 3

    # Drop COLUMNS with too many missing values.  Only ``thresh`` is passed:
    # combining it with ``how`` is rejected by pandas >= 1.5, and ``thresh``
    # was the argument that took effect before.
    min_count = int(((100 - perc) / 100) * data.shape[0] + 1)
    data = data.dropna(axis=1, thresh=min_count)

    # Drop every remaining row that still contains a missing value.
    data = data.dropna()

    # split into input (X) and output (y) variables
    X = data[data.columns.difference([response_col])]

    # Shift the labels so the lowest remaining class becomes 0:
    #   which == 0 -> classes {1, 2} -> {0, 1}  (was "- 2", which gave {-1, 0})
    #   which == 1 -> classes {2, 3} -> {0, 1}  (0 = merged 1+2, 1 = class 3)
    #   which == 2 -> classes {3, 4} -> {0, 1}
    #   otherwise  -> original classes minus 1
    label_offset = {0: 1, 1: 2, 2: 3}.get(which, 1)
    y = data[response_col] - label_offset
    return data, X, y

In [None]:
# Class handling mode forwarded to load_dataset (see its `which` branches).
which = 1 # 0 for 1 vs 2, 1 for 3 vs rest, 2 for c3 (1+2+3 vs 4), else/3 for exact import
# Percentage forwarded to load_dataset, where it sets the per-column
# missing-value threshold.
perc = 40

# Earlier calls against the previous data file, kept for reference only.
# data, X, Y = load_dataset('/home/gunjan28/Documents/UEF/input/HTx_trajektoridata_030921.csv', "c1", 
#                            ["t2d_diag","c2","c3","t2d_vuosi", "aika_diag", "pkrea_mitt", "hba1c_mitt", 
#                             "ldl_mitt", "bmi_mitt", "ei_mitaan0","pkrea_luo","ldl_luo","pelkka_ins0","te_takk",
#                             'bmi',"viimeisin_tk","yleisin_tk","c1_cprob1","c1_cprob2", "c1_cprob3", "c2_cprob1",
#                             "c2_cprob2", "c2_cprob3", "c2_cprob4","ï»¿id","c3_prob1","c3_prob2","c3_prob3",
#                             "c3_prob4","D10A","G02B","H05A","J04B","J06B","L01A","M01C","N06B","N07X",
#                             "bmi_mitt_old"], which, perc)

# data, X, Y = load_dataset('/home/gunjan28/Documents/UEF/input/HTx_trajektoridata_030921.csv', "c1",
#                           ["c2","c3","ï»¿id",
#                            "t2d_diag","viimeisin_tk","yleisin_tk",
#                            "c1_cprob1","c1_cprob2", "c1_cprob3",
#                            "c2_cprob1","c2_cprob2", "c2_cprob3", "c2_cprob4",
#                            "c3_prob1","c3_prob2","c3_prob3","c3_prob4",
#                            "D10A","G02B","H05A","J04B","J06B","L01A","M01C","N06B","N07X","bmi_mitt_old"],
#                             which, perc)

# Active load: response column is "c4"; the listed columns are excluded
# before any further processing.
# NOTE(review): the path is an absolute, machine-specific location — the
# notebook will not run elsewhere without editing it.
# NOTE(review): "ï»¿ID" is presumably the ID header with a UTF-8 byte-order
# mark fused to it by the 'unicode_escape' decoding in load_dataset — TODO confirm.
data, X, Y = load_dataset('/home/gunjan28/Documents/UEF/input/HTx_trajektoridata_updated_131021.csv', "c4",
                          ["c5","ï»¿ID","c4_p1","c4_p2","c4_p3",
                           "c5_p1","c5_p2","c5_p3","c5_p4",
                           "D10A","G02B","H05A","J04B","J06B","L01A","M01C","N06B","N07X"],
                            which, perc)

# "c4_p1","c4_p2","c4_p3",

# Class balance of the shifted labels, then the feature frame as cell output.
print(Counter(Y))
X

In [None]:
# probabilities = data.loc[:,['c4_p1','c4_p2','c4_p3']]
# probabilities

In [None]:
# X.isna().sum().reset_index(name = 'n').plot.bar(x = 'index', y = 'n', rot = 45)

In [None]:
# # Correlation
# X = pd.DataFrame(X)
# # corr = X.corr(method ='pearson')

# # corr = X.corr(method ='pearson').abs()
# corr = X.corr(method ='spearman').abs()
# avg_corr = corr.mean(axis = 1)

In [None]:
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# print(pd.DataFrame(corr.loc['viim_hba1c_bl']))
# pd.set_option("display.max_rows", 11, "display.max_columns", 20)

In [None]:
# f = plt.figure(figsize=(19, 15))
# plt.matshow(corr, fignum=f.number)
# cb = plt.colorbar()
# cb.ax.tick_params(labelsize=14)
# plt.title('Correlation Matrix', fontsize=16);

In [None]:
# upper_tri = corr.where(np.triu(np.ones(corr.shape),k=1).astype(np.bool))
# # print(upper_tri)
# to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.4)] #8
# print(to_drop)
# print(len(to_drop))
# # X = X.drop(X[to_drop], axis=1)
# data = data.drop(data[to_drop], axis=1)

In [None]:
def corrX_new(df, cut = 0.4):
    """Return the names of columns to drop so that no retained pair of
    columns has an absolute correlation above ``cut``.

    For every column pair whose absolute correlation (``df.corr()``) exceeds
    ``cut``, the column with the larger mean absolute correlation to all
    columns is nominated for removal; ``calcDrop`` then resolves the
    nominations into the final list.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; correlations are computed with ``df.corr()``.
    cut : float, default 0.4
        Absolute-correlation cutoff.

    Returns
    -------
    list
        Column names to drop (empty when no pair exceeds ``cut``).
    """
    corr_mtx = df.corr().abs()
    avg_corr = corr_mtx.mean(axis = 1)
    names = corr_mtx.columns
    n_vars = len(names)

    # Collect one record per correlated pair and build the frame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    pair_records = []
    for row in range(n_vars - 1):
        # Only the upper triangle (col > row) is visited, so each pair is
        # seen exactly once and the diagonal is skipped.
        for col in range(row + 1, n_vars):
            pair_corr = corr_mtx.iloc[row, col]
            if pair_corr > cut:
                # Explicit positional access: avg_corr is indexed by column name.
                row_avg = avg_corr.iloc[row]
                col_avg = avg_corr.iloc[col]
                # Nominate the variable that is, on average, more correlated
                # with everything else (ties nominate the second variable).
                nominee = names[row] if row_avg > col_avg else names[col]
                pair_records.append({'v1': names[row],
                                     'v2': names[col],
                                     'v1.target': row_avg,
                                     'v2.target': col_avg,
                                     'corr': pair_corr,
                                     'drop': nominee})

    res = pd.DataFrame(pair_records,
                       columns=['v1', 'v2', 'v1.target', 'v2.target', 'corr', 'drop'])

    return calcDrop(res)

def calcDrop(res):
    """Resolve per-pair drop nominations into a final list of columns.

    Parameters
    ----------
    res : pandas.DataFrame
        One row per correlated pair with at least the columns ``v1``, ``v2``
        (the pair) and ``drop`` (the nominated member of the pair).

    Returns
    -------
    list
        Names to drop; empty when ``res`` has no rows.
    """
    # All variables with correlation > cutoff
    all_corr_vars = set(res['v1'].tolist() + res['v2'].tolist())

    # All unique variables nominated in the drop column
    poss_drop = set(res['drop'].tolist())

    # Keep any variable that was never nominated
    keep = all_corr_vars - poss_drop

    # Drop every variable that shares a row with a kept variable
    paired_with_keep = res[res['v1'].isin(keep) | res['v2'].isin(keep)][['v1', 'v2']]
    partners = set(paired_with_keep['v1'].tolist() + paired_with_keep['v2'].tolist())
    drop = list(partners - keep)

    # Nominees not yet decided
    poss_drop = poss_drop - set(drop)

    # Pairs that involve an undecided nominee
    undecided = res[res['v1'].isin(poss_drop) | res['v2'].isin(poss_drop)][['v1', 'v2', 'drop']]

    # Ignore pairs already broken by a decided drop; the nominee of every
    # remaining pair is dropped as well.
    still_correlated = undecided[~undecided['v1'].isin(drop) & ~undecided['v2'].isin(drop)]
    drop.extend(set(still_correlated['drop']))

    return drop

In [None]:
# Remove features that are correlated above the cutoff with a retained one.
# Note: this cell rebinds X, so re-running it filters the already filtered frame.
drop_new = corrX_new(X, cut=0.4)
print(len(drop_new), drop_new, sep='\n')
X = X.drop(columns=drop_new)
X

In [None]:
# feature selection (MI) Try: Sequential forward/floating/backward selection
def select_features(select, feature, NN = True):
    """Keep the ``feature`` best columns ranked by mutual information.

    The selector is fitted on ``select[0]`` (training features) with
    ``select[1]`` (training labels) and then applied to ``select[2]`` and,
    when ``NN`` is true, to ``select[3]`` as well.

    Returns
    -------
    selected : dict
        ``{0: transformed select[0], 1: transformed select[2]}`` plus
        ``2: transformed select[3]`` when ``NN`` is true.
    fs : SelectKBest
        The fitted selector.
    idx : array
        Result of ``fs.get_support(indices=True)``.
    """
    fs = SelectKBest(score_func=mutual_info_classif, k=feature)
    selected = {
        0: fs.fit_transform(select[0], select[1]),
        1: fs.transform(select[2]),
    }
    if NN == True:
        selected[2] = fs.transform(select[3])

    return selected, fs, fs.get_support(indices=True)

def create_custom_model(input_dim, output_dim, nodes, n=1, name='model', loss=None):
    """Return a zero-argument factory that builds and compiles a dense network.

    Parameters
    ----------
    input_dim : int
        Number of input features (given to the first hidden layer).
    output_dim : int
        Number of units in the sigmoid output layer.
    nodes : int or float
        Width of every hidden layer; coerced to ``int`` because the notebook
        computes it with true division (``1024/4`` is a float).
    n : int, default 1
        Number of hidden layers.
    name : str, default 'model'
        Name given to the ``Sequential`` model.
    loss : optional
        Loss passed to ``compile``.  When omitted, the notebook-level
        ``loss_fn`` is looked up at build time (the original behaviour), so
        ``loss_fn`` must be defined before the factory is called.

    Returns
    -------
    callable
        ``create_model()`` -> compiled Keras model.
    """
    units = int(nodes)

    def create_model():
        # Create model
        model = Sequential(name=name)
        for layer_no in range(n):
            if layer_no == 0:
                # Only the first layer needs to know the input width.
                model.add(Dense(units, input_dim=input_dim, activation='sigmoid'))
            else:
                model.add(Dense(units, activation='sigmoid'))
        model.add(Dense(output_dim, activation='sigmoid'))

        # Compile model
        model.compile(loss=loss if loss is not None else loss_fn,
                      optimizer='SGD', # 'RMSprop', #'adam',
                      metrics=['accuracy'])
        return model
    return create_model

def classification_repo(true, pred):
    """Print a classification report for each model's predictions.

    ``pred[k]`` holds the predicted labels of the k-th model; the number of
    models and the layer count shown in each heading come from the
    notebook-level ``layer_1_1`` / ``layer_1_2`` settings.
    """
    n_models = layer_1_2 - layer_1_1
    for offset in range(n_models):
        depth = layer_1_1 + offset
        print("Classification report for model with {} hidden layers".format(depth))
        print(classification_report(true, pred[offset]))
        
#confusion matrix
def conf_mat(true,pred):
    """Draw one annotated confusion-matrix heatmap per model.

    ``pred[k]`` holds the predicted labels of the k-th model; the models are
    enumerated via the notebook-level ``layer_1_1`` / ``layer_1_2`` settings.
    """
    for position, depth in enumerate(range(layer_1_1, layer_1_2)):
        print("Confussion matrix for model with {} hidden layers".format(depth))
        matrix_frame = pd.DataFrame(confusion_matrix(true, pred[position]))
        # A fresh figure per model keeps the heatmaps from overlapping.
        plt.figure()
        sns.heatmap(matrix_frame, annot=True)
        plt.show()

In [None]:
# Experiment settings shared by the cells below.
shape = np.shape(X)     # (n_samples, n_features) after the correlation filter
feature = 50            # number of features kept by SelectKBest (use shape[1] to keep all)
layer_1_1 = 1           # smallest number of hidden layers to try (inclusive)
layer_1_2 = 2           # largest number of hidden layers to try (exclusive)
bs = 16                 # Keras batch size
epochs = 100            # Keras training epochs
nodes = 1024 // 4       # hidden-layer width; integer division, layer sizes must be ints
sample = 0
n_classes = 2           # number of target classes

In [None]:
# Seed NumPy's global generator and TensorFlow so repeated runs are comparable.
from numpy.random import seed
seed(67)
import tensorflow as tf
tf.random.set_seed(38)
from sklearn.impute import SimpleImputer, KNNImputer

# Save original data set
# (this binds a second name to the same DataFrame object; it is not a copy)
original = X

# data = data[data.columns.difference(['c4'])]

# Split into training and testing sets
# Stratified 75/25 split.  No random_state is passed, so the shuffle
# presumably depends on the global NumPy seed set above — TODO confirm.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, stratify=Y)

# prob_test = X_test[["c4_p1","c4_p2","c4_p3"]]
# X_test = X_test[X_test.columns.difference(["c4_p1","c4_p2","c4_p3"])]
# X_train = X_train[X_train.columns.difference(["c4_p1","c4_p2","c4_p3"])]

In [None]:
# Imputation
# Class-wise imputation of the training set, scaling and feature selection.
# Note: load_dataset already drops every row that contains a missing value,
# so with the current loader the imputation / dropna steps below do not
# change any values; they matter only if that row filter is relaxed.

# Re-attach the labels so the training rows can be grouped by class.
Train = X_train.copy()
Train.loc[:,'c4'] = Y_train

# One frame per class label.  The loop variable used to be called `data`,
# which silently overwrote the DataFrame returned by load_dataset.
by_class = Train.groupby('c4')
datasets = {label: class_rows for label, class_rows in by_class}

X_train_0 = datasets[0][datasets[0].columns.difference(['c4'])]
Y_train_0 = datasets[0].loc[:,'c4'].copy()
X_train_1 = datasets[1][datasets[1].columns.difference(['c4'])]
Y_train_1 = datasets[1].loc[:,'c4'].copy()

# Mean imputation fitted separately on each class.
imp_0 = SimpleImputer(strategy = "mean")
X_train_0 = imp_0.fit_transform(X_train_0)

imp_1 = SimpleImputer(strategy = "mean")
X_train_1 = imp_1.fit_transform(X_train_1)

# Stack the classes back together (class 0 rows first); X and Y stay aligned
# because both are concatenated in the same order.
X_train = np.concatenate((X_train_0,X_train_1))
Y_train = np.concatenate((Y_train_0,Y_train_1))

# Remove NA from test
Test = X_test.copy()
Test.loc[:,'c4'] = Y_test
Test = Test.dropna()
X_test = Test[Test.columns.difference(['c4'])]
Y_test = Test.loc[:,'c4'].copy()
print('Samples in test set after seperated select:',Counter(Y_test))

# Normalize numeric features (scaler fitted on the training set only)
scaler = StandardScaler()
# scaler = MinMaxScaler()
select = {}
select[0] = pd.DataFrame(scaler.fit_transform(X_train))
select[1] = Y_train
select[2] = pd.DataFrame(scaler.transform(X_test))

# Feature selection
selected, fs, idx = select_features(select, feature, NN = False)
X_train = pd.DataFrame(selected[0])
X_test = pd.DataFrame(selected[1])

# Get columns to keep and create new dataframe with those only
# (positions refer to the alphabetically sorted feature columns; assumes
# `original` uses the same column order — TODO confirm)
cols = fs.get_support(indices=True)
features_df_new = original.iloc[:,cols]
pprint(features_df_new.columns)

In [None]:
#Xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Gradient-boosted tree baseline on the selected features.
# The multi-class objective is used with num_class = n_classes.
model=XGBClassifier(use_label_encoder=False, eta = 0.1,#eta between(0.01-0.2)
        max_depth = 9, #values between(3-10)
        max_delta_step = 1,
        subsample = 0.7,#values between(0.5-1)
        colsample_bytree = 0.7,#values between(0.5-1)
        tree_method = "auto",
        #scale_pos_weight = unbalanced,
        process_type = "default",
        num_parallel_tree=3,
        objective='multi:softmax',
        min_child_weight = 3,
        booster='gbtree',
#         deterministic_histogram = False,
#         sample_type = "uniform",
        eval_metric = "mlogloss",
        num_class = n_classes)
model.fit(X_train,Y_train)

# Accuracy on the held-out test set, printed as a percentage.
xgb_pred=model.predict(X_test)
print(accuracy_score(Y_test, xgb_pred)*100)

# Confusion matrix heatmap and per-class precision / recall / F1.
confusion_matrix_xgb = pd.DataFrame(confusion_matrix(Y_test, xgb_pred))
sns.heatmap(confusion_matrix_xgb, annot=True)
print(classification_report(Y_test, xgb_pred))

In [None]:
# One-hot encode the labels for the two-unit sigmoid output layer.
# The labels are converted to NumPy column vectors explicitly: Y_test is a
# pandas Series, and `Series[:, np.newaxis]` is no longer supported by pandas.
enc = OneHotEncoder()
y_train = enc.fit_transform(np.asarray(Y_train).reshape(-1, 1)).toarray()
y_test = enc.transform(np.asarray(Y_test).reshape(-1, 1)).toarray()

# Read as a global by the factories returned from create_custom_model,
# so it must be defined before any of them is called.
loss_fn = tf.keras.losses.BinaryCrossentropy()

n_features = X_train.shape[1]

# Create model
# One factory per hidden-layer count in [layer_1_1, layer_1_2).
models = [create_custom_model(n_features, n_classes, nodes, n, 'model_{}'.format(n)) 
          for n in range(layer_1_1, layer_1_2)]

# for create_model in models:
#     create_model().summary()


# Train and save
# Maps model name -> [training History, trained model].
history_dict = {}

# TensorBoard Callback
cb = TensorBoard()

for create_model in models:
    model = create_model()
    history_callback = model.fit(X_train, y_train,
                                 batch_size=bs,
                                 epochs=epochs,
                                 verbose=0,
                                 callbacks=[cb])
    # verbose=2 prints the test loss / accuracy for this model.
    score = model.evaluate(X_test, y_test, verbose=2)

    history_dict[model.name] = [history_callback, model]

In [None]:
# Collect the raw network outputs of every trained model.
pred = []
for model_name, entry in history_dict.items():
    model = entry[1]
    Y_pred = model.predict(X_test)
    fpr, tpr, threshold = roc_curve(y_test.ravel(), Y_pred.ravel())
    pred.append(Y_pred)

# Predicted class = index of the largest output unit.
x = [pred[o].argmax(axis=-1) for o in range(layer_1_2-layer_1_1)]

# Undo the one-hot encoding to recover the true class labels.
true = enc.inverse_transform(y_test)

classification_repo(true, x)
conf_mat(true, x)

In [None]:
# ROC curve of every trained network against the chance diagonal.
plt.figure(figsize=(10, 10))
plt.plot([0, 1], [0, 1], 'k--')

pred = []
for model_name, entry in history_dict.items():
    model = entry[1]
    Y_pred = model.predict(X_test)
    # Both arrays are flattened, so the curve pools all output units.
    fpr, tpr, threshold = roc_curve(y_test.ravel(), Y_pred.ravel())
    pred.append(Y_pred)
    curve_label = '{}, AUC = {:.3f}'.format(model_name, auc(fpr, tpr))
    plt.plot(fpr, tpr, label=curve_label)

plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend();

# pred = np.array(pred)
# print(pred[0].shape)

In [None]:
# Which classical classifier the grid search below tunes.
classify = 'LDA' # 'KNN' or 'LDA'
# Candidate scoring names for GridSearchCV; the first one is the active choice.
scorings = ['balanced_accuracy', 'roc_auc_ovr', 'roc_auc_ovo', 'accuracy', 'recall']
scoring = scorings[0]

In [None]:
# select classifier
# Anything other than 'KNN' falls back to linear discriminant analysis.
clf = KNeighborsClassifier() if classify == 'KNN' else LinearDiscriminantAnalysis()

In [None]:
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, stratify=Y)

# X_test_full = X_test

# # Remove index
# X_train = X_train.drop('index', axis=1)
# X_test = X_test.drop('index', axis=1)

# # Normalize numeric features
# scaler = StandardScaler()
# # scaler = MinMaxScaler()
# select = {}
# select[0] = pd.DataFrame(scaler.fit_transform(X_train))
# select[1] = pd.DataFrame(scaler.transform(X_test))
# select[2] = Y_train

# # Feature selection
# select, fs, idx = select_features(select, feature, NN = False)
# X_train = select[0]
# X_test = select[1]


# Hyper-parameter grid for the chosen classifier.
if classify == 'KNN':
    # Neighbour counts from 1 up to (but excluding) the number of selected features.
    k_range = list(range(1, feature))
    weight_options = ["uniform", "distance"]
    grid_params = {'n_neighbors': k_range, 'weights': weight_options}
    cv = 5
else:
    # LDA: only the solver is tuned.
#     cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    cv = 5
    grid_params = {'solver': ['svd', 'lsqr', 'eigen']}

# Cross-validated search, then a separate 5-fold score of the winner.
grid = GridSearchCV(clf, grid_params, cv = cv, scoring = scoring, verbose = 1, n_jobs = -1)
grid_results = grid.fit(X_train, Y_train)
cv_scores = cross_val_score(grid_results.best_estimator_, X_train, Y_train, cv=5)

# Refit the best estimator on the full training set and predict test labels.
# (`probs` holds predicted class labels here, not probabilities.)
clf = grid_results.best_estimator_.fit(X_train, Y_train)
probs = clf.predict(X_test)

print(classification_report(Y_test, probs, zero_division = 0))
cnf_matrix = pd.DataFrame(confusion_matrix(Y_test, probs))
sns.heatmap(cnf_matrix, annot=True)
plt.show()

# Final models

In [None]:
# from numpy.random import seed
# seed(67)
# import tensorflow as tf
# tf.random.set_seed(38)
# from sklearn.impute import SimpleImputer, KNNImputer

# feature = 30
# n_classes = 2

# # Save original data set
# original = X

# # Split into training and testing sets
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, stratify = Y, random_state = 50)
# X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.15, stratify = Y_train)

In [None]:
# # Imputation
# Train = X_train.copy()
# Train.loc[:,'c4'] = Y_train

# datasets= {}
# by_class = Train.groupby('c4')
# for groups, data in by_class:
#     datasets[groups] = data

# X_train_0 = datasets[0][datasets[0].columns.difference(['c4'])]
# Y_train_0 = datasets[0].loc[:,'c4'].copy()
# X_train_1 = datasets[1][datasets[1].columns.difference(['c4'])]
# Y_train_1 = datasets[1].loc[:,'c4'].copy()

# imp_0 = SimpleImputer(strategy = "most_frequent")
# X_train_0 = imp_0.fit_transform(X_train_0)

# imp_1 = SimpleImputer(strategy = "most_frequent")
# X_train_1 = imp_1.fit_transform(X_train_1)

# X_train = np.concatenate((X_train_0,X_train_1))
# Y_train = np.concatenate((Y_train_0,Y_train_1))

# # Remove NA from test
# Test = X_test.copy()
# Test.loc[:,'c4'] = Y_test
# Test = Test.dropna()
# X_test = Test[Test.columns.difference(['c4'])]
# Y_test = Test.loc[:,'c4'].copy()
# print('Samples in test set after seperated select:',Counter(Y_test))

# # Normalize numeric features
# scaler = StandardScaler()
# # scaler = MinMaxScaler()
# select = {}
# select[0] = pd.DataFrame(scaler.fit_transform(X_train))
# select[1] = Y_train
# select[2] = pd.DataFrame(scaler.transform(X_test))
# select[3] = pd.DataFrame(scaler.transform(X_val))

# # Feature selection
# select, fs, idx = select_features(select, feature, NN = True)
# X_train = pd.DataFrame(select[0])
# X_test = pd.DataFrame(select[1])
# X_val = pd.DataFrame(select[2])

# # Get columns to keep and create new dataframe with those only
# cols = fs.get_support(indices=True)
# features_df_new = original.iloc[:,cols]
# pprint(features_df_new.columns)

In [None]:
# Recover the names of the selected features from the fitted selector.
features_df_new = original.iloc[:, fs.get_support(indices=True)]
# pprint(features_df_new.columns)
myfeatures = features_df_new.columns

# Give the train / test matrices the real feature names as column labels.
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
X_train.columns = myfeatures
X_test.columns = myfeatures
# X_val = pd.DataFrame(X_val)
# X_val.columns = myfeatures

# One numeric tf feature column per selected feature.
feature_columns = []
for feature_name in X_train.columns:
    feature_columns += [tf.feature_column.numeric_column(feature_name, dtype=tf.float64)]

def make_input_fn(data_df, label_df, num_epochs=100, shuffle=False, batch_size=8):
    """Build a zero-argument input function for a tf.estimator.

    The returned function creates a ``tf.data.Dataset`` from
    ``(dict(data_df), label_df)``, optionally shuffles it with a buffer of
    1000, batches it by ``batch_size`` and repeats the batched dataset
    ``num_epochs`` times.
    """
    def input_function():
        dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        dataset = dataset.shuffle(1000) if shuffle else dataset
        return dataset.batch(batch_size).repeat(num_epochs)
    return input_function
# Training input uses make_input_fn's defaults (100 repeats, no shuffling,
# batches of 8); the test input is a single unshuffled pass.
train_input_fn = make_input_fn(X_train, Y_train)
# eval_input_fn = make_input_fn(X_val, Y_val)
test_input_fn = make_input_fn(X_test, Y_test, num_epochs=1, shuffle=False)

# optimizer = 'Ftrl' SGD
# Linear model over the numeric feature columns, trained with SGD.
classifier = tf.estimator.LinearClassifier(
    feature_columns, model_dir=None, n_classes=n_classes, weight_column=None,
    label_vocabulary=None, optimizer='SGD', config=None,
    warm_start_from=None,
    loss_reduction=losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE,
    sparse_combiner='sum'
)

In [None]:
# Train the linear estimator on the whole training input (no validation pass).
classifier.train(train_input_fn)
# validation = classifier.evaluate(eval_input_fn)

In [None]:
# validation

In [None]:
# Per-sample prediction dicts for the test set; consumed once by the
# comprehension below (presumably a lazy generator — TODO confirm).
pred_dicts = classifier.predict(test_input_fn)

# probs = pd.Series([pred['probabilities'] for pred in pred_dicts])
# Keep only the probability at index 1, i.e. the score for class label 1.
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])
# Y_pred = np.round(probs)

In [None]:
# Turn class-1 probabilities into hard labels with a 0.5 cutoff.
# (cutoffs tried earlier: 0.30 with 50 features, 0.27)
Y_pred = [int(prob > 0.5) for prob in probs]

# Predicted vs. actual class balance.
print(Counter(Y_pred))
print(Counter(Y_test))

print(classification_report(Y_test,Y_pred))

# Confusion matrix as raw counts and as an annotated heatmap.
confusion_matrix_df = confusion_matrix(Y_test, Y_pred)
print(confusion_matrix_df)
cnf_matrix = pd.DataFrame(confusion_matrix_df)
sns.heatmap(cnf_matrix, annot=True)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Precision under the three multi-class averaging schemes.
for averaging in ('macro', 'micro', 'weighted'):
    print(precision_score(Y_test, Y_pred, average=averaging))

# Scores for the positive class (label 1).
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy: %f' % accuracy)
# precision: tp / (tp + fp)
precision = precision_score(Y_test, Y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(Y_test, Y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y_test, Y_pred)
print('F1 score: %f' % f1)

In [None]:
# ROC curve of the linear estimator against the chance diagonal.
plt.figure(figsize=(10, 10))
plt.plot([0, 1], [0, 1], 'k--')

# Y_test and probs are pandas Series; convert them with np.asarray instead of
# Series.ravel(), which is deprecated in recent pandas releases.
fpr, tpr, threshold = roc_curve(np.asarray(Y_test), np.asarray(probs))

plt.plot(fpr, tpr, label='{}, AUC = {:.3f}'.format('LinearClassifier', auc(fpr, tpr)))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend();

In [None]:
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

# Bare ROC curve of the same predictions, with both axes starting at 0.
fpr, tpr, _ = roc_curve(Y_test, probs)
plt.plot(fpr, tpr)
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.title('ROC curve')
plt.xlim(0)
plt.ylim(0)

In [None]:
# estimator =tf.estimator.DNNLinearCombinedClassifier(
#     model_dir=None, linear_feature_columns=feature_columns, linear_optimizer='SGD',
#     dnn_feature_columns=None, dnn_optimizer='Adagrad',
#     dnn_hidden_units=None, dnn_activation_fn=tf.nn.relu, dnn_dropout=None,
#     n_classes=2, weight_column=None, label_vocabulary=None, config=None,
#     warm_start_from=None,
#     loss_reduction=tf.compat.v2.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
#     batch_norm=False, linear_sparse_combiner='sum'
# ) sparse_softmax_cross_entropy_with_logits

# Fully connected estimator with three ReLU hidden layers (1024-512-256),
# trained with SGD on the same feature columns and input function as the
# linear model.
estimator = tf.estimator.DNNClassifier(
    hidden_units = [1024, 512, 256], feature_columns = feature_columns, 
    model_dir=None, n_classes=n_classes, weight_column=None,
    label_vocabulary=None, optimizer='SGD', activation_fn=tf.nn.relu,
    dropout=None, config=None, warm_start_from=None,
    loss_reduction=tf.compat.v2.keras.losses.Reduction.SUM_OVER_BATCH_SIZE,
    batch_norm=False
)

# Train on the whole training input (no validation pass).
estimator.train(input_fn=train_input_fn)
# validation = estimator.evaluate(input_fn=eval_input_fn)

In [None]:
# validation

In [None]:
# Class-1 probabilities of the DNN on the test set.
# Note: this rebinds `probs`, replacing the linear model's probabilities.
predictions = estimator.predict(input_fn=test_input_fn)
probs = pd.Series([pred['probabilities'][1] for pred in predictions])
# probs = np.array(probs)
# Y_pred = np.round(probs)

In [None]:
# Hard labels from the DNN's class-1 probabilities (0.5 cutoff).
# (cutoffs tried earlier: 0.30 with 50 features, 0.27)
Y_pred = [1 if prob > 0.5 else 0 for prob in probs]

# Predicted vs. actual class balance.
print(Counter(Y_pred))
print(Counter(Y_test))

print(classification_report(Y_test,Y_pred))

# Confusion matrix as raw counts and as an annotated heatmap.
confusion_matrix_df = confusion_matrix(Y_test, Y_pred)
print(confusion_matrix_df)
cnf_matrix = pd.DataFrame(confusion_matrix_df)
sns.heatmap(cnf_matrix, annot=True)
plt.show()

In [None]:
# ROC curve of the DNN estimator against the chance diagonal.
plt.figure(figsize=(10, 10))
plt.plot([0, 1], [0, 1], 'k--')

# Y_test and probs are pandas Series; convert them with np.asarray instead of
# Series.ravel(), which is deprecated in recent pandas releases.
fpr, tpr, threshold = roc_curve(np.asarray(Y_test), np.asarray(probs))

plt.plot(fpr, tpr, label='{}, AUC = {:.3f}'.format('DNN', auc(fpr, tpr)))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend();

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Precision under the three multi-class averaging schemes.
for averaging in ('macro', 'micro', 'weighted'):
    print(precision_score(Y_test, Y_pred, average=averaging))

# Scores for the positive class (label 1).
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy: %f' % accuracy)
# precision: tp / (tp + fp)
precision = precision_score(Y_test, Y_pred)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(Y_test, Y_pred)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y_test, Y_pred)
print('F1 score: %f' % f1)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline on the selected features.
# Note: this rebinds `clf` and `Y_pred` from the earlier cells.
clf = GaussianNB()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print(classification_report(Y_test,Y_pred))
# Raw confusion-matrix counts, shown as the cell output.
confusion_matrix_df = confusion_matrix(Y_test, Y_pred)
confusion_matrix_df

In [None]:
# wrongs_1 = np.load('wrongs_1.npy', allow_pickle = 'TRUE').item()
# wrongs_2 = np.load('wrongs_2.npy', allow_pickle = 'TRUE').item()
# one = wrongs_1.copy()
# two = dict(list(wrongs_2.items())[0:160])

# full_1 = []
# for i in range(len(one)):
# #     print(one[i])
#     full_1 = set(full_1).union(set(one[i]))
# # print(len(full_1))
# # print(full_1)

# full_2 = []
# for i in range(len(two)):
#     full_2 = set(full_2).union(set(two[i]))
# # print(len(full_2))
# # print(full_2)

# common = set(full_1).intersection(set(full_2))
# print(len(common))

# subset_c = data.drop(common, axis = 0)
# subset_w = data.loc[common]
# index_c1 = subset_w.index

# print('All', Counter(data['c1']))
# print('Correct', Counter(subset_c['c1']))
# print('Incorrect', Counter(subset_w['c1']))