In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

In [None]:
# Directory that holds `classification.xlsx`. It can be overridden with the
# SPEECH_DATA_DIR environment variable so the notebook is not tied to one machine;
# the original hard-coded location remains the default.
DATA_DIR = os.environ.get('SPEECH_DATA_DIR', '/Users/andrei-macpro/Documents/Data/Classification/speech')
os.chdir(DATA_DIR)

In [None]:
# Load the spreadsheet into a DataFrame; the relative path is resolved against the current working directory.
data = pd.read_excel('classification.xlsx', engine='openpyxl')

In [None]:
# Feature matrix: the columns at positions 1 through 11 of the sheet, as a NumPy array.
X = data.iloc[:, 1:12].to_numpy()
# Binary target built from the last column: 'no_rad' becomes 0, every other value becomes 1.
y = np.array([int(value != 'no_rad') for value in data.iloc[:, -1]])

In [None]:
# Encode the last column as 0 ('no_rad') / 1 (any other value).
# NOTE(review): identical to the assignment of `y` in the previous cell; this cell appears redundant.
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]])

In [None]:
# Subject identifier of every row; used further down as the grouping key for GroupKFold.
groups = np.array(data['Subject_ID'])

In [None]:
# Fit a StandardScaler on the complete feature matrix and print the fitted estimator.
# NOTE(review): the scaler sees every row before the cross-validation split is made,
# so statistics from rows that later serve as test folds are part of the scaling.
scaler = StandardScaler()
print(scaler.fit(X))

In [None]:
# Show the per-feature means stored on the fitted scaler.
print(scaler.mean_)

In [None]:
# Refit the scaler and replace X with its standardised version.
# NOTE(review): X is overwritten, so the raw features are no longer available after this
# cell; the y argument is presumably ignored by StandardScaler — confirm.
X = scaler.fit_transform(X,y)

In [None]:
# Shuffle features, labels and subject IDs together (fixed seed), then prepare a
# 6-fold GroupKFold splitter keyed on the subject IDs.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=1)
group_k_fold = GroupKFold(n_splits=6)
# `splits` is iterated once by the next cell; later cells call group_k_fold.split(...) again.
splits = group_k_fold.split(X_shuffled, y_shuffled, groups_shuffled)

In [None]:
# Inspect every fold: print the train/test row indices and the corresponding arrays.
# The X_train/X_test/y_train/y_test names keep the values of the last fold after the loop.
for train_index, test_index in splits:
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X_shuffled[train_index], X_shuffled[test_index]
    y_train, y_test = y_shuffled[train_index], y_shuffled[test_index]
    print(X_train, X_test, y_train, y_test)

In [None]:
# Class balance per test fold: print the (unique labels, counts) pair of each GroupKFold split.
for train_index, test_index in group_k_fold.split(X_shuffled, y_shuffled, groups_shuffled):
    fold_labels = y_shuffled[test_index]
    print(np.unique(fold_labels, return_counts=True))

In [None]:
# Classifier used by all following cells: a support vector machine with a linear kernel.
model = SVC(kernel='linear')


In [None]:
# Group-aware cross-validated accuracy of `model`; report mean and standard deviation over the folds.
scores = cross_val_score(
    model,
    X_shuffled,
    y_shuffled,
    groups=groups_shuffled,
    scoring='accuracy',
    cv=group_k_fold,
    n_jobs=-1,
)
accuracy_summary = (mean(scores), std(scores))
print('Accuracy: %.3f (%.3f)' % accuracy_summary)

In [None]:
# Ground-truth labels of every test fold, in the order GroupKFold yields the folds.
labels = [
    y_shuffled[test_index]
    for _, test_index in group_k_fold.split(X_shuffled, y=y_shuffled, groups=groups_shuffled)
]

In [None]:
# Refit the classifier on each training fold, keeping its test-fold predictions
# and the coefficients of the fitted linear model.
predictions = []
coefs = []
fold_splits = group_k_fold.split(X_shuffled, y=y_shuffled, groups=groups_shuffled)
for train_index, test_index in fold_splits:
    model.fit(X_shuffled[train_index], y_shuffled[train_index])
    predictions.append(model.predict(X_shuffled[test_index]))
    coefs.append(model.coef_)

In [None]:
# Stack the per-fold coefficient arrays into one ndarray (rebinds `coefs` from a list to an array).
coefs = np.array(coefs)

In [None]:
# Average the coefficients over axis 0, i.e. over the folds they were collected from.
avg_coefs = coefs.mean(axis=0)

In [None]:
# Display the shape of the stacked coefficient array.
coefs.shape

In [None]:
# Count the correctly classified recordings over all test folds.
# A generator expression is used so that no loop variable leaks into the notebook
# namespace: the original `for x,y in zip(...)` rebound the module-level label
# array `y` to the labels of the last fold.
accuracy = sum(
    int(prediction) == int(label)
    for fold_predictions, fold_labels in zip(predictions, labels)
    for prediction, label in zip(fold_predictions, fold_labels)
)
accuracy

In [None]:
# Number of rows in the feature matrix.
len(X)

In [None]:
# Overall accuracy in percent: correct predictions divided by the number of rows.
accuracy/len(X)*100

In [None]:
# Group-aware cross-validated recall of `model`; report mean and standard deviation over the folds.
scores_recall = cross_val_score(
    model,
    X_shuffled,
    y_shuffled,
    groups=groups_shuffled,
    scoring='recall',
    cv=group_k_fold,
    n_jobs=-1,
)
recall_summary = (mean(scores_recall), std(scores_recall))
print('recall: %.3f (%.3f)' % recall_summary)

In [None]:
# Group-aware cross-validated precision of `model`; report mean and standard deviation over the folds.
scores_precision = cross_val_score(
    model,
    X_shuffled,
    y_shuffled,
    groups=groups_shuffled,
    scoring='precision',
    cv=group_k_fold,
    n_jobs=-1,
)
precision_summary = (mean(scores_precision), std(scores_precision))
print('precision: %.3f (%.3f)' % precision_summary)

In [None]:
def f_importances(coef, names):
    """Plot feature importances as a horizontal bar chart, sorted in ascending order.

    Parameters
    ----------
    coef : array-like
        One importance value per feature. The input is flattened, so a
        ``(1, n_features)`` coefficient matrix can be passed directly.
    names : sequence
        Feature names, in the same order as the flattened ``coef``.

    Raises
    ------
    ValueError
        If the number of values differs from the number of names.
    """
    values = np.ravel(coef).tolist()
    names = list(names)
    if len(values) != len(names):
        raise ValueError(
            "coef has %d values but %d names were given" % (len(values), len(names))
        )
    if not values:
        # Nothing to draw; unpacking the sorted pairs below would fail on empty input.
        return
    # Sort the (value, name) pairs so the bars are ordered by importance.
    imp, names = zip(*sorted(zip(values, names)))
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.show()

In [None]:
# Column labels of the feature block, wrapped in a one-element list; later cells read them as features_names[0].
features_names = [data.iloc[:,1:12].columns]

In [None]:
# Horizontal bar chart of the absolute values of the fold-averaged coefficients,
# indexed by feature name and limited to the 12 largest.
pd.Series(np.transpose(abs(avg_coefs[0])), index=features_names[0]).nlargest(12).plot(kind='barh', figsize=(10,10))

In [None]:
# Display the feature column labels.
features_names[0]

In [None]:
# Pairs of plot title and a second setting (None / 'true') — presumably the `normalize`
# argument of a confusion-matrix plot; confirm.
# NOTE(review): `titles_options` is not referenced by any other cell visible here.
titles_options = [("Confusion matrix, without normalization", None),
                  ("Normalized confusion matrix", 'true')]

In [None]:
# Per-fold confusion matrices of the refit classifier.
# `labels=[0, 1]` pins every matrix to 2x2 even when a test fold (and its predictions)
# contains a single class, so the matrices can be summed element-wise afterwards.
# Each fold is predicted once and the same matrix is printed and stored.
cm_holder = []
for train_index, test_index in group_k_fold.split(X_shuffled, y=y_shuffled, groups=groups_shuffled):
    model.fit(X_shuffled[train_index], y_shuffled[train_index])
    fold_cm = confusion_matrix(y_shuffled[test_index], model.predict(X_shuffled[test_index]), labels=[0, 1])
    print(fold_cm)
    cm_holder.append(fold_cm)

In [None]:
# Display the list of per-fold confusion matrices.
cm_holder

In [None]:
# Add the per-fold confusion matrices together into one matrix covering all folds.
final_cm = sum(cm_holder)

In [None]:
import itertools 
from matplotlib.pyplot import figure

# Heat-map of the fold-summed confusion matrix with the count written in every cell.
class_labels = ['no rad','rad']
# One tick per matrix row/column. The original took the count from
# `y_shuffled[test_index]`, i.e. from whichever fold loop happened to run last.
tick_marks = np.arange(final_cm.shape[0])

figure(figsize=(8, 6), dpi=80)
plt.imshow(final_cm,cmap=plt.cm.Blues,interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(tick_marks,class_labels)
plt.yticks(tick_marks,class_labels)
# Write each count into its cell; cells above half the maximum count get white text.
thresh = final_cm.max() / 2.
for i,j in itertools.product(range(final_cm.shape[0]),range(final_cm.shape[1])):
    plt.text(j,i,format(final_cm[i,j],'d'),horizontalalignment='center',color='white' if final_cm[i,j] >thresh else 'black')
plt.show();

In [None]:
# Tally recording-level outcomes over all test folds in a single pass.
# The loop variables are named so that they do not rebind the module-level label
# array `y` (the original used `for x,y in ...`). `false_positives` was initialised
# but never updated in the original; it is now counted as well.
true_positives = 0
false_positives = 0
true_negatives = 0
for fold_labels, fold_predictions in zip(labels, predictions):
    for label, prediction in zip(fold_labels, fold_predictions):
        label, prediction = int(label), int(prediction)
        if label == prediction == 1:
            true_positives += 1
        elif label == prediction == 0:
            true_negatives += 1
        elif prediction == 1:
            # Predicted 1 although the label differs.
            false_positives += 1



In [None]:
# Display the number of recordings with label 1 that were predicted as 1.
true_positives

In [None]:
# Display the number of recordings with label 0 that were predicted as 0.
true_negatives

In [None]:
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
# Cross-validated ROC analysis, adapted from
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html
# The per-fold curves are computed with roc_curve/auc rather than plot_roc_curve,
# which is no longer available in scikit-learn 1.2 and later.
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots(figsize=(10,10))
for i, (train, test) in enumerate(group_k_fold.split(X_shuffled, y=y_shuffled, groups=groups_shuffled)):
    model.fit(X_shuffled[train], y_shuffled[train])
    # Continuous score for the positive class: class-1 probability when the model
    # offers predict_proba, otherwise the decision-function value.
    if hasattr(model, 'predict_proba'):
        fold_scores = model.predict_proba(X_shuffled[test])[:, 1]
    else:
        fold_scores = model.decision_function(X_shuffled[test])
    fold_fpr, fold_tpr, fold_thresholds = roc_curve(y_shuffled[test], fold_scores)
    fold_auc = auc(fold_fpr, fold_tpr)
    ax.plot(fold_fpr, fold_tpr, alpha=0.3, lw=1,
            label='ROC fold {} (AUC = {:0.2f})'.format(i, fold_auc))
    # Resample the fold's curve on the common FPR grid so the folds can be averaged.
    interp_tpr = np.interp(mean_fpr, fold_fpr, fold_tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(fold_auc)

ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

# Mean curve over the folds; the last point is forced to TPR = 1.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Band of one standard deviation around the mean curve, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="Receiver operating characteristic")
ax.legend(loc="lower right")
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.xaxis.label.set_size(20)
ax.yaxis.label.set_size(20)
plt.show()

In [None]:
# For every fold: refit the classifier, then record its test-fold predictions together
# with the subject IDs and the true labels of the same rows.
# `list_indexes_test` and `y_test_index` hold one single-element list per fold,
# because later cells read them as `...[k][0]`.
list_indexes_train = []
list_indexes_test = []
predictions = []
y_test_index = []

for train_index, test_index in group_k_fold.split(X_shuffled, y=y_shuffled, groups=groups_shuffled):
    model.fit(X_shuffled[train_index], y_shuffled[train_index])
    predictions.append(model.predict(X_shuffled[test_index]))
    list_indexes_test.append([groups_shuffled[test_index]])
    y_test_index.append([y_shuffled[test_index]])

    

In [None]:
# Row indices of every test fold, each wrapped in its own single-element list.
test_indexes = [
    [test_index]
    for _, test_index in group_k_fold.split(X_shuffled, y=y_shuffled, groups=groups_shuffled)
]

In [None]:
# Display the true labels stored for the first test fold.
y_test_index[0][0]

In [None]:
# Flatten the per-fold results into (subject ID, prediction, true label) triples.
# The folds are iterated directly, which removes the hard-coded fold count of 6,
# and the comprehension keeps its variables local, so the module-level label
# array `y` is no longer rebound as it was by `for x, y,z in ...`.
tuple_index_pred = [
    (subject_id, prediction, truth)
    for fold_subjects, fold_predictions, fold_truth in zip(list_indexes_test, predictions, y_test_index)
    for subject_id, prediction, truth in zip(fold_subjects[0], fold_predictions, fold_truth[0])
]

In [None]:
# Display the flattened (subject ID, prediction, true label) triples.
tuple_index_pred

In [None]:
# Tabulate the flattened triples, one row per recording.
# Column 'index' holds the subject ID, 'label' the classifier's prediction and
# 'ground_truth' the true class.
final_df = pd.DataFrame(tuple_index_pred, columns=['index', 'label', 'ground_truth'])
final_df

In [None]:
# Display the nested per-fold lists of true labels.
y_test_index

In [None]:
# Print the complete table with pandas' row and column display limits lifted for this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(final_df)

In [None]:
# Among rows predicted as 1, summarise how many are exact repeats of an earlier row
# (same subject ID, prediction and ground truth).
final_df.loc[final_df['label']==1].duplicated().describe() # recorded run: 46 False and 19 True, so 19 people have both recordings predicted as 1

In [None]:
# Among rows predicted as 0, summarise how many are exact repeats of an earlier row
# (same subject ID, prediction and ground truth).
final_df.loc[final_df['label']==0].duplicated().describe() # recorded run: 38 False and 14 True, so 14 people have both recordings predicted as 0

In [None]:
# Of the 56 subjects, 12 have both recordings classified correctly; for the other 44, one or both recordings are incorrect.
# Three subject-level measures are needed: both recordings correct, both incorrect, and exactly one correct.
# Known so far: 12 subjects with both recordings correct. Still open: how many have both recordings incorrect,
# and how many have exactly one recording correct?


In [None]:
# 1. Flag, per row, whether another row shares the same subject ID and predicted label.
# True  -> the subject's recordings received the same prediction (agreement);
# False -> no other row has that (subject, prediction) pair (disagreement, or no second row).
# NOTE(review): despite its name, `discarded` is True for the rows that later cells keep.
final_df['discarded'] = final_df[['index','label']].duplicated(keep=False) # duplicated means True; disagreement in label means False

In [None]:
# Keep only the rows where the predicted labels agree, i.e. the duplicated ones.
final_df.loc[final_df['discarded']==True] # recorded run: 66 rows where the classifier's labels agree
# and 51 rows where they do not.
# The 66 agreeing recordings correspond to 33 of the 56 people who have both recordings;
# the remaining 23 people are left to the clinician.

In [None]:
# Display the rows that are not flagged, i.e. those without a matching (subject, prediction) row.
final_df.loc[final_df['discarded']==False]

In [None]:
# Rows whose (subject ID, prediction) pair occurs only once in the table.
df_disagree = final_df.loc[final_df['discarded']==False]
# Row count after excluding the five rows that the next cell drops.
# The original line `len(df_disagree) -5 = 46` assigned to an expression, which is a
# SyntaxError; the result noted by the author is kept as a comment instead.
len(df_disagree) - 5  # = 46 in the recorded run

In [None]:
# Remove five rows, identified by index label, from the disagreement table.
# The table is rebuilt from `final_df` first, so re-running this cell does not raise
# KeyError on labels that an earlier run already dropped.
# NOTE(review): the labels are hard-coded and tied to the row order produced by the
# shuffle/fold sequence — presumably they are subjects with a single recording; confirm.
df_disagree = final_df.loc[final_df['discarded']==False].drop([102, 73, 83, 44, 47])

In [None]:
# Number of disagreement rows whose ground truth is 0 (the 'no_rad' class).
len(df_disagree.loc[df_disagree['ground_truth']==0])  # recorded run: 22 out of 46
# i.e. 11 of the 23 children with disagreeing predictions actually had no_rad

In [None]:
# Distribution of predicted labels among the disagreement rows.
df_disagree['label'].value_counts()

In [None]:
# Accuracy is computed next on the rows where the predictions agree.
# NOTE(review): `final_discarded` holds the rows flagged True, i.e. the rows that are kept.
final_discarded = final_df.loc[final_df['discarded']==True]

In [None]:
# Summary statistics of the retained rows whose ground truth is 1.
final_discarded[final_discarded['ground_truth']==1].describe()

In [None]:
# Number of retained rows whose prediction equals the ground truth.
# Computed column-wise, so no loop variable rebinds the module-level label array `y`
# (the original used `for x,y in zip(...)`).
accuracy = int((final_discarded['label'].to_numpy() == final_discarded['ground_truth'].to_numpy()).sum())
accuracy

In [None]:
# False positives, false negatives and true positives among the retained rows.
# Computed column-wise, so no loop variable rebinds the module-level label array `y`.
predicted = final_discarded['label'].to_numpy()
actual = final_discarded['ground_truth'].to_numpy()
fp = int(((predicted != actual) & (predicted == 1)).sum())  # predicted 1, truth differs
fn = int(((predicted != actual) & (predicted == 0)).sum())  # predicted 0, truth differs
tp = int(((predicted == actual) & (predicted == 1)).sum())  # predicted 1, truth agrees

In [None]:
# Accuracy in percent over the retained rows. The denominator is taken from the table
# itself instead of the hard-coded 66 of the recorded run.
accuracy / len(final_discarded) * 100  # recorded run: 69% accuracy among the rows with agreement

In [None]:
# Display the number of correct retained rows.
accuracy # author's note: chi-square value is 2.77 (significant at p<.10)

In [None]:
# Baseline for comparison with chance: class distribution of the ground truth among the retained rows.
final_discarded['ground_truth'].value_counts() # recorded run: 18 people with no rad and 15 with rad among those kept

In [None]:
# Same class distribution as in the previous cell, displayed again.
final_discarded['ground_truth'].value_counts()

In [None]:
# Display the false-positive, false-negative and true-positive counts.
fp, fn, tp

In [None]:
# Recall and precision on the retained rows.
# A zero denominator yields NaN instead of raising ZeroDivisionError.
recall = tp / (tp + fn) if (tp + fn) else float('nan')
precision = tp / (tp + fp) if (tp + fp) else float('nan')
recall, precision

In [None]:
# Next step: confusion matrix (and possibly ROC) restricted to the retained rows,
# described by the author as the person level.
# Ground truth and predictions of those rows as NumPy arrays.
y_true = np.array(final_discarded['ground_truth'])
y_pred = np.array(final_discarded['label'])

In [None]:
# Confusion matrix of the retained rows. `labels=[0, 1]` pins it to 2x2 so that it
# always matches the two class names used when it is plotted.
cm_person = confusion_matrix(y_true, y_pred, labels=[0, 1])

In [None]:
# Heat-map of the person-level confusion matrix with the count written in every cell.
# Only `plt` and `np` are used here, so the cell does not depend on the `figure` and
# `itertools` imports made inside an earlier plotting cell.
class_labels = ['no rad','rad']
# One tick per matrix row/column. The original took the count from
# `y_shuffled[test_index]`, i.e. from whichever fold loop happened to run last.
tick_marks = np.arange(cm_person.shape[0])

plt.figure(figsize=(8, 6), dpi=80)
plt.imshow(cm_person,cmap=plt.cm.Blues,interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix Person level')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(tick_marks,class_labels)
plt.yticks(tick_marks,class_labels)
# Write each count into its cell; cells above half the maximum count get white text.
thresh = cm_person.max() / 2.
for i, j in np.ndindex(*cm_person.shape):
    plt.text(j,i,format(cm_person[i,j],'d'),horizontalalignment='center',color='white' if cm_person[i,j] >thresh else 'black')
plt.show();

In [None]:
from statsmodels.stats.multitest import multipletests

In [None]:
# Eleven p-values to be corrected for multiple comparisons.
# NOTE(review): the values are hard-coded and their origin is not shown in this notebook —
# presumably one test per feature; confirm.
pvals = [.378, .122, .161, .379, .721, .910, .367, .892, .979, .381, .007]

In [None]:
# Apply the 'fdr_bh' (Benjamini-Hochberg false-discovery-rate) correction to the p-values and display the result.
multipletests(pvals, method='fdr_bh')