In [None]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from numpy import mean
from numpy import std
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle

In [None]:
os.chdir('/Users/andrei-macpro/Documents/Data/Classification/speech')

In [None]:
data = pd.read_excel('classification.xlsx', engine='openpyxl')

In [None]:
X = data.iloc[:,1:12].to_numpy()
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]])

In [None]:
groups = np.array(data['Subject_ID'])

In [None]:
scaler = StandardScaler()
X = scaler.fit_transform(X,y)

In [None]:
# Shuffle the rows once with a fixed seed; features, labels and subject ids are
# permuted together so they stay aligned.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=0)
# 6-fold splitter that never places rows of the same group (subject) in both
# the training and the test part of a fold.
group_k_fold = GroupKFold(n_splits=6)
# `split` returns a one-shot generator: after a single pass it is exhausted, so
# anything consuming `splits` a second time needs this cell re-run first.
splits = group_k_fold.split(X_shuffled, y_shuffled, groups_shuffled)

In [None]:
# Linear SVM with the standardisation step inside the estimator, so that every
# `model.fit(...)` call learns the scaling statistics from the training rows of
# that fold only.  The notebook also scales the full matrix up front with
# `scaler.fit_transform`, which lets held-out rows influence the mean/std
# (preprocessing leakage).  Re-standardising per fold removes that influence:
# standardisation is invariant to an earlier per-feature affine rescaling, so
# the earlier global scaling becomes redundant but harmless.
# The pipeline still exposes fit / predict / decision_function.
model = make_pipeline(StandardScaler(), SVC(kernel='linear'))

In [None]:
# Per-fold outputs of the group-wise cross-validation; each list receives one
# array per fold: subject ids, decision values, true labels, predicted labels.
indices, decision, label, predictions = [], [], [], []
for fit_rows, eval_rows in group_k_fold.split(X_shuffled, y_shuffled, groups_shuffled):
    # Refit on this fold's training rows, then score its held-out rows.
    model.fit(X_shuffled[fit_rows], y_shuffled[fit_rows])
    X_eval = X_shuffled[eval_rows]
    predictions.append(model.predict(X_eval))
    indices.append(groups_shuffled[eval_rows])
    decision.append(model.decision_function(X_eval))
    label.append(y_shuffled[eval_rows])

In [None]:
# Fold-wise accuracy of the model under the same group-wise 6-fold scheme.
# The splitter object is passed (not the `splits` generator): a generator can be
# consumed only once, which made this cell fail when re-run on its own, and
# `groups` is only honoured when `cv` is a splitter.
scores = cross_val_score(model, X_shuffled, y_shuffled, scoring='accuracy', cv=group_k_fold, groups=groups_shuffled, n_jobs=-1)
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

In [None]:
# Flatten the per-fold arrays into one (subject id, label, prediction, score)
# tuple per held-out sample, fold after fold.
# A comprehension is used on purpose: its loop names stay local, whereas the
# former `for x, y, z, w in ...` loop overwrote the global label vector `y`.
# Zipping the lists also removes the hard-coded fold count.
decision_scores = [
    sample
    for fold_arrays in zip(indices, label, predictions, decision)
    for sample in zip(*fold_arrays)
]

In [None]:
# One row per held-out sample of the group-wise CV.
df = pd.DataFrame(decision_scores, columns =['s_id', 'label', 'prediction', 'score'])


In [None]:
# `duplicated(keep=False)` is True for every row whose (s_id, prediction) pair
# occurs more than once in the frame, i.e. the subject has at least one other
# sample with the same predicted class.
# NOTE(review): the column name reads inverted - True means the pair repeats;
# the later cell selects the False rows.
df['disagreement'] = df[['s_id','prediction']].duplicated(keep=False)
# Display only: the sorted copy is not assigned back to `df`.
df.sort_values(by='s_id')

In [None]:
# Drops five rows by index label.
# NOTE(review): the labels are hard-coded and the reason is not visible here;
# re-running this cell raises KeyError because the labels are already gone.
df = df.drop([102, 73, 83, 44, 47])

In [None]:
df

In [None]:
disagree = df.loc[df['disagreement']==False].sort_values(by='s_id')

In [None]:
disagreement = [key for key in  dict(disagree.groupby('s_id').groups)]


In [None]:
len(disagreement)

In [None]:
# let's look at the individual classification now
os.chdir('/Users/andrei-macpro/Documents/Data/classification/speech')

In [None]:
data = pd.read_excel('features.xlsx', sheet_name="Meal", engine='openpyxl')

In [None]:
subject_id = np.array(data["Subject_ID"])

In [None]:
X = data.iloc[:,1:12].to_numpy()
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]])

In [None]:
X = scaler.fit_transform(X,y)

In [None]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
splits = k_fold.split(X,y)

In [None]:
predictions = list()
labels = []
subjects = []
for train_indexes, test_indexes in splits:
        model.fit(X[train_indexes], y[train_indexes])
        predictions.append(model.predict(X[test_indexes]))
        labels.append(y[test_indexes])
        subjects.append(subject_id[test_indexes])
#meal = pd.DataFrame(list(zip(subjects, labels, predictions)), columns = ['subject_id', 'label', 'prediction'])

In [None]:
meal = []
k=0
while k<5:
    for x, y,z in zip(subjects[k], labels[k], predictions[k]):
        meal.append((x,y,z))
    k = k+1

In [None]:
meal = pd.DataFrame(meal, columns =['subject_id', 'label', 'prediction'])

In [None]:
# Number of held-out samples collected across the folds.
# `subjects_reshaped` was never defined in this notebook (NameError on a fresh
# kernel); the subject column of `meal` holds the flattened per-fold subject ids.
# NOTE(review): assumed to be what the check was meant to inspect - confirm.
meal['subject_id'].shape

In [None]:
# Repeat the 5-fold split under `n_shuffles` different shuffle seeds and keep,
# per fold, the predictions, labels, row positions and decision values of the
# held-out rows (n_shuffles * 5 arrays in each list).
n_shuffles = 10
predictions, labels, decisions, indices = [], [], [], []
for seed in range(n_shuffles):
    k_fold = KFold(n_splits=5, shuffle=True, random_state=seed)
    for fit_rows, eval_rows in k_fold.split(X, y):
        model.fit(X[fit_rows], y[fit_rows])
        X_eval = X[eval_rows]
        predictions.append(model.predict(X_eval))
        labels.append(y[eval_rows])
        # Row positions in X, not subject ids.
        indices.append(eval_rows)
        decisions.append(model.decision_function(X_eval))

In [None]:
decision_scores =[]
k=0
while k<50:
    for x, y,z,w in zip(indices[k], labels[k], predictions[k], decisions[k]):
        decision_scores.append((x,y,z,w))
    k = k+1

In [None]:
decisions_meal = pd.DataFrame(decision_scores, columns =['s_id', 'label', 'prediction', 'decision_score'])
decisions_meal

In [None]:
decisions_meal = decisions_meal.sort_values(by='s_id')
decisions_meal
final_decisions = decisions_meal.groupby('s_id').agg('mean')

In [None]:
predictions_meal

In [None]:
predictions_meal = decisions_meal.groupby('s_id').sum()
predictions_meal = predictions_meal.set_index(data['Subject_ID'])

In [None]:
# Subjects whose predicted class alternates across the 10 repetitions: the
# summed prediction (number of class-1 votes) lies strictly between 3 and 8.
missclassified_meal = [
    subject
    for subject, positive_votes in predictions_meal['prediction'].items()
    if 3 < positive_votes < 8
]

In [None]:
# Discard the averaged hard prediction; 'label' and 'decision_score' remain.
# Note: re-running this cell raises KeyError because the column is already gone.
final_decisions = final_decisions.drop(columns =['prediction'])

In [None]:
final_decisions

In [None]:
# Number of rows whose mean decision value falls on the side of their label
# (positive score with label 1, negative score with label 0).
# The counter is now actually initialised: the cell used to start with a bare
# `accuracy`, which raises NameError on a fresh kernel.
accuracy = int((
    ((final_decisions['decision_score'] > 0) & (final_decisions['label'] == 1))
    | ((final_decisions['decision_score'] < 0) & (final_decisions['label'] == 0))
).sum())
        

In [None]:
prediction = []
for index, row in final_decisions.iterrows():
    if row['decision_score']>0:
        prediction.append(1)
    elif row['decision_score']<0:
        prediction.append(0)

In [None]:
final_decisions['predictions'] = prediction

In [None]:
data = pd.read_excel('features.xlsx', sheet_name="Play", engine='openpyxl')

In [None]:
subject_id = np.array(data["Subject_ID"])

In [None]:
X = data.iloc[:,1:12].to_numpy()
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]])

In [None]:
X = scaler.fit_transform(X,y)

In [None]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
splits = k_fold.split(X,y)

In [None]:
predictions = list()
labels = []
subjects = []
for train_indexes, test_indexes in splits:
        model.fit(X[train_indexes], y[train_indexes])
        predictions.append(model.predict(X[test_indexes]))
        labels.append(y[test_indexes])
        subjects.append(subject_id[test_indexes])

In [None]:
subjects

In [None]:
play = []
k=0
while k<5:
    for x, y,z in zip(subjects[k], labels[k], predictions[k]):
        play.append((x,y,z))
    k = k+1

In [None]:
play

In [None]:
play = pd.DataFrame(play, columns =['subject_id', 'label', 'prediction'])

In [None]:
# Repeat the 5-fold split under `n_shuffles` different shuffle seeds and keep,
# per fold, the predictions, labels, row positions and decision values of the
# held-out rows.
n_shuffles = 10
predictions, labels, decisions, indices = [], [], [], []
for seed in range(n_shuffles):
    k_fold = KFold(n_splits=5, shuffle=True, random_state=seed)
    for fit_rows, eval_rows in k_fold.split(X, y):
        model.fit(X[fit_rows], y[fit_rows])
        X_eval = X[eval_rows]
        predictions.append(model.predict(X_eval))
        labels.append(y[eval_rows])
        # Row positions in X, not subject ids.
        indices.append(eval_rows)
        decisions.append(model.decision_function(X_eval))

In [None]:
# One (row position, label, prediction, decision value) tuple per held-out
# sample over all repetitions and folds.  Zipping the lists replaces the
# hard-coded count of 50 fold results, and the comprehension keeps the global
# label vector `y` intact (the former loop variables overwrote it).
decision_scores = [
    sample
    for fold_arrays in zip(indices, labels, predictions, decisions)
    for sample in zip(*fold_arrays)
]

decisions_play = pd.DataFrame(decision_scores, columns =['s_id', 'label', 'prediction', 'decision_score'])


In [None]:
decisions_play = decisions_play.sort_values(by='s_id')
# Mean label / prediction / decision value per row position over the
# repetitions.  This assignment was commented out although later cells read
# `final_decisions_play`, which made them fail with NameError.
final_decisions_play = decisions_play.groupby('s_id').agg('mean')

In [None]:
predictions_play = decisions_play.groupby('s_id').sum()
predictions_play = predictions_play.set_index(data['Subject_ID'])

In [None]:
missclassified_play = [] # participants who tend to alternate between neg and pos in those 10 iterations 
# between 40 and 70% of the times tend to be misclassified 
for index, row in predictions_play.iterrows():
    if row['prediction'] >3 and row['prediction'] <8:
        missclassified_play.append(index)

In [None]:
# Side-by-side display of the unstable subjects found for each task.
missclassified_play, missclassified_meal

In [None]:
# Spot check of one subject in both summed-prediction tables.
# NOTE(review): 1134 is a hard-coded index label; `.loc` raises KeyError if it
# is missing from either table.
predictions_play.loc[1134], predictions_meal.loc[1134]

In [None]:
final_decisions_play = final_decisions_play.drop(columns =['prediction'])

In [None]:
accuracy = 0
for index, row in final_decisions_play.iterrows():
    if row['decision_score']>0 and row['label'] ==1:
        accuracy +=1
    elif row['decision_score']<0 and row['label'] ==0:
        accuracy +=1

In [None]:
accuracy/len(final_decisions_play)

In [None]:
# Hard prediction from the sign of the mean decision value: class 1 when the
# score is positive, class 0 otherwise.  A score of exactly 0 is assigned to
# class 0; previously such a row appended nothing, so the list could be
# shorter than the frame and the column assignment would fail.
predictions = (final_decisions_play['decision_score'] > 0).astype(int).tolist()
final_decisions_play['predictions'] = predictions

In [None]:
final_decisions_play  = final_decisions_play.set_index(data['Subject_ID'])


In [None]:
final_decisions_play= final_decisions_play.rename(columns={"label":"label_play", "decision_score":"decision score play","predictions":"predictions play" })

In [None]:
final_decisions_meal = final_decisions.set_index(data['Subject_ID'])

In [None]:
final_decisions_meal= final_decisions_meal.rename(columns={"label":"label_meal", "decision_score":"decision score meal","predictions":"predictions meal" })

In [None]:
all_decisions = pd.concat([final_decisions_play, final_decisions_meal], axis=1)

In [None]:
all_decisions['agreement'] = all_decisions[['predictions play','predictions meal']].duplicated(keep=False)


In [None]:
final_pred = []
for index, row in all_decisions.iterrows():
    if abs(row['decision score play']) > abs(row['decision score meal']):
        final_pred.append(row['predictions play'])
    else:
        final_pred.append(row['predictions meal'])
all_decisions['final pred'] = final_pred

In [None]:
all_decisions

In [None]:
final_agreement = 0
for index, row in all_decisions.iterrows():
    if row['final pred'] == row['label_meal']:
        final_agreement +=1
final_agreement

In [None]:
# make a df of people who the classifier finds hard to put label on
import itertools
nest = [disagreement,missclassified_play, missclassified_meal]
confusing = pd.DataFrame((_ for _ in itertools.zip_longest(*nest)), columns=['confused_general', 'confused_play', 'confused_meal'])


In [None]:
unique = pd.unique(confusing[['confused_general', 'confused_play', 'confused_meal']].values.ravel('K'))
len(unique)
# 38 people are unique out of the confusing ones (so they only appear in one column) 
# 44 in total 

In [None]:
confusing # need to get some descriptive statistics on these 
# look at the predictions for these for both play and meal 

In [None]:
# Combined decisions for the subjects flagged as unstable in the Play task.
all_decisions.loc[missclassified_play]

In [None]:
# Combined decisions for the subjects flagged as unstable in the Meal task.
all_decisions.loc[missclassified_meal]

# let's combine the classifiers based on errors 

In [None]:
meal = meal.set_index(meal['subject_id'])
play = play.set_index(play['subject_id'])


In [None]:
meal = meal.rename(columns = {'prediction' :'prediction_meal'})
play = play.rename(columns = {'prediction' :'prediction_play'}).drop('label', axis=1)

In [None]:
combined = pd.concat([meal, play], axis=1)

In [None]:
combined = combined.drop('subject_id', axis=1)

In [None]:
combined = combined.drop([1049, 1089, 1250, 1096, 1195])

In [None]:
# Combined decision: class 1 only when BOTH task classifiers predict 1, class 0
# in every other case (this is exactly what the four former branches encoded).
# Rows that matched none of the branches - e.g. a NaN prediction for a subject
# missing in one task - used to append nothing, leaving `final_pred` shorter
# than the frame; they now get 0 like every other non-unanimous row.
final_pred = (
    (combined['prediction_meal'] == 1) & (combined['prediction_play'] == 1)
).astype(int).tolist()
combined['final_pred'] = final_pred
combined

In [None]:
accuracy = 0
for index, row, in combined.iterrows():
    if row['label'] == row['final_pred']:
        accuracy += 1
accuracy

In [None]:
accuracy/len(combined)

In [None]:
# Meal sheet, one 5-fold run: features at positions 1..11, binary target from
# the last column ('no_rad' -> 0, anything else -> 1).
data = pd.read_excel('features.xlsx', sheet_name="Meal", engine='openpyxl')
X = data.iloc[:,1:12].to_numpy()
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]])
X = scaler.fit_transform(X,y)

k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
splits = k_fold.split(X,y)
subject_id = np.array(data["Subject_ID"])
predictions = list()
labels = []
subjects = []
for train_indexes, test_indexes in splits:
    model.fit(X[train_indexes], y[train_indexes])
    predictions.append(model.predict(X[test_indexes]))
    labels.append(y[test_indexes])
    subjects.append(subject_id[test_indexes])

# One (subject_id, label, prediction) record per held-out sample.  Built with a
# comprehension so the label vector `y` is no longer overwritten by a loop
# variable, and without a hard-coded fold count.
meal = [
    record
    for fold_arrays in zip(subjects, labels, predictions)
    for record in zip(*fold_arrays)
]

meal = pd.DataFrame(meal, columns =['subject_id', 'label', 'prediction'])
meal = meal.set_index(meal['subject_id']).rename(columns = {'prediction' :'prediction_meal'})

In [None]:
# Play sheet, one 5-fold run: features at positions 1..11, binary target from
# the last column ('no_rad' -> 0, anything else -> 1).
data = pd.read_excel('features.xlsx', sheet_name="Play", engine='openpyxl')
X = data.iloc[:,1:12].to_numpy()
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]])
X = scaler.fit_transform(X,y)
subject_id = np.array(data["Subject_ID"])
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
splits = k_fold.split(X,y)

predictions = list()
labels = []
subjects = []
for train_indexes, test_indexes in splits:
    model.fit(X[train_indexes], y[train_indexes])
    predictions.append(model.predict(X[test_indexes]))
    labels.append(y[test_indexes])
    subjects.append(subject_id[test_indexes])

# One (subject_id, label, prediction) record per held-out sample.  Built with a
# comprehension so the label vector `y` is no longer overwritten by a loop
# variable, and without a hard-coded fold count.
play = [
    record
    for fold_arrays in zip(subjects, labels, predictions)
    for record in zip(*fold_arrays)
]

play = pd.DataFrame(play, columns =['subject_id', 'label', 'prediction'])
# The label is kept from the Meal table only.
play = play.set_index(play['subject_id']).rename(columns = {'prediction' :'prediction_play'}).drop('label', axis=1)

In [None]:
# Join the two single-run prediction tables on the subject index.
combined = pd.concat([meal, play], axis=1)
# NOTE(review): hard-coded subject ids; presumably subjects lacking one of the
# two tasks - confirm.
combined = combined.drop([1049, 1089, 1250, 1096, 1195], axis=0)
combined = combined.drop('subject_id', axis=1)

# Combined decision: class 1 only when BOTH task classifiers predict 1, class 0
# otherwise.  Rows matching none of the former four branches (e.g. a NaN
# prediction) used to append nothing and left `final_pred` shorter than the
# frame; they now fall into the class-0 default, which is also what the
# abandoned commented-out `else` branch did.
final_pred = (
    (combined['prediction_meal'] == 1) & (combined['prediction_play'] == 1)
).astype(int).tolist()
combined['final_pred'] = final_pred

# One entry per correctly classified subject; kept as a list because the next
# cell computes sum(accuracy) / len(combined).
accuracy = [1 for is_hit in (combined['label'] == combined['final_pred']) if is_hit]
sum(accuracy)

In [None]:
# Fraction of subjects in `combined` whose final prediction equals the label.
sum(accuracy)/len(combined)



In [None]:
# NOTE(review): these ten percentages are typed in by hand; presumably they
# were copied from repeated runs of the cells above - they are not computed
# anywhere in this notebook, so verify before relying on them.
accuracies = [62.5, 64.2, 58.9, 62.5,58.9, 58.9, 62.5,58.9, 60.7,62.5]
mean(accuracies)

In [None]:
std(accuracies)

In [None]:
#explore who tends to get the most errors in final pred 
errors = []
for index, row in combined.iterrows():
    if row['label'] != row['final_pred']:
        errors.append(index)

In [None]:
errors = np.array(errors)
errors = errors.tolist()

In [None]:
errors_unique = np.array(errors)
errors_unique = np.unique(errors_unique)
errors_unique

In [None]:
no_errors = []
for error in errors_unique:
    no_errors.append(errors.count(error))

In [None]:
misclass = pd.DataFrame(list(zip(errors_unique, no_errors)), columns = ['s_id', 'misclassifications'])

misclass = misclass.loc[misclass['misclassifications'] > 4] # more than half the time

In [None]:
misclass