In [None]:
import itertools
import os
import statistics

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from numpy import mean
from numpy import std
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.utils import shuffle

In [None]:
# Work from the directory that holds features.xlsx. The location can be
# overridden with the SPEECH_DATA_DIR environment variable; the original
# absolute path is kept as the default so existing setups keep working.
os.chdir(os.environ.get('SPEECH_DATA_DIR', '/Users/andrei-macpro/Documents/Data/Classification/speech'))


In [None]:
# Load the feature table from the "Meal" sheet of the workbook.
data = pd.read_excel(
    io='features.xlsx',
    sheet_name="Meal",
    engine='openpyxl',
)

In [None]:
# Subject identifier of every row, as a standalone NumPy array.
subject_id = data["Subject_ID"].to_numpy(copy=True)

In [None]:
# Feature matrix: columns 1..10 of the sheet.
X = data.iloc[:, 1:11].to_numpy()
# Binary target from the last column: 'no_rad' -> 0, anything else -> 1.
y = np.where(data.iloc[:, -1].to_numpy() == 'no_rad', 0, 1)

In [None]:
# Standardise the features, then classify with a linear-kernel SVM.
classifier = SVC(kernel='linear')
steps = [
    ('scaler', StandardScaler()),
    ('model', classifier),
]
pipeline = Pipeline(steps=steps)

In [None]:
# Single 5-fold cross-validation run of the scaler + linear-SVM pipeline.
# A fixed random_state makes the shuffled split, and therefore the reported
# accuracy, reproducible across kernel restarts.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1)

In [None]:
# Report mean and standard deviation of the fold accuracies, in percent.
print('Accuracy: %.3f (%.3f)' % (100 * np.mean(scores), 100 * np.std(scores)))

In [None]:
# Repeated 5-fold CV on the meal data: 10 repetitions, each with its own
# shuffle seed. Collects the mean fold accuracy per repetition and the
# linear-SVM coefficients of every fold.
accuracies_meal = []  # one mean accuracy per repetition
final_coef = []       # one array of stacked per-fold coefficients per repetition
for i in range(10):
    kf = KFold(n_splits=5, random_state=i, shuffle=True)
    coefficients_meal = []
    accuracies = []
    for train_index, test_index in kf.split(X, y):
        X_train_, X_test_ = X[train_index], X[test_index]
        y_train_, y_test_ = y[train_index], y[test_index]

        # Fit the scaler on the training fold only, so that no statistics of
        # the test fold leak into training.
        scaler = StandardScaler()
        scaler.fit(X_train_)
        X_train_, X_test_ = scaler.transform(X_train_), scaler.transform(X_test_)

        model = SVC(kernel='linear', probability=True)
        model.fit(X_train_, y_train_)

        # Predict once and reuse the result for printing and scoring.
        y_pred_ = model.predict(X_test_)
        print(y_pred_)

        accuracy = accuracy_score(y_test_, y_pred_)
        accuracies.append(accuracy)
        # '.3f' = three decimals (a bare '3f' would only set a minimum width).
        print(f"Accuracy {accuracy:.3f}")
        print(model.coef_)
        coefficients_meal.append(model.coef_)
    accuracies_meal.append(mean(accuracies))
    # For this binary problem each coef_ is (1, n_features); reducing over that
    # singleton axis yields one row per fold: (n_folds, n_features).
    final_coef.append(mean(coefficients_meal, axis=1))
    


In [None]:
# getting the predictions 


In [None]:
# Repeated 5-fold CV on the meal data (10 shuffle seeds): for every repetition
# store the fold-averaged recall, precision and F1 score.
recall_meal, precision_meal, f_score_meal = [], [], []
for i in range(10):
    kf = KFold(n_splits=5, random_state=i, shuffle=True)
    recalls, precisions, f_scores = [], [], []
    for train_index, test_index in kf.split(X):
        X_train_, y_train_ = X[train_index], y[train_index]
        X_test_, y_test_ = X[test_index], y[test_index]

        # Standardise both folds with statistics taken from the training fold only.
        scaler = StandardScaler()
        scaler.fit(X_train_)
        X_train_ = scaler.transform(X_train_)
        X_test_ = scaler.transform(X_test_)

        model = SVC(kernel='linear', probability=True)
        model.fit(X_train_, y_train_)

        # One prediction pass feeds all three metrics.
        y_pred_ = model.predict(X_test_)
        recall = recall_score(y_test_, y_pred_)
        precision = precision_score(y_test_, y_pred_)
        f1 = f1_score(y_test_, y_pred_)
        recalls.append(recall)
        precisions.append(precision)
        f_scores.append(f1)

    recall_meal.append(mean(recalls))
    precision_meal.append(mean(precisions))
    f_score_meal.append(mean(f_scores))
recall_meal, precision_meal
    

In [None]:
# Mean and sample standard deviation of the per-repetition F1 scores (meal).
(np.mean(f_score_meal), statistics.stdev(f_score_meal))

In [None]:
# Mean and sample standard deviation of the per-repetition recall (meal).
(np.mean(recall_meal), statistics.stdev(recall_meal))

In [None]:
# Mean and sample standard deviation of the per-repetition precision (meal).
(np.mean(precision_meal), statistics.stdev(precision_meal))

In [None]:
# Signed coefficient of each feature. final_coef stacks to
# (repetitions, folds, features): average over folds (axis=1), then over
# repetitions (axis=0), leaving one value per feature. Reducing the second
# time over axis=1 would average across features instead.
np.mean(np.mean(final_coef, axis=1), axis=0)

In [None]:
# Importance of each feature: mean absolute coefficient over the folds of a
# repetition (axis=0 of each (folds, features) array), then averaged over the
# repetitions (axis=0 again). The result has one entry per feature, which is
# what the feature-importance table pairs it with.
meal_coefficients = np.mean(
    [np.mean(np.abs(coef), axis=0) for coef in final_coef],
    axis=0,
)
meal_coefficients

In [None]:
# Mean absolute coefficient per feature for every repetition:
# rows = repetitions, columns = features (folds are averaged out with axis=0).
iterations_coefs = np.array([np.mean(np.abs(coef), axis=0) for coef in final_coef])


In [None]:
# Spread of each feature's mean absolute coefficient across the repetitions.
# Computed directly from final_coef so the result is one value per feature:
# folds are averaged per repetition (axis=0), then the standard deviation is
# taken over repetitions (axis=0).
std_coefs = np.std(
    [np.mean(np.abs(coef), axis=0) for coef in final_coef],
    axis=0,
)
std_coefs

In [None]:
# Accuracy averaged over the repetitions (meal).
np.mean(accuracies_meal)

In [None]:
# Sample standard deviation of the per-repetition mean accuracies (meal).
import statistics
statistics.stdev(accuracies_meal)

In [None]:
# Sanity check: number of entries in meal_coefficients.
# NOTE(review): presumably expected to equal the number of feature columns in X - confirm.
len(meal_coefficients)

In [None]:
# Names of the feature columns (sheet columns 1..10), kept inside a one-element
# list because later cells read them as features_names_meal[0].
features_names_meal = [data.columns[1:11]]
features_names_meal

In [None]:
# Show the feature-name index on its own.
features_names_meal[0].transpose()

In [None]:
# One row per feature: its name, mean absolute coefficient and spread.
meal_features_imp = pd.DataFrame(
    list(zip(features_names_meal[0], meal_coefficients, std_coefs)),
    columns=['feature', 'coefficient', 'std'],
)
meal_features_imp

In [None]:
# Write the transposed importance table to disk, without the index column.
meal_features_imp.transpose().to_csv('feature_coefs.csv', index=False)


In [None]:
# Older approach: element-wise average of the coefficient arrays currently held
# in coefficients_meal (their sum divided by their count).
# NOTE(review): coefficients_meal is re-created at the start of every repetition
# of the accuracy loop, so this presumably reflects only the last repetition's
# folds - confirm that this is intended.
coefs_meal = sum(coefficients_meal)/len(coefficients_meal)
coefs_meal

In [None]:
def f_importances(coef, names):
    """Show a horizontal bar chart of coefficients in ascending order.

    Args:
        coef: iterable of coefficient values.
        names: iterable of labels, paired position-by-position with ``coef``.

    The (value, label) pairs are sorted together, so ties in value are
    ordered by label. The chart is displayed with ``plt.show()``.
    """
    ranked_pairs = sorted(zip(coef, names))
    ordered_values, ordered_labels = zip(*ranked_pairs)
    bar_positions = range(len(ordered_labels))
    plt.barh(bar_positions, ordered_values, align='center')
    plt.yticks(bar_positions, ordered_labels)
    plt.show()
    
# Label the bars with the same ten columns that X was built from
# (data.iloc[:, 1:11]); a wider slice yields more labels than coefficients and
# pandas rejects the Series because index and values differ in length.
features_names_meal = [data.iloc[:, 1:11].columns]
# Horizontal bars of the absolute averaged coefficients, largest first.
pd.Series(np.transpose(abs(coefs_meal[0])), index=features_names_meal[0]).nlargest(12).plot(kind='barh', figsize=(10,10))
#plt.savefig('meal_features.png')

In [None]:
# Pipeline-based cross-check: accuracy in percent for 10 differently seeded
# 5-fold splits of the meal data.
accuracy_meal = list()
for i in range(10):
    cv = KFold(shuffle=True, n_splits=5, random_state=i)
    scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=cv, scoring='accuracy', n_jobs=-1)
    accuracy_meal.append(100 * np.mean(scores))

np.mean(accuracy_meal)

In [None]:
# Pipeline-based cross-check: recall in percent for 10 differently seeded
# 5-fold splits of the meal data.
recall_meal = []
for i in range(10):
    # The KFold(...) call needs its closing parenthesis; without it the cell
    # does not parse.
    cv = KFold(n_splits=5, shuffle=True, random_state=i)
    scores_recall_meal = cross_val_score(pipeline, X, y, scoring='recall', cv=cv, n_jobs=-1)
    recall_meal.append(mean(scores_recall_meal)*100)

mean(recall_meal)

In [None]:
# Pipeline-based cross-check: precision in percent for 10 differently seeded
# 5-fold splits of the meal data.
precision_meal = []
for i in range(10):
    # The KFold(...) call needs its closing parenthesis; without it the cell
    # does not parse.
    cv = KFold(n_splits=5, shuffle=True, random_state=i)
    scores_precision_meal = cross_val_score(pipeline, X, y, scoring='precision', cv=cv, n_jobs=-1)
    precision_meal.append(mean(scores_precision_meal)*100)

mean(precision_meal)

In [None]:
# Per-repetition accuracies (percent) from the pipeline-based run on the meal data.
accuracy_meal

In [None]:
# Average of the per-repetition accuracies (percent, meal).
np.mean(accuracy_meal)

In [None]:
# Confusion matrices for one 5-fold split of the meal data, one per fold.

# Explicit seed instead of the leftover loop variable `i` from an earlier cell;
# 9 is the value `i` holds after any of the preceding range(10) loops, so the
# split is unchanged but no longer depends on which cell ran last.
kf = KFold(n_splits=5, random_state=9, shuffle=True)
cm_holder = []
for train_index, test_index in kf.split(X):
    X_train_, X_test_ = X[train_index], X[test_index]
    y_train_, y_test_ = y[train_index], y[test_index]

    # Scale training and test data with statistics of the training fold only.
    scaler = StandardScaler()
    scaler.fit(X_train_)
    X_train_, X_test_ = scaler.transform(X_train_), scaler.transform(X_test_)

    model = SVC(kernel='linear', probability=True)
    model.fit(X_train_, y_train_)

    y_pred_ = model.predict(X_test_)
    print(y_pred_)

    # labels=[0, 1] keeps every fold's matrix 2x2 even if a fold happens to
    # contain (and predict) a single class, so the folds can be summed later.
    cm_fold = confusion_matrix(y_test_, y_pred_, labels=[0, 1])
    print(cm_fold)

    accuracy = accuracy_score(y_test_, y_pred_)
    # '.3f' = three decimals (a bare '3f' would only set a minimum width).
    print(f"Accuracy {accuracy:.3f}")
    print(model.coef_)
    # The coefficients are only printed here; they are deliberately not
    # appended to coefficients_meal, which belongs to the accuracy loop.
    cm_holder.append(cm_fold)

In [None]:
# Pool the per-fold confusion matrices collected in cm_holder by adding them
# element-wise.
matrix_meal= sum(cm_holder)
matrix_meal

In [None]:
# Draw the pooled meal confusion matrix and save it as matrix_meal.png.
figure(figsize=(8, 6), dpi=80)
plt.imshow(matrix_meal, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
plt.title('Meal', fontsize=16)
plt.xlabel('Predicted', fontsize=16)
plt.ylabel('Actual', fontsize=16)
class_labels = ['no rad', 'rad']
# One tick per class label; this no longer depends on y_test_ left over from
# the last fold of another cell.
tick_marks = np.arange(len(class_labels))
plt.xticks(tick_marks, class_labels, fontsize=16)
plt.yticks(tick_marks, class_labels, fontsize=16)
# Write each count into its cell (text colour is black for every cell).
for i, j in itertools.product(range(matrix_meal.shape[0]), range(matrix_meal.shape[1])):
    plt.text(j, i, format(matrix_meal[i, j], 'd'), horizontalalignment='center', color='black', fontsize='xx-large')
# Save before showing: after show() the figure is released, and a later
# savefig() would write an empty canvas.
plt.savefig('matrix_meal.png')
plt.show();

In [None]:
# Repeat the analysis for the play session: reload the table from the "Play"
# sheet and rebuild features, labels and subject ids from it.
data = pd.read_excel(
    io='features.xlsx',
    sheet_name="Play",
    engine='openpyxl',
)
# Feature matrix: columns 1..10 of the sheet.
X = data.iloc[:, 1:11].to_numpy()
# Binary target from the last column: 'no_rad' -> 0, anything else -> 1.
y = np.where(data.iloc[:, -1].to_numpy() == 'no_rad', 0, 1)
subject_id = data["Subject_ID"].to_numpy(copy=True)

In [None]:
# Repeated 5-fold CV on the play data: 10 repetitions, each with its own
# shuffle seed. Collects the mean fold accuracy per repetition and the
# linear-SVM coefficients of every fold.
accuracies_play = []  # one mean accuracy per repetition
final_coef = []       # one array of stacked per-fold coefficients per repetition
for i in range(10):
    kf = KFold(n_splits=5, random_state=i, shuffle=True)
    coefficients_play = []
    accuracies = []
    for train_index, test_index in kf.split(X):
        X_train_, X_test_ = X[train_index], X[test_index]
        y_train_, y_test_ = y[train_index], y[test_index]

        # Fit the scaler on the training fold only, so that no statistics of
        # the test fold leak into training.
        scaler = StandardScaler()
        scaler.fit(X_train_)
        X_train_, X_test_ = scaler.transform(X_train_), scaler.transform(X_test_)

        model = SVC(kernel='linear', probability=True)
        model.fit(X_train_, y_train_)

        # Predict once and reuse the result for printing and scoring.
        y_pred_ = model.predict(X_test_)
        print(y_pred_)

        accuracy = accuracy_score(y_test_, y_pred_)
        accuracies.append(accuracy)
        # '.3f' = three decimals (a bare '3f' would only set a minimum width).
        print(f"Accuracy {accuracy:.3f}")
        print(model.coef_)
        coefficients_play.append(model.coef_)
    accuracies_play.append(mean(accuracies))
    # For this binary problem each coef_ is (1, n_features); reducing over that
    # singleton axis yields one row per fold: (n_folds, n_features).
    final_coef.append(mean(coefficients_play, axis=1))

In [None]:
# Repeated 5-fold CV on the play data (10 shuffle seeds): for every repetition
# store the fold-averaged recall, precision and F1 score.
recall_play, precision_play, f1_play = [], [], []
for i in range(10):
    kf = KFold(n_splits=5, random_state=i, shuffle=True)
    recalls, precisions, f_scores = [], [], []
    for train_index, test_index in kf.split(X):
        X_train_, y_train_ = X[train_index], y[train_index]
        X_test_, y_test_ = X[test_index], y[test_index]

        # Standardise both folds with statistics taken from the training fold only.
        scaler = StandardScaler()
        scaler.fit(X_train_)
        X_train_ = scaler.transform(X_train_)
        X_test_ = scaler.transform(X_test_)

        model = SVC(kernel='linear', probability=True)
        model.fit(X_train_, y_train_)

        # One prediction pass feeds all three metrics.
        y_pred_ = model.predict(X_test_)
        recall = recall_score(y_test_, y_pred_)
        precision = precision_score(y_test_, y_pred_)
        f1 = f1_score(y_test_, y_pred_)
        recalls.append(recall)
        precisions.append(precision)
        f_scores.append(f1)

    recall_play.append(mean(recalls))
    precision_play.append(mean(precisions))
    f1_play.append(mean(f_scores))
recall_play, precision_play
    

In [None]:
# Mean and sample standard deviation over repetitions (play), in the order
# recall, precision, accuracy.
(
    np.mean(recall_play), statistics.stdev(recall_play),
    np.mean(precision_play), statistics.stdev(precision_play),
    np.mean(accuracies_play), statistics.stdev(accuracies_play),
)

In [None]:
# Mean and sample standard deviation of the per-repetition F1 scores (play).
(np.mean(f1_play), statistics.stdev(f1_play))

In [None]:
# Per-feature importance for the play data. Each element of final_coef is a
# (folds, features) array, so folds are averaged with axis=0; the repetitions
# are then combined over axis=0 as well, leaving one value per feature.
iterations_coefs = np.array([np.mean(np.abs(coef), axis=0) for coef in final_coef])  # (repetitions, features)
play_coefficients = np.mean(iterations_coefs, axis=0)  # mean |coefficient| per feature
std_coefs = np.std(iterations_coefs, axis=0)           # spread across repetitions per feature


In [None]:
# Signed coefficient of each feature (play). final_coef stacks to
# (repetitions, folds, features): average over folds (axis=1), then over
# repetitions (axis=0), leaving one value per feature. Reducing the second
# time over axis=1 would average across features instead.
np.mean(np.mean(final_coef, axis=1), axis=0)


In [None]:
# One row per feature (names reused from the meal section): mean absolute
# coefficient and spread for the play data; written transposed, without index.
play_features_imp = pd.DataFrame(
    list(zip(features_names_meal[0], play_coefficients, std_coefs)),
    columns=['feature', 'coefficient', 'std'],
)
play_features_imp.transpose().to_csv('feature_coefs2.csv', index=False)


In [None]:
# Element-wise average of the coefficient arrays currently held in
# coefficients_play (their sum divided by their count).
# NOTE(review): coefficients_play is re-created at the start of every repetition
# of the accuracy loop, so this presumably reflects only the last repetition's
# folds - confirm that this is intended.
coefs_play = sum(coefficients_play)/len(coefficients_play)
coefs_play

In [None]:
# Label the bars with the same ten columns that X was built from
# (data.iloc[:, 1:11]); a wider slice yields more labels than coefficients and
# pandas rejects the Series because index and values differ in length.
features_names_play = [data.iloc[:, 1:11].columns]
# Horizontal bars of the absolute averaged coefficients, largest first.
pd.Series(np.transpose(abs(coefs_play[0])), index=features_names_play[0]).nlargest(12).plot(kind='barh', figsize=(10,10))

In [None]:
# Pipeline-based cross-check: accuracy in percent for 10 differently seeded
# 5-fold splits of the play data.
accuracy_play = list()
for i in range(10):
    cv = KFold(shuffle=True, n_splits=5, random_state=i)
    scores_play = cross_val_score(estimator=pipeline, X=X, y=y, cv=cv, scoring='accuracy', n_jobs=-1)
    accuracy_play.append(100 * np.mean(scores_play))

np.mean(accuracy_play)

In [None]:
# Pipeline-based cross-check: recall in percent for 10 differently seeded
# 5-fold splits of the play data.
recall_play = list()
for i in range(10):
    cv = KFold(shuffle=True, n_splits=5, random_state=i)
    scores_recall_play = cross_val_score(estimator=pipeline, X=X, y=y, cv=cv, scoring='recall', n_jobs=-1)
    recall_play.append(100 * np.mean(scores_recall_play))

np.mean(recall_play)

In [None]:
# Pipeline-based cross-check: precision in percent for 10 differently seeded
# 5-fold splits of the play data.
precision_play = list()
for i in range(10):
    cv = KFold(shuffle=True, n_splits=5, random_state=i)
    scores_precision_play = cross_val_score(estimator=pipeline, X=X, y=y, cv=cv, scoring='precision', n_jobs=-1)
    precision_play.append(100 * np.mean(scores_precision_play))

np.mean(precision_play)

In [None]:
# Average of the per-repetition accuracies (percent, play).
np.mean(accuracy_play)

In [None]:
# Human-readable feature labels used as the row index of the comparison table.
index = [
    'Speech activity',
    'intervals/min',
    'avg speech duration',
    'std speech duration',
    'avg silence duration',
    'std silence duration',
    'overlapping speech',
    'avg os',
    'std os',
    'speech caregiver',
    'speech child',
]


In [None]:
# Averaged coefficient vectors of the two sessions, side by side.
(coefs_meal[0], coefs_play[0])

In [None]:
# Absolute averaged coefficients of both sessions, one row per feature.
# The coefficient vectors have one entry per column of X, so only that many
# labels are taken from `index`; when the lengths already agree the slice is a
# no-op, otherwise it avoids the index/values length error.
# NOTE(review): this assumes the labels in `index` follow the column order of
# X and that any surplus labels are trailing ones - confirm against the sheet.
df = pd.DataFrame(
    {'meal': abs(coefs_meal[0]), 'play': abs(coefs_play[0])},
    index=index[:len(coefs_meal[0])],
)

In [None]:
# Comparison table ordered by the meal coefficient, smallest first.
df.sort_values('meal', ascending=True)

In [None]:
# Comparison table of absolute meal and play coefficients, in its original row order.
df

In [None]:
# Grouped bar chart of meal vs play coefficients, ordered by the meal value
# (largest first), saved as feature_importance.jpg.
plt.rcParams.update({"figure.figsize": [35, 10]})
df.sort_values('meal', ascending=False).plot.bar(rot=0)
plt.legend(fontsize=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=20)
plt.savefig('feature_importance.jpg')

In [None]:
# Confusion matrices for one 5-fold split of the play data, one per fold.

kf = KFold(n_splits=5, random_state=1, shuffle=True)
cm_holder = []
for train_index, test_index in kf.split(X):
    X_train_, X_test_ = X[train_index], X[test_index]
    y_train_, y_test_ = y[train_index], y[test_index]

    # Scale training and test data with statistics of the training fold only.
    scaler = StandardScaler()
    scaler.fit(X_train_)
    X_train_, X_test_ = scaler.transform(X_train_), scaler.transform(X_test_)

    model = SVC(kernel='linear', probability=True)
    model.fit(X_train_, y_train_)

    y_pred_ = model.predict(X_test_)
    print(y_pred_)

    # labels=[0, 1] keeps every fold's matrix 2x2 even if a fold happens to
    # contain (and predict) a single class, so the folds can be summed later.
    cm_fold = confusion_matrix(y_test_, y_pred_, labels=[0, 1])
    print(cm_fold)

    accuracy = accuracy_score(y_test_, y_pred_)
    # '.3f' = three decimals (a bare '3f' would only set a minimum width).
    print(f"Accuracy {accuracy:.3f}")
    print(model.coef_)
    # The coefficients are only printed here; these play-data models must not
    # be appended to coefficients_meal, which holds meal-data coefficients.
    cm_holder.append(cm_fold)

In [None]:
# Pool the per-fold confusion matrices collected in cm_holder by adding them
# element-wise.
matrix_play= sum(cm_holder)
matrix_play

In [None]:
# Draw the pooled play confusion matrix.
figure(figsize=(8, 6), dpi=80)
plt.imshow(matrix_play, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
plt.title('Play', fontsize=16)
plt.xlabel('Predicted', fontsize=16)
plt.ylabel('Actual', fontsize=16)
class_labels = ['no rad', 'rad']
# One tick per class label; this no longer depends on y_test_ left over from
# the last fold of another cell.
tick_marks = np.arange(len(class_labels))
plt.xticks(tick_marks, class_labels, fontsize=16)
plt.yticks(tick_marks, class_labels, fontsize=16)
# Write each count into its cell (text colour is black for every cell).
for i, j in itertools.product(range(matrix_play.shape[0]), range(matrix_play.shape[1])):
    plt.text(j, i, format(matrix_play[i, j], 'd'), horizontalalignment='center', color='black', fontsize='xx-large')
plt.show();

# Breakdown of meal and play agreements and disagreements

In [None]:
# Collect, for every fold of 10 repeated 5-fold splits, the predicted labels,
# the subject ids of the test rows and their true labels. The three lists are
# aligned: entry k of each list belongs to the same fold.
predictions = []
s_id = []
labels = []
for i in range(10):
    kf = KFold(n_splits=5, random_state=i, shuffle=True)
    for train_index, test_index in kf.split(X, y):
        X_train_, X_test_ = X[train_index], X[test_index]
        y_train_, y_test_ = y[train_index], y[test_index]

        # Scale training and test data with statistics of the training fold only.
        scaler = StandardScaler()
        scaler.fit(X_train_)
        X_train_, X_test_ = scaler.transform(X_train_), scaler.transform(X_test_)

        model = SVC(kernel='linear', probability=True)
        model.fit(X_train_, y_train_)

        predictions.append(model.predict(X_test_))
        s_id.append(subject_id[test_index])
        labels.append(y_test_)
        
    
    


In [None]:

# Flatten the per-fold arrays into one (prediction, subject id, true label)
# tuple per test row. A comprehension with descriptive names is used so that
# no loop variable overwrites the notebook-level label vector `y`.
tuple_index_pred = [
    (pred, subj, true_label)
    for fold_preds, fold_subjects, fold_labels in zip(predictions, s_id, labels)
    for pred, subj, true_label in zip(fold_preds, fold_subjects, fold_labels)
]

In [None]:
# Split the (prediction, subject id, label) tuples into three parallel lists
# and assemble them into a table with one row per test-set prediction.
t_list = [entry[0] for entry in tuple_index_pred]
t_list2 = [entry[1] for entry in tuple_index_pred]
t_list3 = [entry[2] for entry in tuple_index_pred]

final_df = pd.DataFrame(
    list(zip(t_list, t_list2, t_list3)),
    columns=['prediction', 's_id', 'label'],
)
final_df

In [None]:
# Average predicted label per subject across all repetitions.
final_df.groupby('s_id')['prediction'].mean()

In [None]:
# First 11 rows of the prediction table once ordered by subject id.
final_df.sort_values('s_id').iloc[:11]