In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from numpy import mean
from numpy import std
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from pipelinehelper import PipelineHelper
from sklearn.decomposition import PCA

from collections import Counter

from sklearn.metrics import balanced_accuracy_score

In [None]:
os.chdir('/Users/andrei-macpro/Documents/Data/classification/speech')

In [None]:
data = pd.read_excel('classification.xlsx', engine='openpyxl')

In [None]:
data.drop('Proportion speech child', axis=1, inplace=True)

In [None]:
data.iloc[:,1:11]

In [None]:
X = data.iloc[:,1:11].to_numpy()

In [None]:
groups = data['Subject_ID']

In [None]:
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

# Model selection: several classifier families share one scaled pipeline and
# are compared with grouped 5-fold cross-validation (groups = subject ids),
# refitting the winner on balanced accuracy.

# One seed for the row shuffle and for every estimator that draws random
# numbers, so re-running this cell selects the same model.
SEED = 42

best_estimators = []
results = []
predictions = []

# Shuffle rows once; X, y and groups are permuted together so every row keeps
# its own label and subject id.
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=SEED)
cv = GroupKFold(n_splits=5)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', PipelineHelper([
        ('svm', SVC(class_weight='balanced')),
        ('lr', LogisticRegression(class_weight='balanced', random_state=SEED)),
        ('tree', tree.DecisionTreeClassifier(class_weight='balanced', random_state=SEED)),
        ('forest', RandomForestClassifier(class_weight='balanced', random_state=SEED)),
        ('linear', LinearDiscriminantAnalysis()),
        ('qudratic', QuadraticDiscriminantAnalysis()),
        ('naive', GaussianNB()),
    ])),
])

# Only the SVM and the logistic regression get a hyper-parameter grid here.
# NOTE(review): presumably PipelineHelper.generate() still emits the other
# models with their default settings -- confirm against pipelinehelper docs.
params = {
    'classifier__selected_model': pipe.named_steps['classifier'].generate({
        'svm__kernel': ['linear', 'rbf'],
        'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
        'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
        'lr__penalty': ['l1', 'l2'],
        'lr__solver': ['liblinear'],
        'lr__C': np.logspace(-4, 4, 20),
    })
}
search = GridSearchCV(
    pipe,
    params,
    scoring=['accuracy', 'balanced_accuracy', 'f1', 'precision', 'recall'],
    refit='balanced_accuracy',
    cv=cv,
    n_jobs=-1,
)

search.fit(X_shuffled, y_shuffled, groups=groups_shuffled)
# best_score_ is the mean cross-validated score of the refit metric.
print(search.best_score_)
print(search.best_params_)

In [None]:
# Mean cross-validated scores for every candidate, indexed by its parameters.
# The search refits on balanced accuracy, so that metric is reported and
# drives the ordering; the top row then matches search.best_params_.
report_columns = [
    'rank_test_balanced_accuracy', 'mean_test_balanced_accuracy',
    'rank_test_accuracy', 'mean_test_accuracy',
    'mean_test_f1', 'mean_test_precision', 'mean_test_recall',
]
df_grid_search = pd.DataFrame(search.cv_results_).set_index('params')[report_columns]
df_grid_search.sort_values(by='mean_test_balanced_accuracy', ascending=False).head(50)

In [None]:
from sklearn.model_selection import LeaveOneGroupOut
X_shuffled, y_shuffled, groups_shuffled = shuffle(X,y,groups ,random_state=42)

scoring = ['accuracy', 'recall','precision', 'f1']
scaler = preprocessing.StandardScaler()
clf = SVC(C= 100,gamma=0.001,kernel='linear', class_weight='balanced')
pipe = Pipeline(steps=[("scaler", scaler), ("classifier", clf)])
cv = LeaveOneGroupOut()
scores = cross_validate(pipe, X_shuffled, y_shuffled,groups=groups_shuffled, cv=cv, scoring=scoring, return_estimator=True)

In [None]:
scores['test_f1'].mean()

In [None]:
# Feature importance: take the linear-SVM coefficient vector from the
# pipeline fitted in each CV fold, average across folds, and rank features by
# the magnitude of that average.
f_importances = [est.named_steps['classifier'].coef_[0] for est in scores['estimator']]
mean_coefs = np.mean(f_importances, axis=0)
features = pd.Series(mean_coefs, index=data.iloc[:, 1:11].columns)

# Plot every feature (the slice above holds fewer than the previous cap of 13).
figure = features.abs().nlargest(len(features)).plot(kind='barh')
figure.set_xlabel('|mean coefficient| across CV folds')
figure.set_ylabel('feature')
figure.set_title('Linear SVM feature importance');

In [None]:

model = SVC(kernel='linear')

In [None]:
# lets do group k fold
group_kfold = GroupKFold(n_splits=6)
for train_index, test_index in group_kfold.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

In [None]:
scores2 = cross_val_score(model, X, y, scoring='accuracy', cv=group_kfold, n_jobs=-1, groups=groups)

In [None]:
print('Accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))

In [None]:
from scipy import stats
for train_index, test_index in group_kfold.split(X, y=y, groups=groups):
    print(np.unique(y[test_index], return_counts=True))

In [None]:
labels=list()
for train_index, test_index in group_kfold.split(X, y=y, groups=groups):
    labels.append(y[test_index])

In [None]:
predictions=list()
for train_index, test_index in group_kfold.split(X, y=y, groups=groups):
    model.fit(X[train_index], y[train_index])
    temp_list = model.predict(X[test_index])
    predictions.append(temp_list)

In [None]:
predictions

In [None]:
test_indexes

In [None]:
accuracy=0
for x,y in zip(predictions,labels):
    for prediction, label in zip(x,y):
        if int(prediction)==int(label):
            accuracy +=1


In [None]:
accuracy

In [None]:
accuracy/len(X)*100

In [None]:
true_positives=0
false_positives=0
false_negatives=0
for x,y in zip(predictions,test_indexes):
    for prediction, label in zip(x,y):
        if int(prediction)==int(label)==1:
            true_positives +=1
            
for x,y in zip(predictions,test_indexes):
    for prediction, label in zip(x,y):
        if int(prediction)==1 and int(label)==0:
            false_positives +=1

for x,y in zip(predictions,test_indexes):
    for prediction, label in zip(x,y):
        if int(prediction)==0 and int(label)==1:
            false_negatives +=1

In [None]:
true_positives

In [None]:
recall = true_positives/true_positives+false_negatives

In [None]:
for train_index, test_index in group_kfold.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
model.predict(X)

In [None]:
from sklearn.metrics import precision_score

In [None]:
model

In [None]:
test_indexes

In [None]:
predictions

In [None]:
# Second pass: rebuild the feature matrix and labels, then standardise the
# features before cross-validating again.
# NOTE(review): this slice is 1:12, while the earlier cells use 1:11 --
# confirm the extra column is a feature and not the label column.
X = data.iloc[:,1:12].to_numpy()
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]]) 

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on all rows; print() shows the fitted estimator's repr.
scaler = StandardScaler()
print(scaler.fit(X))

In [None]:
# Per-column means learned by the fit above.
print(scaler.mean_)

In [None]:
# Refit and transform in one step, rebinding X to the scaled matrix.
# NOTE(review): the scaler is fitted on every row before any train/test
# split, so held-out rows contribute to the scaling statistics; the earlier
# Pipeline-based cells fit the scaler inside each fold instead.
X = scaler.fit_transform(X,y)

In [None]:
# Number of rows in the scaled matrix.
len(X)

In [None]:
import statsmodels.api as sm

In [None]:
from scipy.stats import ttest_ind

In [None]:
from sklearn.utils import shuffle
X_shuffled, y_shuffled, groups_shuffled = shuffle(X, y, groups, random_state=0)
group_k_fold = GroupKFold(n_splits=6)
splits = group_k_fold.split(X_shuffled, y_shuffled, groups_shuffled)

In [None]:
for train_index, test_index in splits:
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    print(X_train, X_test, y_train, y_test)

In [None]:
model = SVC(kernel='linear')


In [None]:
y = np.array([0 if x=='no_rad' else 1 for x in data.iloc[:,-1]]) 

In [None]:
scores2 = cross_val_score(model, X, y, scoring='accuracy', cv=group_k_fold, n_jobs=-1, groups=groups)

In [None]:
print('Accuracy: %.3f (%.3f)' % (mean(scores2), std(scores2)))

In [None]:
for train_index, test_index in group_kfold.split(X_shuffled, y_shuffled, groups_shuffled):
    print(np.unique(y_shuffled[test_index], return_counts=True))

In [None]:
labels=list()
for train_index, test_index in group_kfold.split(X_shuffled, y=y_shuffled, groups=groups):
    labels.append(y[test_index])
labels

In [None]:
predictions=list()
for train_index, test_index in group_kfold.split(X_shuffled, y=y_shuffled, groups=groups):
    model.fit(X[train_index], y[train_index])
    temp_list = model.predict(X[test_index])
    predictions.append(temp_list)
predictions

In [None]:
accuracy=0
for x,y in zip(predictions,labels):
    for prediction, label in zip(x,y):
        if int(prediction)==int(label):
            accuracy +=1
accuracy

In [None]:
accuracy/len(X)*100

In [None]:
# Grouped 6-fold split of the current X / y: print each fold's row indices,
# then the matching feature and label arrays.
group_kfold = GroupKFold(n_splits=6)
for train_index, test_index in group_kfold.split(X, y, groups):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print(X_train, X_test, y_train, y_test)