In [114]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import (
    StandardScaler,
    LinearDiscriminantAnalysis as LDA,
)
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
import pandas as pd
from sklearn.model_selection import train_test_split
from mrmr import mrmr_classif
from sklearn.preprocessing import MinMaxScaler


# def feature_reduction_pca(x_train,x_test, variance: float):

#     pca = PCA(n_components=variance)

#     x_train = pca.fit_transform(x_train)
#     x_test = pca.transform(x_test)

#     return x_train,x_test


# def feature_reduction_lda(x_train,x_test, y_train):
#     """
#     LDA is supervised so we need a test and train split
#     """

#     # LDA
#     lda = LDA(n_components=1)
#     x_train = lda.fit_transform(x_train, y_train)
#     x_test = lda.transform(x_test)

#     return x_train,x_test


def feature_reduction_mrmr(x_train,x_test, y_train, n_components):
    selected_components = mrmr_classif(X=x_train, y=y_train, K=n_components)
    x_train = pd.DataFrame(x_train).loc[:, selected_components]
    x_test = pd.DataFrame(x_test).loc[:, selected_components]
    return x_train,x_test


In [126]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import (
    StandardScaler,
    LinearDiscriminantAnalysis as LDA,
)


CV_SPLIT = 5

"""
GridSearch for parameter optimisation
"""

def get_reducer_variables(reducer_name:str):
    if reducer_name == 'pca':
        return {'reducer':PCA(),'variables':[0.8,0.9,0.95]}
    elif reducer_name == 'lda':
        return {'reducer':LDA(),'variables':[1]}

    
def get_best_param_RF(x_train, y_train,reducer_name):
    if reducer_name != 'mrmr':
        reducer = get_reducer_variables(reducer_name=reducer_name)
        reducer_var = reducer['variables']
        reducer_name = reducer['reducer']
        
        pipe = Pipeline(steps=[('reducer',reducer_name),('rf',RandomForestClassifier())])

        param_grid = {
            "rf__n_estimators": [100, 200, 500],
            "rf__max_features": ["auto", "sqrt", "log2"],
            "rf__max_depth": [4, 5, 6, 7, 8],
            "rf__criterion": ["gini", "entropy"],
            'reducer__n_components':reducer_var
        }
        grid = GridSearchCV(pipe,param_grid=param_grid,refit=True, verbose=0, return_train_score=True, cv=CV_SPLIT)
        grid.fit(x_train, y_train)
        print(grid.best_estimator_.get_params())

        return grid
    else:
        param_grid = {
            "n_estimators": [100, 200, 500],
            "max_features": ["auto", "sqrt", "log2"],
            "max_depth": [4, 5, 6, 7, 8],
            "criterion": ["gini", "entropy"],
        }
        grid = GridSearchCV(
            RandomForestClassifier(),
            param_grid,
            refit=True,
            verbose=0,
            return_train_score=True,
            cv=CV_SPLIT,
        )
        grid.fit(x_train, y_train)
        print(grid.best_estimator_.get_params())
        return grid


def get_best_param_KNN(x_train, y_train,reducer_name):
    if reducer_name != 'mrmr':
        reducer = get_reducer_variables(reducer_name=reducer_name)
        reducer_var = reducer['variables']
        reducer_name = reducer['reducer']
        
        pipe = Pipeline(steps=[('reducer',reducer_name),('knn',KNeighborsClassifier())])

        param_grid = {
           "knn__n_neighbors": range(1, 21, 2),
           "knn__weights": ["uniform", "distance"],
           "knn__metric": ["euclidean", "manhattan", "minkowski"],
           'reducer__n_components':reducer_var
        }
        grid = GridSearchCV(pipe,param_grid=param_grid,refit=True, verbose=0, return_train_score=True, cv=CV_SPLIT)
        grid.fit(x_train, y_train)
        print(grid.best_estimator_.get_params())

        return grid
    else:
        param_grid = {
            "n_neighbors": range(1, 21, 2),
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan", "minkowski"],
        }
        grid = GridSearchCV(
            KNeighborsClassifier(),
            param_grid,
            refit=True,
            verbose=0,
            return_train_score=True,
            cv=CV_SPLIT,
        )
        grid.fit(x_train, y_train)
        print(grid.best_estimator_.get_params())
        return grid


def get_best_param_LR(x_train, y_train,reducer_name):

    if reducer_name != 'mrmr':
        reducer = get_reducer_variables(reducer_name=reducer_name)
        reducer_var = reducer['variables']
        reducer_name = reducer['reducer']
        
        pipe = Pipeline(steps=[('reducer',reducer_name),('lr',LogisticRegression())])

        param_grid = {
            "lr__C": [100, 10, 1.0, 0.1, 0.01],
            "lr__solver": ["newton-cg", "lbfgs", "liblinear"],
            "lr__penalty": ["l1", "l2"],
            'reducer__n_components':reducer_var
        }
        grid = GridSearchCV(pipe,param_grid=param_grid,refit=True, verbose=0, return_train_score=True, cv=CV_SPLIT)
        grid.fit(x_train, y_train)
        print(grid.best_estimator_.get_params())

        return grid
    else:
        param_grid = {
            "C": [100, 10, 1.0, 0.1, 0.01],
            "solver": ["newton-cg", "lbfgs", "liblinear"],
            "penalty": ["l1", "l2"],
        }
        grid = GridSearchCV(
            LogisticRegression(),
            param_grid,
            refit=True,
            verbose=0,
            return_train_score=True,
            cv=CV_SPLIT,
        )
        grid.fit(x_train, y_train)
        print(grid.best_estimator_.get_params())
        return grid


def get_best_param_SVC(x_train, y_train,reducer_name):

    if reducer_name != 'mrmr':
        reducer = get_reducer_variables(reducer_name=reducer_name)
        reducer_var = reducer['variables']
        reducer_name = reducer['reducer']
        
        pipe = Pipeline(steps=[('reducer',reducer_name),('svc',SVC())])

        param_grid = {
            "svc__C": [0.1, 1, 10, 100],
            "svc__gamma": [1, 0.1, 0.01, 0.001],
            "svc__kernel": ["rbf", "poly", "sigmoid"],
            'reducer__n_components':reducer_var
        }
        grid = GridSearchCV(pipe,param_grid=param_grid,refit=True, verbose=0, return_train_score=True, cv=CV_SPLIT)
        grid.fit(x_train, y_train)
        print(grid.best_estimator_.get_params())

        return grid
    else:
        param_grid = {
            "C": [0.1, 1, 10, 100],
            "gamma": [1, 0.1, 0.01, 0.001],
            "kernel": ["rbf", "poly", "sigmoid"]        
        }
        grid = GridSearchCV(
            SVC(), param_grid, refit=True, verbose=0, return_train_score=True, cv=CV_SPLIT
        )
        grid.fit(x_train, y_train)
        print(grid.best_estimator_.get_params())
        return grid


Eval

In [116]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
import numpy as np


def division_function(n, d):
    if d:
        return n / d
    elif n == 0 and d == 0:
        return 0
    else:
        return None


def validate_model(model, X, Y):
    """
    validates the model with a k-fold validation which is iterated
    returns the mean accuracy, specificiy, recall, precision, f1 score and auc score
    """

    splits = 5
    iteration = 10

    acc_list = []
    specificity_list = []
    recall_list = []
    precision_list = []
    f1_list = []

    folds = StratifiedKFold(n_splits=splits)

    # Iterate "interation" times of k-fold
    for i in range(1, iteration):
        # print(f'Iteration {i}/{iteration}')

        acc_total = 0
        specificity_total = 0
        recall_total = 0
        precision_total = 0
        f1_total = 0

        for train_index, test_index in folds.split(X, Y):
            x_train = X.iloc[train_index, :]
            x_test = X.iloc[test_index, :]
            y_train = Y.iloc[train_index, :]
            y_test = Y.iloc[test_index, :]

            # scale
            sc = MinMaxScaler()
            x_train = sc.fit_transform(x_train)
            x_test = sc.transform(x_test)

            # fit model and predict
            model.fit(x_train, np.ravel(y_train))
            y_pred = model.predict(x_test)

            conf_matrix = confusion_matrix(y_test, y_pred)
            TN = conf_matrix[0][0]
            FP = conf_matrix[0][1]
            FN = conf_matrix[1][0]
            TP = conf_matrix[1][1]

            accuracy = (division_function((TP + TN), (TP + TN + FP + FN))) * 100
            recall = division_function(TP, (TP + FN)) * 100  # recall
            specificity = division_function(TN, (TN + FP)) * 100
            precision = division_function(TP, (TP + FP)) * 100
            f1_score = division_function(2 * (recall * precision), (recall + precision))

            # sum it up
            acc_total += accuracy
            recall_total += recall
            specificity_total += specificity
            precision_total += precision
            f1_total += f1_score

        # avg
        accuracy_mean = acc_total / splits
        recall_mean = recall_total / splits
        specificity_mean = specificity_total / splits
        precision_mean = precision_total / splits
        f1_mean = f1_total / splits

        acc_list.append(accuracy_mean)
        recall_list.append(recall_mean)
        specificity_list.append(specificity_mean)
        precision_list.append(precision_mean)
        f1_list.append(f1_mean)

    return (
        np.mean(acc_list),
        np.mean(specificity_list),
        np.mean(recall_list),
        np.mean(precision_list),
        np.mean(f1_list),
    )


def evaluate_model(model, x_train, x_test, y_train, y_test,x_test_index, df):
    # sc = MinMaxScaler()
    # x_train = sc.fit_transform(x_train)
    # x_test = sc.transform(x_test)

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    ids = df.iloc[x_test_index]['id']
    data = {'id':ids,'y_pred':y_pred,'y_test':y_test}
    data_df = pd.DataFrame(data)
    data_df_group = data_df.groupby(by=['id'],sort=False).mean().round()
    y_pred_2 = data_df_group['y_pred']
    y_test_2 = data_df_group['y_test']
    
    conf_matrix = confusion_matrix(y_test_2, y_pred_2)
    TN = conf_matrix[0][0]
    FP = conf_matrix[0][1]
    FN = conf_matrix[1][0]
    TP = conf_matrix[1][1]

    accuracy = (division_function((TP + TN), (TP + TN + FP + FN))) * 100
    recall = division_function(TP, (TP + FN)) * 100  # recall
    specificity = division_function(TN, (TN + FP)) * 100
    precision = division_function(TP, (TP + FP)) * 100
    f1_score = division_function(2 * (recall * precision), (recall + precision))

    return accuracy, recall, specificity, precision, f1_score


In [124]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import os

path = '/Users/athena.kam/Documents/Thesis/codebase/thesis-2023-athena'
os.chdir(path)

"""
Reducing the features, splitting the data into test and train, oversample the training data, train the model and validate and evaluate it
"""


def read_and_split(
    filename: str, isTranscript: bool, reduce: str, random_state: int, chunked: bool
):
    df = pd.read_csv(filename)


    # Remove Personal Questions
    df = df[df["personalQ"] != 1].reset_index(drop=True)

    headers = df.columns
    non_embeddings_headers = []
    for header in headers:
        if header.find("embbedings") < 0:
            non_embeddings_headers.append(header)

    X = df.drop(columns=non_embeddings_headers)
    Y = df["classification"]
    X = MinMaxScaler().fit_transform(X)

    # Train-test split
    x_index = range(len(X))
    x_train_index, x_test_index, y_train, y_test = train_test_split(
        x_index, Y, test_size=0.30, random_state=random_state
    )
    x_train = pd.DataFrame(X).iloc[x_train_index]
    x_test = pd.DataFrame(X).iloc[x_test_index]

    # sc = MinMaxScaler()
    # x_train = sc.fit_transform(x_train)
    # x_test = sc.transform(x_test)
    # Oversample minority group
    sm = SMOTE(random_state=12)
    x_train, y_train = sm.fit_resample(x_train, y_train)

    if reduce == "mrmr":
        x_train,x_test = feature_reduction_mrmr(pd.DataFrame(x_train), pd.DataFrame(x_test), pd.DataFrame(y_train), 20)

    #     # Feature Reduction
    #     if reduce == "pca":
    #         x_train,x_test = feature_reduction_pca(x_train, x_test, 0.9)
    #     elif reduce == "lda":
    #         x_train,x_test = feature_reduction_lda(x_train, x_test, y_train)
    #     elif reduce == "mrmr":
    #         x_train,x_test = feature_reduction_mrmr(pd.DataFrame(x_train), pd.DataFrame(x_test), pd.DataFrame(y_train), 30)

    # else:
    #     if chunked:
    #         df.drop(["voiceID", "label_x"], inplace=True, axis=1)
    #         df.rename(columns={"label_y": "label"}, inplace=True)
    #     else:
    #         df.drop(["voiceID"], inplace=True, axis=1)
    #     df["label"].value_counts()
    #     df = df.dropna()

    #     df_X = df.iloc[:, :-1]
    #     df_Y = df.iloc[:, -1]

    #     x_train, x_test, y_train, y_test = train_test_split(
    #         df_X, df_Y, test_size=0.3, random_state=random_state
    #     )

    #     sc = MinMaxScaler()
    #     x_train = sc.fit_transform(x_train)
    #     x_test = sc.transform(x_test)
    #     pd.DataFrame(x_train)

    return x_train, x_test, y_train, y_test,x_test_index,df


def train_model(model_name: str, grid_search: bool, model_weights, x_train, y_train,reducer):
    if model_name == "svc":
        if grid_search:
            grid = get_best_param_SVC(x_train=x_train, y_train=y_train,reducer_name=reducer)
            model = grid.best_estimator_
        else:
            model = SVC(
                C=model_weights["C"],
                gamma=model_weights["gamma"],
                kernel=model_weights["kernel"],
            )
    elif model_name == "lr":
        if grid_search:
            grid = get_best_param_LR(x_train=x_train, y_train=y_train, reducer_name=reducer)
            model = grid.best_estimator_
        else:
            model = LogisticRegression(
                C=model_weights["C"],
                solver=model_weights["solver"],
                penalty=model_weights["penalty"],
            )
    elif model_name == "knn":
        if grid_search:
            grid = get_best_param_KNN(x_train=x_train, y_train=y_train,reducer_name=reducer)
            model = grid.best_estimator_
        else:
            model = KNeighborsClassifier(
                n_neighbors=model_weights["n_neighbors"],
                weights=model_weights["weights"],
                metric=model_weights["metric"],
            )
    elif model_name == "rf":
        if grid_search:
            grid = get_best_param_RF(x_train=x_train, y_train=y_train,reducer_name=reducer)
            model = grid.best_estimator_
        else:
            model = RandomForestClassifier(
                n_estimators=model_weights["n_estimators"],
                max_features=model_weights["max_features"],
                max_depth=model_weights["max_depth"],
                criterion=model_weights["criterion"],
            )

    return model


def train_test_model(
    filename: str,
    model_name: str,
    chunked: bool = False,
    reduce: str = "mrmr",
    isTranscript: bool = True,
    grid_search: bool = True,
    model_weights: dict = {},
    random_state: int = 0,
):
    x_train, x_test, y_train, y_test,x_test_index,df = read_and_split(
        filename=filename,
        isTranscript=isTranscript,
        reduce=reduce,
        random_state=random_state,
        chunked=chunked,
    )

    # Train ML model
    model = train_model(
        model_name=model_name,
        grid_search=grid_search,
        model_weights=model_weights,
        x_train=x_train,
        y_train=y_train,
        reducer=reduce    )

    # Validate with training data
    accuracy, specificiy, recall, precision, f1_score = validate_model(
        model, pd.DataFrame(x_train), pd.DataFrame(y_train)
    )

    print(
        f"\tAverage Accuracy: {accuracy} \n\
      Average Specificity: {specificiy} \n\
      Average Recall: {recall}\n\
      Average Precision:{precision}\n\
      Average F1 score {f1_score}\n\
      "
    )

    # Test with test data
    accuracy, specificiy, recall, precision, f1_score = evaluate_model(
        model=model, x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test,x_test_index = x_test_index,df=df
    )
    print("___________________")
    print("Evaluate model")
    print(
        f"\tAccuracy: {accuracy} \n\
    Specificity: {specificiy} \n\
    Recall: {recall}\n\
    Precision:{precision}\n\
    F1 score {f1_score}\n\
    "
    )

    return accuracy, specificiy, recall, precision, f1_score,model



Try

In [122]:
accuracy, specificiy, recall, precision, f1_score,model=train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_embeddings_transformed.csv', model_name= 'svc',reduce='lda')

{'memory': None, 'steps': [('reducer', LinearDiscriminantAnalysis(n_components=1)), ('svc', SVC(C=100, gamma=1))], 'verbose': False, 'reducer': LinearDiscriminantAnalysis(n_components=1), 'svc': SVC(C=100, gamma=1), 'reducer__covariance_estimator': None, 'reducer__n_components': 1, 'reducer__priors': None, 'reducer__shrinkage': None, 'reducer__solver': 'svd', 'reducer__store_covariance': False, 'reducer__tol': 0.0001, 'svc__C': 100, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 1, 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': False, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}
	Average Accuracy: 71.33333333333333 
      Average Specificity: 66.66666666666666 
      Average Recall: 80.0
      Average Precision:71.66666666666666
      Average F1 score 73.14285714285714
      
___________________
Evaluate 

In [96]:
train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_embeddings_transformed.csv', model_name= 'svc',reduce='pca')

{'memory': None, 'steps': [('reducer', PCA(n_components=0.8)), ('svc', SVC(C=1, gamma=0.1))], 'verbose': False, 'reducer': PCA(n_components=0.8), 'svc': SVC(C=1, gamma=0.1), 'reducer__copy': True, 'reducer__iterated_power': 'auto', 'reducer__n_components': 0.8, 'reducer__n_oversamples': 10, 'reducer__power_iteration_normalizer': 'auto', 'reducer__random_state': None, 'reducer__svd_solver': 'auto', 'reducer__tol': 0.0, 'reducer__whiten': False, 'svc__C': 1, 'svc__break_ties': False, 'svc__cache_size': 200, 'svc__class_weight': None, 'svc__coef0': 0.0, 'svc__decision_function_shape': 'ovr', 'svc__degree': 3, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__max_iter': -1, 'svc__probability': False, 'svc__random_state': None, 'svc__shrinking': True, 'svc__tol': 0.001, 'svc__verbose': False}
	Average Accuracy: 83.80952380952381 
      Average Specificity: 73.33333333333333 
      Average Recall: 93.33333333333333
      Average Precision:79.33333333333333
      Average F1 score 85.396825396825

(63.63636363636363,
 40.0,
 83.33333333333334,
 66.66666666666666,
 49.99999999999999,
 Pipeline(steps=[('reducer', PCA(n_components=0.8)),
                 ('svc', SVC(C=1, gamma=0.1))]))

In [104]:
train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_embeddings_transformed.csv', model_name= 'lr',reduce='pca')

{'memory': None, 'steps': [('reducer', PCA(n_components=0.8)), ('lr', LogisticRegression(C=100, solver='newton-cg'))], 'verbose': False, 'reducer': PCA(n_components=0.8), 'lr': LogisticRegression(C=100, solver='newton-cg'), 'reducer__copy': True, 'reducer__iterated_power': 'auto', 'reducer__n_components': 0.8, 'reducer__n_oversamples': 10, 'reducer__power_iteration_normalizer': 'auto', 'reducer__random_state': None, 'reducer__svd_solver': 'auto', 'reducer__tol': 0.0, 'reducer__whiten': False, 'lr__C': 100, 'lr__class_weight': None, 'lr__dual': False, 'lr__fit_intercept': True, 'lr__intercept_scaling': 1, 'lr__l1_ratio': None, 'lr__max_iter': 100, 'lr__multi_class': 'auto', 'lr__n_jobs': None, 'lr__penalty': 'l2', 'lr__random_state': None, 'lr__solver': 'newton-cg', 'lr__tol': 0.0001, 'lr__verbose': 0, 'lr__warm_start': False}
	Average Accuracy: 78.0952380952381 
      Average Specificity: 68.33333333333333 
      Average Recall: 88.33333333333333
      Average Precision:73.333333333333

(81.81818181818183,
 80.0,
 83.33333333333334,
 80.0,
 80.0,
 Pipeline(steps=[('reducer', PCA(n_components=0.8)),
                 ('lr', LogisticRegression(C=100, solver='newton-cg'))]))

In [44]:
model = Pipeline(steps=[('reduce', PCA(n_components=0.8)),
                 ('svc', SVC(C=1, gamma=0.1))])

In [53]:
print(model[0])

PCA(n_components=0.8)


In [37]:
train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_embeddings_transformed.csv', model_name= 'svc',reduce='lda')

TypeError: Parameter grid for parameter 'reduce__n_components' needs to be a list or a numpy array, but got 1 (of type int) instead. Single values need to be wrapped in a list with one element.

In [128]:
train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_sentence_embeddings_transformed.csv', model_name= 'svc',reduce='lda')

{'C': 0.1, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	Average Accuracy: 97.49007936507937 
      Average Specificity: 95.0 
      Average Recall: 100.0
      Average Precision:95.29106187929717
      Average F1 score 97.56572469818738
      


array([0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1])

Unnamed: 0,id,y_pred,y_test
366,ID35_hc_0_0_0_noPersonalQuestions.flac,0,0
155,ID11_hc_0_0_0_noPersonalQ.flac,1,0
313,ID28_hc_0_0_0_noPersonalQ.flac,0,0
269,ID22_hc_0_0_0_noPersonalQ.flac,0,0
221,ID15_hc_0_0_0_noPersonalQ.flac,0,0
...,...,...,...
144,ID10_hc_0_0_0_noPersonalQ.flac,0,0
12,ID00_hc_0_0_0.flac,0,0
101,ID07_pd_2_0_0_noPersonalQ.flac,0,1
293,ID25_hc_0_0_0_noPersonalQ.flac,0,0


id
ID35_hc_0_0_0_noPersonalQuestions.flac    1.0
ID11_hc_0_0_0_noPersonalQ.flac            0.0
ID28_hc_0_0_0_noPersonalQ.flac            1.0
ID22_hc_0_0_0_noPersonalQ.flac            0.0
ID15_hc_0_0_0_noPersonalQ.flac            0.0
ID05_hc_0_0_0.flac                        0.0
ID33_pd_3_2_2.flac                        0.0
ID04_pd_2_0_1_noPersonalQ.flac            0.0
ID09_hc_0_0_0.flac                        0.0
ID03_hc_0_0_0_noPersonalQ.flac            0.0
ID23_hc_0_0_0_noPersonalQ.flac            0.0
ID20_pd_3_0_1_noPersonalQ.flac            1.0
ID10_hc_0_0_0_noPersonalQ.flac            0.0
ID16_pd_2_0_0_noPersonalQ.flac            1.0
ID27_pd_4_1_1_noPersonalQ.flac            0.0
ID26_hc_0_0_0_noPersonalQ.flac            0.0
ID29_pd_3_1_2_noPersonalQ.flac            1.0
ID07_pd_2_0_0_noPersonalQ.flac            0.0
ID13_pd_3_2_2.flac                        0.0
ID17_pd_2_1_0.flac                        1.0
ID06_pd_3_1_1.flac                        0.0
ID36_hc_0_0_0_noPersonalQ.flac 

___________________
Evaluate model
	Accuracy: 60.526315789473685 
    Specificity: 50.0 
    Recall: 66.66666666666666
    Precision:46.666666666666664
    F1 score 48.275862068965516
    


(60.526315789473685,
 50.0,
 66.66666666666666,
 46.666666666666664,
 48.275862068965516)

In [131]:
train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_sentence_embeddings_transformed.csv', model_name= 'svc',reduce='lda')

{'C': 0.1, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	Average Accuracy: 97.49007936507937 
      Average Specificity: 95.0 
      Average Recall: 100.0
      Average Precision:95.29106187929717
      Average F1 score 97.56572469818738
      
___________________
Evaluate model
	Accuracy: 68.75 
    Specificity: 41.66666666666667 
    Recall: 85.0
    Precision:62.5
    F1 score 50.0
    


(68.75, 41.66666666666667, 85.0, 62.5, 50.0)

In [31]:
smth = {'reducer':PCA(),'variables':[0.8,0.9,0.95]}

In [32]:
smth['variables']

[0.8, 0.9, 0.95]

In [112]:
train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_embeddings_transformed.csv', model_name= 'knn',reduce='mrmr')

100%|██████████| 30/30 [00:05<00:00,  5.67it/s]


{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'manhattan', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
	Average Accuracy: 96.66666666666667 
      Average Specificity: 93.33333333333333 
      Average Recall: 100.0
      Average Precision:95.0
      Average F1 score 97.14285714285714
      
___________________
Evaluate model
	Accuracy: 63.63636363636363 
    Specificity: 80.0 
    Recall: 50.0
    Precision:57.14285714285714
    F1 score 66.66666666666666
    


(63.63636363636363,
 80.0,
 50.0,
 57.14285714285714,
 66.66666666666666,
 KNeighborsClassifier(metric='manhattan', n_neighbors=3))

In [113]:
train_test_model(filename= 'datasets/transformed/google/spontaneousDialogueOnly_google_bert_embeddings_transformed.csv', model_name= 'rf',reduce='lda')

{'memory': None, 'steps': [('reducer', LinearDiscriminantAnalysis(n_components=1)), ('rf', RandomForestClassifier(criterion='entropy', max_depth=4, max_features='log2'))], 'verbose': False, 'reducer': LinearDiscriminantAnalysis(n_components=1), 'rf': RandomForestClassifier(criterion='entropy', max_depth=4, max_features='log2'), 'reducer__covariance_estimator': None, 'reducer__n_components': 1, 'reducer__priors': None, 'reducer__shrinkage': None, 'reducer__solver': 'svd', 'reducer__store_covariance': False, 'reducer__tol': 0.0001, 'rf__bootstrap': True, 'rf__ccp_alpha': 0.0, 'rf__class_weight': None, 'rf__criterion': 'entropy', 'rf__max_depth': 4, 'rf__max_features': 'log2', 'rf__max_leaf_nodes': None, 'rf__max_samples': None, 'rf__min_impurity_decrease': 0.0, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__min_weight_fraction_leaf': 0.0, 'rf__n_estimators': 100, 'rf__n_jobs': None, 'rf__oob_score': False, 'rf__random_state': None, 'rf__verbose': 0, 'rf__warm_start': False}


(81.81818181818183,
 100.0,
 66.66666666666666,
 71.42857142857143,
 83.33333333333333,
 Pipeline(steps=[('reducer', LinearDiscriminantAnalysis(n_components=1)),
                 ('rf',
                  RandomForestClassifier(criterion='entropy', max_depth=4,
                                         max_features='log2'))]))

In [127]:
train_test_model(filename= 'datasets/transformed/whisper/spontaneousDialogueOnly_whisper_bert_embeddings_transformed.csv', model_name= 'svc',reduce='mrmr')

100%|██████████| 20/20 [00:02<00:00,  7.31it/s]


{'C': 0.1, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'poly', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
	Average Accuracy: 96.66666666666667 
      Average Specificity: 93.33333333333333 
      Average Recall: 100.0
      Average Precision:95.0
      Average F1 score 97.14285714285714
      
___________________
Evaluate model
	Accuracy: 63.63636363636363 
    Specificity: 25.0 
    Recall: 85.71428571428571
    Precision:50.0
    F1 score 33.333333333333336
    


(63.63636363636363,
 25.0,
 85.71428571428571,
 50.0,
 33.333333333333336,
 SVC(C=0.1, gamma=1, kernel='poly'))