##### `This file trains the model on host-only dataset. Remember to include the dataset file (in npy) in the current folder and change parameters if needed before running the file.`

Last updated on August 4th, 2022

In [98]:
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import metrics
from sklearn.decomposition import PCA

#import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC

#for plotting
from matplotlib import pyplot as plt

In [99]:
#read in the data
#dataset.form = [id, speaker, label, some features, opensmile, ngram, embedding]
dataset = np.load("hostOnly.npy", allow_pickle = True)
dataset = np.delete(dataset, 3889, axis=1)
dataset = np.delete(dataset, 3894, axis=1)

#save the data into a csv file under the same directory
df = pd.DataFrame(dataset)
# df = df[df[1]=="david"]
# df.to_csv("data.csv")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6701,6702,6703,6704,6705,6706,6707,6708,6709,6710
0,s01e01-1,lee,0,3.300002310599893,71.86671698639998,1,1,0,0,0,...,0.2189566642045974,0.1774016618728637,-0.0945848301053047,-0.024067997932434,0.1494858413934707,-0.1999391764402389,0.1843986362218856,0.3149958252906799,-0.1232866644859314,-0.3925966322422027
1,s01e01-2,david,1,7.944450006999887,68.07782544459997,1,3,1,2,0,...,0.0690841525793075,0.0735771581530571,0.0210561528801918,0.2129583656787872,0.1802792251110077,-0.1111513823270797,0.1278924345970153,0.2297913879156112,-0.0376912206411361,-0.1709290593862533
2,s01e01-3,david,0,4.44444755630002,48.51114507770012,1,2,0,0,0,...,0.1405326575040817,0.2418599873781204,0.0783048346638679,0.210461676120758,0.0510850064456462,0.1438900083303451,0.2422073036432266,0.3750411570072174,0.3033879995346069,-0.2296533435583114
3,s01e01-4,lee,0,5.500011876200006,67.14458943080012,1,4,0,0,0,...,0.2103462517261505,0.1814827620983123,0.0559587553143501,-0.0095192492008209,0.2004977464675903,0.1327437460422516,0.4750293493270874,0.2450427561998367,0.3577592670917511,-0.2574981451034546
4,s01e01-5,david,1,4.922214076599971,137.44421699170002,1,2,0,0,0,...,0.0232509989291429,0.2042246013879776,0.1943531334400177,0.0881553590297699,0.1678855866193771,-0.2370918840169906,0.2678818702697754,0.2375513464212417,-0.023073399439454,-0.2727155089378357


In [100]:
def get_statistics(predict_y, real_y, ifNpArr=False):
    '''
    parameters:
    predict_y: npArr. the prediction array
    real_y: npArr. the actual label
    ifNpArr: the type of the output. If true, return a numpy array.

    return:
    npArr. or dict: the list or dictionary that stores the resulting data

    reference: https://mmuratarat.github.io/2019-10-01/how-to-compute-AUC-plot-ROC-by-hand
    '''
    #lie detection
    #lie is the positive class, truth is negative class
    assert predict_y.shape ==real_y.shape
    tp=0; tn=0; fp=0; fn=0
    for i in range(len(predict_y)):
        #the model predicts it is a lie/positive
        if predict_y[i] == 0:
            if real_y[i] == 0:
                tn += 1
            else:
                fn += 1 
        #the model predicts it is a truth/negative
        else:
            if real_y[i] == 1:
                tp += 1
            else:
                fp += 1

    accuracy = 1-np.count_nonzero(real_y-predict_y)/predict_y.shape[0]
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    specificity = fp/(fp+tn)
    f1 = 2*precision*recall/(precision+recall)
    fpr, tpr, thresholds = metrics.roc_curve(real_y, predict_y)
    auc = metrics.auc(fpr, tpr)

    if ifNpArr:
        return np.array([accuracy, precision, recall, specificity, f1, auc])
    else:
        return {"accuracy":"%.3f" % accuracy, "precision":"%.3f" % precision, "recall":"%.3f" % recall, "specificity":"%.3f" % specificity, "f1":"%.3f" % f1, "auc":"%.3f" % auc}

In [101]:
def train_model_for_individual_speaker(dataframe, speakerName, modelType, n=10, ifPCA=False, pcaNum=50):
    '''
    parameter:
    dataframe: pandasDf. dataset
    speakerName: str. the name of the host. Choose from "angus", "david", "lee", "rob"
    modelType: str. the name of the machine learning model
    n: the number of times to run before calculating percentiles
    ifPCA: bool. if to conduct dimensionality reduction. default false
    pcaNum: int. only effective when ifPCA is True. indicate the number of principal components to keep
    '''

    #choose a model
    if modelType == "LR":
        model = LogisticRegression()
    elif modelType == "RF":
        model = RandomForestClassifier()
    elif modelType == "DT":
        model = DecisionTreeClassifier()
    elif modelType == "NB":
        model = GaussianNB()
    elif modelType == "MLP":
        model = MLPClassifier(hidden_layer_sizes=(1095, 1095, 1095, 1095, 1095, 1095),solver="sgd",learning_rate_init=0.00134)
    elif modelType == "Ada":
        model = AdaBoostClassifier()
    elif modelType == "LSVM":
        model = LinearSVC()

    #process the data
    data = np.array(dataframe[dataframe[1]==speakerName])
    data = data[:,2:].astype(float)

    numTest = round(data.shape[0]*0.2)

    #conduct dimensionality reduction
    if ifPCA:
        #norm the data
        for i in range(1, data.shape[1]):
            if np.std(data[:,i])==0:
                continue
            else:
                data[:,i] = (data[:,i] - np.mean(data[:,i]))/np.std(data[:,i])

        pcaModel = PCA(pcaNum)
        pcaData = pcaModel.fit_transform(data[:,1:])
        data = np.column_stack((data[:,0],pcaData))

    statistics = []

    for i in range(n):
        #split the data
        np.random.shuffle(data)

        test_y = data[:numTest,0]
        test_x = data[:numTest,1:]
        train_y = data[numTest:,0]
        train_x = data[numTest:,1:]
        print(test_y)
        #train models
        model.fit(train_x, train_y)
        predict_y = model.predict(test_x)
        print(predict_y, test_y)
        statistics.append(get_statistics(predict_y, test_y, True))
    
    #make the pandas dataframe
    df = pd.DataFrame(statistics, columns=["accuracy", "precision", "recall", "specificity", "f1", "auc"])
    return df.quantile([0.25, 0.5, 0.75])


In [114]:
train_model_for_individual_speaker(df, "lee", "LR", 5)

[1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.] [1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0.]
[0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0.] [0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.] [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.]
[1. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0.] [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.]
[1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0.]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,accuracy,precision,recall,specificity,f1,auc
0.25,0.571429,0.5,0.25,0.083333,0.25,0.571429
0.5,0.714286,0.5,0.5,0.1,0.333333,0.575
0.75,0.714286,0.5,0.5,0.2,0.5,0.65
