Machine Learning Pipeline used in this project. 

- Using Random Forest Classifier
- Nested Cross-Validation with Leave-One-Out (leave one participant out for testing)
- Testing the algorithm at 3 moments during the video (beginning, middle, end)
- Predicting the type of the video (did the participant rate the video as: neutral, funny or very funny)
- Using permutation test to define the luck threshold
- 7 features:
    - Participant Smiles: AU6, AU12, AU25
    - Videos: Semantic, NGD, Saliency, Movement

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit, LeaveOneGroupOut, RandomizedSearchCV, permutation_test_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

from statsmodels.stats.multicomp import MultiComparison
import scipy.stats as stats

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
"""
Use for (participant x video) for X time-cluster + use funny
"""

def create_data(temps, csv):
    """Build the feature matrix, labels and groups for one time cluster.

    Parameters
    ----------
    temps : sequence
        ``temps[0]`` is the ``video_section`` value whose rows are kept,
        ``temps[1]`` and ``temps[2]`` are column-name prefixes to discard and
        ``temps[3]`` is the index that appears in the retained
        ``T{index}X0Y0_Movement`` / ``T{index}X0Y0_Saliency`` column names.
    csv : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(X, y, groups, index_participant, feature_names)``. ``X`` is the
        value array of the feature columns and still contains the
        ``participantID`` column at position ``index_participant``; ``y`` is
        the ``funny_quantile`` column; ``groups`` is the ``participantID``
        column; ``feature_names`` lists the renamed feature columns without
        ``participantID``.
    """
    section, first_prefix, second_prefix, cluster = temps[0], temps[1], temps[2], temps[3]

    table = pd.read_csv(csv)
    table = table.drop(columns=['Unnamed: 0', 'video'])
    table = table.dropna()

    # Keep the rows of the requested section, then discard every column that
    # belongs to one of the two other time clusters.
    table = table[table['video_section'] == section]
    column_names = table.columns.str
    unwanted = column_names.startswith(first_prefix) | column_names.startswith(second_prefix)
    table = table.loc[:, ~unwanted].dropna()

    predictors = table.drop(columns=['funny_quantile', 'video_section', 'siteweb_type'])
    X = predictors.values
    y = table['funny_quantile'].values

    # participantID stays inside X; its position is returned so the caller
    # can extract and remove it.
    groups = table['participantID'].values
    index_participant = list(predictors).index('participantID')

    # Shorter display names for the feature columns.
    readable_names = {
        f'T{cluster}X0Y0_Movement': 'Movement',
        f'T{cluster}X0Y0_Saliency': 'Saliency',
        'Anomaly': 'Anomalie',
        'Mean_DS_Google': 'Semantic Distance',
    }
    predictors = predictors.rename(columns=readable_names)
    print(list(predictors))

    feature_names = [name for name in predictors if name != 'participantID']

    return X, y, groups, index_participant, feature_names

# ML Pipeline

In [None]:
# Run the whole pipeline once per time cluster and store one result row per
# outer cross-validation fold in `df_result`.
df_result = pd.DataFrame()

# One entry per time cluster, consumed by create_data():
# [video_section value, first column prefix to drop, second column prefix to
#  drop, index used in the kept 'T{index}X0Y0_*' columns, label for the log]
temps_all = [[1, 'T1', 'T2', '0', 'Debut'], [2, 'T0', 'T2', '1', 'Milieu'], [3, 'T0', 'T1', '2', 'Fin']]

csv = './dataset_prediction_funny.csv'
cv = 10          # number of outer GroupShuffleSplit splits
bootstrap = 100  # number of label permutations used by the permutation test
n_iter = 100     # number of hyper-parameter candidates sampled per outer split

# Rows are accumulated in a plain list; DataFrame.append was removed in
# pandas 2.0 and copied the whole frame on every call.
result_rows = []

# Iterate through the time clusters
for time in temps_all:
    print('\n\n**** Computation of Time Cluster :', time[4], '****\n\n')
    X, y, groups, index_participant, feature_names = create_data(time, csv)

    # Step 1: split into 10-90 test-train groups, `cv` times
    gss = GroupShuffleSplit(n_splits=cv, test_size=0.1, random_state=0)
    for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)):
        print('Cross-Validation:', i + 1, 'on', cv)
        # Create the i-th train-test set
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Keep the participant IDs for the leave-one-group-out search below
        train_groups = X_train[:, index_participant]
        test_groups = X_test[:, index_participant]

        # Remove participant IDs from the data so they are not used as a feature
        X_train = np.delete(X_train, index_participant, 1)
        X_test = np.delete(X_test, index_participant, 1)

        print('Participant_Id for validation:', list(set(groups[test_index])))

        # Step 2: test hyperparameters with CV = leave-one-group-out
        logo = LeaveOneGroupOut()
        rfc = RandomForestClassifier()
        param_RFC = {
            'n_estimators': [int(x) for x in np.linspace(start=10, stop=100, num=10)],
            'criterion': ['gini', 'entropy'],
            'max_depth': range(1, 100),
            # Both ranges start at 2: scikit-learn rejects min_samples_split=1
            # and max_leaf_nodes=1, and with warnings silenced those
            # candidates would fail without notice and waste search iterations.
            'min_samples_split': range(2, 100),
            'max_leaf_nodes': range(2, 100),
            # NOTE(review): 'auto' appears to be an alias of 'sqrt' for
            # classifiers and is no longer accepted by recent scikit-learn
            # releases - confirm against the installed version.
            'max_features': ['auto', 'sqrt'],
            'min_samples_leaf': [int(x) for x in np.linspace(start=1, stop=200, num=5)],
            'bootstrap': [True, False],
        }

        clf = RandomizedSearchCV(
            estimator=rfc,
            param_distributions=param_RFC, n_iter=n_iter,
            scoring='accuracy', n_jobs=-1, cv=logo, refit=True,
            return_train_score=True, random_state=0, verbose=3)

        # `groups` is keyword-only in current scikit-learn; passing it
        # positionally raises a TypeError there.
        clf.fit(X_train, y_train, groups=train_groups)

        # Step 3: train the best algorithm on all training data
        print('Best algorithm found. Training algorithm on new data')
        # Train rfc with the best hyperparameters
        best_rfc = RandomForestClassifier(**clf.best_params_)
        best_rfc.fit(X_train, y_train)

        # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
        # (ConfusionMatrixDisplay.from_estimator is its replacement); kept
        # here because the file-level import still targets the old API.
        disp = plot_confusion_matrix(best_rfc, X_test, y_test)
        disp2 = plot_confusion_matrix(best_rfc, X_test, y_test, normalize='true', cmap=plt.cm.Blues)
        plt.show()

        print(classification_report(y_test, best_rfc.predict(X_test)))

        test_accuracy = round(best_rfc.score(X_test, y_test), 3)
        print('Accuracy for training set:', round(best_rfc.score(X_train, y_train), 3))
        print('Accuracy for validation set:', test_accuracy)

        # Importance of each feature in the retrained forest
        features_score = dict(zip(feature_names, best_rfc.feature_importances_))

        # Prepare the new result row. The parameters are copied so the search
        # object's own `best_params_` dict is not modified.
        new_row = dict(clf.best_params_)
        new_row['time'] = time[0]
        new_row['iteration'] = i
        new_row['accuracy'] = test_accuracy

        # Step 4: permutation test
        # NOTE(review): cv=5 is a plain integer, so these inner folds are
        # presumably not split by participant; `groups` then only restricts
        # how labels are shuffled - confirm this is intended.
        print('Trying {} permutations'.format(bootstrap))
        score, permutation_scores, pvalue = permutation_test_score(
            best_rfc, X_train, y_train, groups=train_groups, scoring="accuracy",
            cv=5, n_permutations=bootstrap, n_jobs=-1, verbose=3)

        new_row['permutation_score'] = score
        new_row['permutation_pvalue'] = pvalue
        new_row['permutation_mean'] = permutation_scores.mean()
        new_row['permutation_std'] = permutation_scores.std()

        # Merge the feature importances into the row and store it. The frame
        # is rebuilt after every fold so partial results survive an
        # interrupted run.
        new_row.update(features_score)
        result_rows.append(new_row)
        df_result = pd.DataFrame(result_rows)

        # Plot the permutation-score distribution against the real score
        fig, ax = plt.subplots()
        ax.hist(permutation_scores, bins=20, density=True, color='grey')
        score_label = (f"Score on original\ndata: {score:.2f}\n"
                       f"(p-value: {pvalue:.3f})")
        # The label was previously built but never displayed.
        ax.axvline(score, ls='--', color='r', label=score_label)
        ax.legend()
        plt.show()

# Optional persistence of the results:
#         df_result.to_csv('./results_LOGO/prediction_funny.csv')
df_result

In [None]:
# Show the seven feature importances as rows, ordered by their value in the
# result row labelled 0 (highest first).
df_result.loc[
    :, ['AU6', 'AU12', 'AU25', 'Semantic Distance', 'NGD', 'Movement', 'Saliency']
].transpose().sort_values(by=0, ascending=False)

# Time Analysis 

In [None]:
# import pandas as pd
# import seaborn as sns

# df_result = pd.read_csv('./results_LOGO/prediction_funny.csv')
# df_result = df_result.replace({"time":{1:'Beggining', 2:'Middle', 3:'End'}})
# feature_names  = ['AU12', 'AU25', 'AU6', 'Movement', 'NGD', 'Saliency', 'Semantic Distance']


In [None]:
# Reshape the per-fold results to long format (one row per fold and feature)
# and plot how each feature importance evolves across the time clusters.
# NOTE(review): `df_train` is not defined in the visible cells; presumably it
# comes from another cell or session - confirm before running top to bottom.
feature_names = list(df_train.drop(columns=['funny_quantile','video']))
f = feature_names + ['accuracy', 'time', 'iteration']
df_plot = df_result[f].melt(id_vars=['time', 'iteration', 'accuracy'])

# Tag every feature as a video feature, then mark the three action units as
# smile features.
df_plot['type'] = 'video'
df_plot.loc[df_plot['variable'].isin(['AU6', 'AU12', 'AU25']), 'type'] = 'smile'  # Funny

# `palette` is the argument that chooses the hue colours; the former
# color='dark2' is not a valid matplotlib colour.
sns.relplot(x="time", y="value", kind="line", data=df_plot, hue='variable', ci='sd', palette='Dark2', style='type')


from anova import *
# Reference line for the accuracy plot.
# NOTE(review): 0.33 presumably stands for chance level with three classes;
# the alternative df_result['permutation_max'].max() refers to a column the
# pipeline cell does not create.
perm_value = 0.33
fig, ax = plt.subplots(figsize=(10, 2))
ax.axhline(perm_value, ls='--', color='red')

Anova_Improved(df_result, 'accuracy', group='time', alpha=0.05, ylim=0, graph=True, order=None, color='gray')

# Only the numeric columns are aggregated: 'variable' and 'type' hold strings,
# and recent pandas raises on them instead of silently dropping them.
numeric_plot = df_plot[['time', 'iteration', 'accuracy', 'value']]
print(numeric_plot.groupby('time').mean())
print(numeric_plot.groupby('time').std())

In [None]:
# Column-wise maximum of the results for each time cluster, rounded to three decimals.
df_result.groupby(by='time').max().round(decimals=3)