## Random Forest Classifier

#### Load the emotions.csv dataset

In [104]:
import pandas as pd
import numpy as np
import time
import datetime
import pickle

file = 'test8.csv'
emotions = pd.read_csv(file, encoding='latin-1')
emotions.columns

Index(['filename', 'background', 'aeroplane', 'bicycle', 'bird', 'boat',
       'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
       'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa',
       'train', 'tv/monitor', 'Red', 'Red instances', 'Yellow',
       'Yellow instances', 'Green', 'Green instances', 'Cyan',
       'Cyan instances', 'Blue', 'Blue instances', 'Magenta',
       'Magenta instances', 'emotion'],
      dtype='object')

#### Remove filename column and change column headers

In [105]:
emotions = emotions.drop('filename', axis=1)
emotions.columns = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
       'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
       'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa',
       'train', 'tv/monitor', 'red', 'n_red', 'yellow',
       'n_yellow', 'green', 'n_green', 'cyan',
       'n_cyan', 'blue', 'n_blue', 'magenta',
       'n_magenta', 'emotion']

#### Show dataframe

In [106]:
emotions = emotions[:30]
# emotions_c = pd.DataFrame()
emotions_c = emotions.copy()
emotions_c['emotion'] = ""
emotions

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,0.002906,1,0.000258,3,0.0,0,2.6e-05,2,aniticipation
1,0.999616,0.0,0.0,0.000384,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.003154,1,4.7e-05,1,0.000115,2,0.093388,7,ambiguous
2,0.995435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,0.523868,7,0.0507,10,0.0,0,0.001069,6,sadness
3,0.99773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.00576,3,0.003157,1,0.001407,5,0.030759,7,ambiguous
4,0.996623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,0.224839,5,0.305411,5,0.008561,4,0.031441,3,sadness
5,0.730527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003894,0.0,...,4,0.003192,1,0.0,0,0.004544,3,0.009315,4,anticipation
6,0.971356,0.0,0.0,0.000762,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0.053292,4,0.098694,7,2.1e-05,1,0.0078,6,sadness
7,0.980808,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0,...,1,0.17766,5,0.000494,1,0.027814,5,0.124649,5,ambiguous
8,0.861151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,ambiguous
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0.0,0,5.1e-05,1,ambiguous


#### Drop rows with NaN values and segregate initial dataframes with labeled and unlabeled data

In [107]:
emotions_labeled = emotions.dropna()

# fix wrong spelling
emotions_labeled = emotions_labeled.replace(["aniticipation"], "anticipation")
emotions_labeled.at[0, 'emotion'] = "joy"
emotions_labeled.at[1, 'emotion'] = "joy"
emotions_labeled.at[3, 'emotion'] = "anger"
emotions_labeled.at[5, 'emotion'] = "disgust"
emotions_labeled.at[6, 'emotion'] = "surprise"
emotions_labeled.at[8, 'emotion'] = "trust"
emotions_labeled.at[9, 'emotion'] = "trust"
emotions_labeled.at[13, 'emotion'] = "disgust"
emotions_labeled

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,0.002906,1,0.000258,3,0.0,0,2.6e-05,2,joy
1,0.999616,0.0,0.0,0.000384,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.003154,1,4.7e-05,1,0.000115,2,0.093388,7,joy
2,0.995435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,0.523868,7,0.0507,10,0.0,0,0.001069,6,sadness
3,0.99773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.00576,3,0.003157,1,0.001407,5,0.030759,7,anger
4,0.996623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,0.224839,5,0.305411,5,0.008561,4,0.031441,3,sadness
5,0.730527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003894,0.0,...,4,0.003192,1,0.0,0,0.004544,3,0.009315,4,disgust
6,0.971356,0.0,0.0,0.000762,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0.053292,4,0.098694,7,2.1e-05,1,0.0078,6,surprise
7,0.980808,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0,...,1,0.17766,5,0.000494,1,0.027814,5,0.124649,5,ambiguous
8,0.861151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,trust
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0.0,0,5.1e-05,1,trust


In [108]:
# replace NaN values with empty string
# reference: https://stackoverflow.com/questions/13851535/how-to-delete-rows-from-a-pandas-dataframe-based-on-a-conditional-expression
emotions_unlabeled = emotions.replace([np.nan], "")
emotions_unlabeled = emotions_unlabeled.drop(emotions_unlabeled[emotions_unlabeled['emotion'] != ""].index)

# adjust indices for the unlabeled set
emotions_unlabeled.index = range(len(emotions_unlabeled.index))
emotions_unlabeled

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,0.999563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.10385,7,0.439431,8,0.047956,2,0.138944,7,
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,0.095642,5,0.334543,10,0.0,0,0.089827,7,
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.000237,2,0.0,0,0.09079,8,0.250242,6,
3,0.990694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.000176,1,0.0,0,0.000106,2,0.337888,7,
4,0.967867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003141,0.0,...,1,0.04421,6,0.330394,7,0.000532,2,0.01733,7,
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.021197,7,0.116276,5,0.008519,3,0.101481,7,
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.000424,2,0.0,0,0.992504,12,0.000842,2,
7,0.993159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.060129,4,0.254735,4,2.9e-05,1,0.0025,7,
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.001809,5,0.0,0,0.073804,7,0.186349,7,
9,0.995331,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.028024,5,0.003226,3,0.086159,3,0.036296,7,


#### Set data X and target y for the labeled data and the unlabeled data

In [109]:
X_labeled, y_labeled = emotions_labeled.drop('emotion', axis=1), emotions_labeled['emotion']
X_unlabeled, y_unlabeled = emotions_unlabeled.drop('emotion', axis=1), emotions_unlabeled['emotion']
y_labeled

0              joy
1              joy
2          sadness
3            anger
4          sadness
5          disgust
6         surprise
7        ambiguous
8            trust
9            trust
10    anticipation
11       ambiguous
12            fear
13         disgust
14            fear
Name: emotion, dtype: object

#### Split the dataset into training and test sets

In [110]:
from sklearn.model_selection import train_test_split
X_train_labeled, X_test_labeled, y_train_labeled, y_test_labeled = train_test_split(X_labeled, y_labeled, random_state=0)

#### Whole process of constructing labeled data from predicted labels (Semi-supervised learning)

In [111]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from operator import itemgetter
from sklearn.model_selection import GridSearchCV
# only use first default hyperparameters for experimenting only

# set hyperparameters of classifier
param_grid = {'n_estimators' : [100],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [1, 2],
              'min_samples_leaf' : [1, 2, 3],
              'max_features' : ["auto", "sqrt", "log2", 0.9, 0.2],
              'oob_score' : [True],
              'n_jobs' : [-1],
              'random_state' : [42]}

# rfc = GridSearchCV(RandomForestClassifier(), param_grid, cv=10)
# rfc = RandomForestClassifier(n_estimators=500,
#                              criterion='entropy',
#                              max_leaf_nodes=16,
#                              n_jobs=-1,
#                              random_state=0)
elapsed_time_list = []
elapsed_time_iter_list = []
best_score_list = []
best_cross_val_score = []
oob_score_list = []
iteration_counter = 0
start_time = datetime.datetime.now()
print("---------- Start Time - {:s} ----------".format(str(start_time)))

# loop if not all target values have emotions
# loop until everything is labeled
while(emotions['emotion'].isnull().values.any()):
    start_time_iter = datetime.datetime.now()
    # incrementing iteration_counter
    iteration_counter += 1
        
    # grid search for random forest with 2 standard cross-validation
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=2)
    grid_search.fit(X_train_labeled, y_train_labeled)
    best_params = grid_search.best_params_
    
    # set best base parameters
    rfc = RandomForestClassifier()
    rfc.set_params(**best_params)
    
    # train the classifier
    rfc.fit(X_train_labeled, y_train_labeled)
    
    # save model per iteration
    with open('ssl_rf.cpickle', 'wb') as file:
        pickle.dump(rfc, file)

    # gather class probabilities for each instance prediction
    y_pred_rfc = rfc.predict_proba(X_unlabeled)
    
    # show performance score per run
    y_pred_rfc_labeled = rfc.predict(X_test_labeled)
    # print("Accuracy score {}: {}".format(iteration_counter, accuracy_score(y_test_labeled, y_pred_rfc_labeled)))
    print("Test set score: {:.2f}".format(grid_search.score(X_test_labeled, y_test_labeled)))
    print("Best parameters: {}".format(grid_search.best_params_))
    print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
    print("OOB Score: {:.2f}".format(rfc.oob_score_))
    
    # store scores in list
    best_score_list.append(grid_search.score(X_test_labeled, y_test_labeled))
    best_cross_val_score.append(grid_search.best_score_)

    # get highest class probability and pass it to a list
    probas = list(map((lambda x: x.max()), y_pred_rfc))
    yprfc = list(y_pred_rfc)

    # define threshold parameter
    threshold = 0.05

    # tuple of predicted X_instances, y_instances, and score for each instance prediction
    probas_indices = list(map((lambda x: (X_unlabeled[x['index']:x['index']+1], rfc.classes_[list(x['probas']).index(max(list(x['probas'])))], max(list(x['probas'])))), [{'index' : i, 'probas' : list(p)} for i, p in enumerate(y_pred_rfc)]))

    # sort tuple according to its score value
    sorted_probas_indices = sorted(probas_indices, key=itemgetter(2), reverse=True)

    # get top emotions from the tuple based on its threshold value
    # print("Sorted probas indices: {}".format(len(sorted_probas_indices)))
    slice_quantity = int(len(sorted_probas_indices)*threshold)
    sorted_probas_indices_threshold = sorted_probas_indices[:1 if slice_quantity < 1 else slice_quantity]

    # serpate values for X and y instances of top emotions
    # len(sorted_probas_indices_threshold)
    topy_emotions = list(map((lambda x: x[1]), sorted_probas_indices_threshold))
    # print(y_pred_rfc)
    # print(sorted_probas_indices_threshold)
    print(topy_emotions)
    topx_emotions_list = sorted_probas_indices_threshold

    # instantiate new DataFrame for accumulating all X instances
    topx_emotions = pd.DataFrame()

    # gather all X predicted instances
    for x in topx_emotions_list:
        topx_emotions = topx_emotions.append(x[0], sort=False)
    
    # adjust indices for topx_emotions
    # topx_emotions.index = range(len(topx_emotions.index))
    
    # remove topx_emotions from the unlabeled data
    # print(list(X_unlabeled.index))
    emotions_unlabeled = X_unlabeled.drop(X_unlabeled.index[list(topx_emotions.index)])

    # add again the target column of the universal set of unlabeled data
    emotions_unlabeled['emotion'] = None

    # set proper indices for the unlabeled set
    # emotions_unlabeled.index = range(len(emotions_labeled.index), len(emotions_labeled.index) + len(emotions_unlabeled.index))
    emotions_unlabeled.index = range(len(emotions_unlabeled.index))
    
    # add target column from the newly instantiated DataFrame along with its instances
    topx_emotions['emotion'] = topy_emotions
    top_emotions = topx_emotions

    # add the predicted instances DataFrame to the universal set of labeled data
    emotions_labeled = pd.concat([emotions_labeled, top_emotions], axis=0, sort=False)

    # fix previous indices to its current position in DataFrame
    emotions_labeled.index = range(len(emotions_labeled.index))

    # combine universal labeled and unlabeled sets into one
    emotions = pd.concat([emotions_labeled, emotions_unlabeled], axis=0, sort=False)

    # adjust indices for universal emotions set
    emotions.index = range(len(emotions.index))
    
    # Set data X and target y for the labeled data and the unlabeled data
    X_labeled, y_labeled = emotions_labeled.drop('emotion', axis=1), emotions_labeled['emotion']
    X_unlabeled, y_unlabeled = emotions_unlabeled.drop('emotion', axis=1), emotions_unlabeled['emotion']
    X_train_labeled, X_test_labeled, y_train_labeled, y_test_labeled = train_test_split(X_labeled, y_labeled, random_state=0)
    # print("Len of emotions_labeled: {}".format(len(emotions_labeled)))
    # print("Len of emotions_unlabeled: {}".format(len(emotions_unlabeled)))
    # print("Len of emotions: {}".format(len(emotions)))
    # print("Len of x_labeled:{}".format(len(X_labeled)))
    # print("Len of y_labeled:{}".format(len(y_labeled)))
    # print("Len of x_unlabeled:{}".format(len(X_unlabeled)))
    # print("Len of y_unlabeled:{}".format(len(y_unlabeled)))
    end_time_iter = datetime.datetime.now()
    elapsed_time_iter = end_time_iter - start_time_iter
    elapsed_time_iter_list.append(elapsed_time_iter)
    print("Accuracy {} elapsed time: {}".format(iteration_counter, str(elapsed_time_iter)))
    
    # save DataFrame per iteration
    emotions.to_csv("ssl_emotions.csv", encoding='utf-8')
    

end_time = datetime.datetime.now()
elapsed_time = end_time - start_time
elapsed_time_list.append(elapsed_time)
print("---------- End Time - {:s} ----------".format(str(start_time)))
print("Elapsed time: {}".format(elapsed_time))

# save final DataFrame
emotions.to_csv("ssl_emotions.csv", encoding='utf-8')

# show DataFrame
emotions

---------- Start Time - 2018-09-17 18:21:01.351089 ----------




Test set score: 0.00
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.27
OOB Score: 0.00
['sadness']
Accuracy 1 elapsed time: 0:00:55.626338




Test set score: 0.00
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 0.9, 'min_samples_leaf': 2, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.33
OOB Score: 0.17
['sadness']
Accuracy 2 elapsed time: 0:00:55.978528




Test set score: 0.00
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.33
OOB Score: 0.33
['sadness']
Accuracy 3 elapsed time: 0:00:56.123790




Test set score: 0.00
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.38
OOB Score: 0.38
['sadness']
Accuracy 4 elapsed time: 0:00:56.537231




Test set score: 0.20
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'min_samples_leaf': 2, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.36
OOB Score: 0.36
['sadness']
Accuracy 5 elapsed time: 0:00:56.048620




Test set score: 0.40
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.33
OOB Score: 0.33
['sadness']
Accuracy 6 elapsed time: 0:00:55.964892




Test set score: 0.17
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.47
OOB Score: 0.47
['sadness']
Accuracy 7 elapsed time: 0:00:55.949819




Test set score: 0.33
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.56
OOB Score: 0.56
['sadness']
Accuracy 8 elapsed time: 0:00:56.042786




Test set score: 0.33
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 0.9, 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.59
OOB Score: 0.47
['sadness']
Accuracy 9 elapsed time: 0:00:57.033649




Test set score: 0.33
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.61
OOB Score: 0.56
['sadness']
Accuracy 10 elapsed time: 0:00:57.340627




Test set score: 0.71
Best parameters: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.50
OOB Score: 0.39
['sadness']
Accuracy 11 elapsed time: 0:00:57.839806




Test set score: 0.57
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 0.9, 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.63
OOB Score: 0.53
['sadness']
Accuracy 12 elapsed time: 0:00:56.539346




Test set score: 0.57
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.60
OOB Score: 0.60
['sadness']
Accuracy 13 elapsed time: 0:00:57.910388




Test set score: 0.57
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 0.9, 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.67
OOB Score: 0.52
['sadness']
Accuracy 14 elapsed time: 0:00:56.476210




Test set score: 0.75
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 0.2, 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.57
OOB Score: 0.48
['sadness']
Accuracy 15 elapsed time: 0:00:57.595443
---------- End Time - 2018-09-17 18:21:01.351089 ----------
Elapsed time: 0:14:09.116487


Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,0.002906,1,0.000258,3,0.0,0,2.6e-05,2,joy
1,0.999616,0.0,0.0,0.000384,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.003154,1,4.7e-05,1,0.000115,2,0.093388,7,joy
2,0.995435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,0.523868,7,0.0507,10,0.0,0,0.001069,6,sadness
3,0.99773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.00576,3,0.003157,1,0.001407,5,0.030759,7,anger
4,0.996623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,0.224839,5,0.305411,5,0.008561,4,0.031441,3,sadness
5,0.730527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003894,0.0,...,4,0.003192,1,0.0,0,0.004544,3,0.009315,4,disgust
6,0.971356,0.0,0.0,0.000762,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0.053292,4,0.098694,7,2.1e-05,1,0.0078,6,surprise
7,0.980808,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0,...,1,0.17766,5,0.000494,1,0.027814,5,0.124649,5,ambiguous
8,0.861151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,trust
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0.0,0,5.1e-05,1,trust


#### Check number of emotions in dataframe

In [112]:
emotions_list = ["anger", "anticipation", "sadness", "surprise", "fear", "disgust", "joy", "trust"]
for x in emotions_list:
    print("Number of {}: {}".format(x, len(emotions.loc[emotions['emotion'] == x])))
# rfc.feature_importances_

Number of anger: 1
Number of anticipation: 1
Number of sadness: 17
Number of surprise: 1
Number of fear: 2
Number of disgust: 2
Number of joy: 2
Number of trust: 2


#### Comparison check if all newly made labeled set is the same with the base labeled set

In [113]:
emotions_o = emotions.copy()
emotions_o['emotion'] = "" 
df = pd.concat([emotions_o, emotions_c])
df = df.reset_index(drop=True)
df_gpby = df.groupby(list(df.columns))
idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
df.reindex(idx)

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion


In [114]:
# y_pred_rfc_labeled = rfc.predict(X_labeled)
# type(y_pred_rfc_labeled)
# y_pred_rfc_labeled
# accuracy_score(np.array(y_labeled), y_pred_rfc_labeled)

In [115]:
# y_test = y_labeled.copy()
# y_test[0] = np.nan
# for x in range(len())

#### Define data X and target y for emotions data

In [117]:
X = emotions.drop('emotion', axis=1)
y = emotions['emotion']

#### Split the dataset into training and testing set

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

#### Build final model from labeled data and show essential scores to evaluate the model

In [127]:
final_param_grid = {'n_estimators' : [100, 200, 500],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [1, 2],
              'min_samples_leaf' : [1, 2, 3],
              'max_features' : ["auto", "sqrt", "log2", 0.9, 0.2],
              'oob_score' : [True],
              'n_jobs' : [-1],
              'random_state' : [42]}

final_grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=2)
final_grid_search.fit(X_train, y_train)
final_best_params = final_grid_search.best_params_
final_rfc = RandomForestClassifier()
final_rfc.set_params(**final_best_params)
final_rfc.fit(X_train, y_train)
print("Test set score: {:.2f}".format(final_grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(final_grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(final_grid_search.best_score_))

# save final model
with open('final_ssl_rf.cpickle', 'wb') as file:
    pickle.dump(rfc, file)



Test set score: 0.62
Best parameters: {'criterion': 'entropy', 'max_depth': 2, 'max_features': 0.2, 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.68


#### 10 Standard cross-fold validation

In [128]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(final_rfc, X, y, cv=10)
print("10 standard cross-fold validation mean score: {}".format(np.mean(scores)))



10 standard cross-fold validation mean score: 0.8585714285714285


#### ROC Curve

In [133]:
# import matplotlib.pyplot as plt 
# from sklearn.metrics import roc_curve
# from sklearn.model_selection import cross_val_predict
# y_probas_forest = cross_val_predict(final_rfc, X_train, y_train, cv=10, method="predict_proba")
# y_train_2 = (y == "sadness")
# y_train_2 = cross_val_predict(final_rfc, X_train, y_train_2, cv=10, method="predict_proba")
# y_scores_forest = y_probas_forest[:, 1]
# fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_2, y_scores_forest)

# def plot_roc_curve(fpr, tpr, label=None):
#     plt.plot(fpr, tpr, linewidth=2, label=label)
#     plt.plot([0,1],[0,1],'k--')
#     plt.axis([0,1,0,1])
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')

# plot_roc_curve(fpr_forest, tpr_forest)
# plt.show()



ValueError: Found input variables with inconsistent numbers of samples: [22, 30]

In [134]:
emotions_unlabeled

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion


#### Do the 10 cross fold validation

In [116]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(rfc, X, y, cv=10)