## Random Forest Classifier

#### Load the emotions.csv dataset

In [1]:
import pandas as pd
import numpy as np

file = 'test8.csv'
emotions = pd.read_csv(file, encoding='latin-1')
emotions.columns

Index(['filename', 'background', 'aeroplane', 'bicycle', 'bird', 'boat',
       'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
       'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa',
       'train', 'tv/monitor', 'Red', 'Red instances', 'Yellow',
       'Yellow instances', 'Green', 'Green instances', 'Cyan',
       'Cyan instances', 'Blue', 'Blue instances', 'Magenta',
       'Magenta instances', 'emotion'],
      dtype='object')

#### Remove filename column and change column headers

In [2]:
emotions = emotions.drop('filename', axis=1)
emotions.columns = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
       'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
       'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa',
       'train', 'tv/monitor', 'red', 'n_red', 'yellow',
       'n_yellow', 'green', 'n_green', 'cyan',
       'n_cyan', 'blue', 'n_blue', 'magenta',
       'n_magenta', 'emotion']

#### Show dataframe

In [3]:
emotions

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,8,0.002906,1,0.000258,3,0.000000,0,0.000026,2,aniticipation
1,0.999616,0.0,0.0,0.000384,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.003154,1,0.000047,1,0.000115,2,0.093388,7,ambiguous
2,0.995435,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,4,0.523868,7,0.050700,10,0.000000,0,0.001069,6,sadness
3,0.997730,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.005760,3,0.003157,1,0.001407,5,0.030759,7,ambiguous
4,0.996623,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,3,0.224839,5,0.305411,5,0.008561,4,0.031441,3,sadness
5,0.730527,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.003894,0.000000,...,4,0.003192,1,0.000000,0,0.004544,3,0.009315,4,anticipation
6,0.971356,0.0,0.0,0.000762,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,2,0.053292,4,0.098694,7,0.000021,1,0.007800,6,sadness
7,0.980808,0.0,0.0,0.000000,0.016949,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.177660,5,0.000494,1,0.027814,5,0.124649,5,ambiguous
8,0.861151,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,ambiguous
9,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.000000,0,0.000051,1,ambiguous


#### Drop rows with NaN values and segregate initial dataframes with labeled and unlabeled data

In [4]:
emotions_labeled = emotions.dropna()

# fix wrong spelling
emotions_labeled = emotions_labeled.replace(["aniticipation"], "anticipation")
emotions_labeled

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8,0.002906,1,0.000258,3,0.0,0,2.6e-05,2,anticipation
1,0.999616,0.0,0.0,0.000384,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0.003154,1,4.7e-05,1,0.000115,2,0.093388,7,ambiguous
2,0.995435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4,0.523868,7,0.0507,10,0.0,0,0.001069,6,sadness
3,0.99773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.00576,3,0.003157,1,0.001407,5,0.030759,7,ambiguous
4,0.996623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3,0.224839,5,0.305411,5,0.008561,4,0.031441,3,sadness
5,0.730527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003894,0.0,...,4,0.003192,1,0.0,0,0.004544,3,0.009315,4,anticipation
6,0.971356,0.0,0.0,0.000762,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0.053292,4,0.098694,7,2.1e-05,1,0.0078,6,sadness
7,0.980808,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0,...,1,0.17766,5,0.000494,1,0.027814,5,0.124649,5,ambiguous
8,0.861151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,ambiguous
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0,0.0,0,0.0,0,5.1e-05,1,ambiguous


In [5]:
# replace NaN values with empty string
# reference: https://stackoverflow.com/questions/13851535/how-to-delete-rows-from-a-pandas-dataframe-based-on-a-conditional-expression
emotions_unlabeled = emotions.replace([np.nan], "")
emotions_unlabeled.drop(emotions_unlabeled[emotions_unlabeled['emotion'] != ""].index)

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
15,0.999563,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.103850,7,0.439431,8,0.047956,2,0.138944,7,
16,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,4,0.095642,5,0.334543,10,0.000000,0,0.089827,7,
17,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000237,2,0.000000,0,0.090790,8,0.250242,6,
18,0.990694,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000176,1,0.000000,0,0.000106,2,0.337888,7,
19,0.967867,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.003141,0.000000,...,1,0.044210,6,0.330394,7,0.000532,2,0.017330,7,
20,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.021197,7,0.116276,5,0.008519,3,0.101481,7,
21,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.000424,2,0.000000,0,0.992504,12,0.000842,2,
22,0.993159,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.060129,4,0.254735,4,0.000029,1,0.002500,7,
23,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.001809,5,0.000000,0,0.073804,7,0.186349,7,
24,0.995331,0.0,0.0,0.000002,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.028024,5,0.003226,3,0.086159,3,0.036296,7,


#### Set data X and target y for the labeled data and the unlabeled data

In [6]:
X_labeled, y_labeled = emotions_labeled.drop('emotion', axis=1), emotions_labeled['emotion']
X_unlabeled, y_unlabeled = emotions_unlabeled.drop('emotion', axis=1), emotions_unlabeled['emotion']

#### Split the dataset into training and test sets

In [7]:
from sklearn.model_selection import train_test_split
X_train_labeled, X_test_labeled, y_train_labeled, y_test_labeled = train_test_split(X_labeled, y_labeled, random_state=0)

#### Import, build, and train the Random Forest Classifier (Semi-supervised Learning)

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from operator import itemgetter
# only use first default hyperparameters for experimenting only
rfc = RandomForestClassifier(n_estimators=500,
                             max_leaf_nodes=16,
                             n_jobs=-1)
rfc.fit(X_labeled, y_labeled)
y_pred_rfc = rfc.predict_proba(X_unlabeled)
probas = list(map((lambda x: x.max()), y_pred_rfc))
probas_indices = list(map((lambda x: (rfc.classes_[list(x).index(max(list(x)))], max(list(x)))), y_pred_rfc))
sorted_probas_indices = sorted(probas_indices, key=itemgetter(1), reverse=True)
threshold = 0.05
sorted_probas_indices_threshold = sorted_probas_indices[:int(len(sorted_probas_indices)*threshold)]
# sorted_probas = sorted(probas, key=float, reverse=True)
# sorted_probas_threshold = sorted_probas[:int(len(sorted_probas)*0.05)]
top_emotions = list(map((lambda x: x[0]), sorted_probas_indices_threshold))
print(probas_indices)

[('anticipation', 0.684), ('ambiguous', 0.784), ('sadness', 0.804), ('ambiguous', 0.792), ('sadness', 0.776), ('anticipation', 0.71), ('sadness', 0.77), ('ambiguous', 0.726), ('ambiguous', 0.846), ('ambiguous', 0.85), ('anticipation', 0.65), ('ambiguous', 0.68), ('fear', 0.654), ('sadness', 0.76), ('fear', 0.656), ('sadness', 0.702), ('sadness', 0.708), ('fear', 0.466), ('ambiguous', 0.522), ('sadness', 0.684), ('sadness', 0.474), ('ambiguous', 0.472), ('sadness', 0.488), ('fear', 0.416), ('ambiguous', 0.462), ('sadness', 0.61), ('sadness', 0.634), ('ambiguous', 0.464), ('ambiguous', 0.498), ('ambiguous', 0.41), ('ambiguous', 0.596), ('sadness', 0.452), ('ambiguous', 0.606), ('ambiguous', 0.406), ('ambiguous', 0.46), ('ambiguous', 0.618), ('sadness', 0.638), ('ambiguous', 0.386), ('ambiguous', 0.422), ('ambiguous', 0.668), ('ambiguous', 0.426), ('sadness', 0.676), ('ambiguous', 0.602), ('ambiguous', 0.408), ('sadness', 0.74), ('ambiguous', 0.418), ('fear', 0.408), ('sadness', 0.736), (

#### Do the 10 cross fold validation

In [None]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(rfc, X, y, cv=10)