## Random Forest Classifier

#### Load the emotions.csv dataset

In [79]:
import pandas as pd
import numpy as np
import time
import datetime
import pickle
from os.path import isfile, join
import ast

file = 'test15.csv'
resume_flag = False
if isfile('ssl_emotions.csv'):
    file = 'ssl_emotions.csv'
    resume_flag = True
emotions = pd.read_csv(file, encoding='latin-1')
emotions.columns

Index(['filename', 'background', 'aeroplane', 'bicycle', 'bird', 'boat',
       'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
       'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa',
       'train', 'tv/monitor', 'Red', 'n_red', 'Yellow', 'n_yellow', 'Green',
       'n_green', 'Cyan', 'n_cyan', 'Blue', 'n_blue', 'Magenta', 'n_magenta',
       'emotion'],
      dtype='object')

### Export log functions

In [80]:
PRINT_SECONDS = 0
session_count = 0

def trackSession():
    global session_count
    if session_count == 0:
        session_count += 1
        return True
    else:
        return False
    return False

def setSession(file, flag=True):
    now = datetime.datetime.now()
    if flag:
        file.write("\n-------------------- SESSION - {} -------------------------\n".format(now.strftime("%Y-%m-%d %H:%M")))
    else:
        file.write("-------------------- SESSION - {} -------------------------\n".format(now.strftime("%Y-%m-%d %H:%M")))

def endSession(flag=True):
    now = datetime.datetime.now()
    if flag:
        return "\n-------------------- END SESSION - {} -------------------------\n".format(now.strftime("%Y-%m-%d %H:%M"))
    else:
        return "-------------------- END SESSION - {} -------------------------\n".format(now.strftime("%Y-%m-%d %H:%M"))
    return ""

def delayPrint(string, seconds, f="clf_logs.log"): # n seconds delay printing
    time.sleep(seconds)
    exportLogs(string, f)
    print(string)

def exportLogs(logs, f="clf_logs.log"):
    logs += "\n"
    if(isfile(f)):
        file = open(f, "a")
        if trackSession():
            setSession(file)
        file.write(logs)
        file.close()
    else:
        print("Log file does not exist!")
        print("Creating {} file...".format(f))
        file = open(f, "a+")
        if trackSession():
            setSession(file)
        file.write(logs)
        file.close()

### Save data functions

In [None]:
def initTimeData(time_labels, time_file="time.txt"):
    delayPrint("Checking {} file if empty...".format(time_file), PRINT_SECONDS)
    with open(f, "r+") as file:
        num_lines = sum(1 for line in file)
        if num_lines == 0:
            delayPrint("Initializing {} file...".format(time_file), PRINT_SECONDS)
            for x in time_labels:
                file.write("{}: []\n".format(x))
            delayPrint("Done initializing {}".format(time_file), PRINT_SECONDS)
        else:
            delayPrint("{} not empty...".format(time_file), PRINT_SECONDS)

# load time.txt file for resuming purposes
def getTimeData(time_file="time.txt")
    time_labels = []
    time_data = []
    with open(time_file, "r+") as file:
        delayPrint("Reading {} file...".format(time_file), PRINT_SECONDS)
        data = file.readlines()
        num_lines = len(data)
        for x in range(num_lines):
            dt = list(filter(None, re.split("(:\W)", data[x].rstrip()))) # filter removes blank strings in data[x] list
            time_label = dt[0]
            time_dt = ast.literal_eval(dt[len(dt) - 1])
            time_labels.append(time_label)
            time_data.append(time_dt)
            delayPrint("Fetched {} list...".format(label), PRINT_SECONDS)
    return time_labels, time_data

def saveTimeData(time_labels, time_data, time_file="time.txt"):
    with open(f, "w+") as file:
        print("Time Labels: {} Time Data: {}".format(time_labels, time_data))
        if len(time_labels) == len(time_data):
            for x in range(len(time_labels)):
                delayPrint("Saving time data...", PRINT_SECONDS)
                file.write("{} : {}\n".format(time_labels[x], time_data[x]))
            delayPrint("Saved time data...", PRINT_SECONDS)

def initScores(score_labels, score_file="scores.txt"):
    delayPrint("Checking {} file if empty...".format(score_file), PRINT_SECONDS)
    with open(f, "r+") as file:
        num_lines = sum(1 for line in file)
        if num_lines == 0:
            delayPrint("Initializing {} file...".format(score_file), PRINT_SECONDS)
            for x in score_labels:
                file.write("{}: []\n".format(x))
            delayPrint("Done initializing {}".format(score_file), PRINT_SECONDS)
        else:
            delayPrint("{} not empty...".format(score_file), PRINT_SECONDS)

# load scores.txt file for resuming purposes
def getScores(score_file="scores.txt")
    score_labels = []
    scores = []
    with open(score_file, "r+") as file:
        delayPrint("Reading {} file...".format(score_file), PRINT_SECONDS)
        data = file.readlines()
        num_lines = len(data)
        for x in range(num_lines):
            dt = list(filter(None, re.split("(:\W)", data[x].rstrip()))) # filter removes blank strings in data[x] list
            score_label = dt[0]
            score = ast.literal_eval(dt[len(dt) - 1])
            score_labels.append(score_label)
            scores.append(score)
            delayPrint("Fetched {} list...".format(score_label), PRINT_SECONDS)
    return score_labels, scores

def saveScores(score_labels, scores, score_file="scores.txt"):
    with open(f, "w+") as file:
        print("Score Labels: {} Scores: {}".format(score_labels, scores))
        if len(score_labels) == len(scores):
            for x in range(len(score_labels)):
                delayPrint("Saving scores...", PRINT_SECONDS)
                file.write("{} : {}\n".format(score_labels[x], scores[x]))
            delayPrint("Saved scores...", PRINT_SECONDS)

#### Remove filename column and change column headers

In [81]:
if not resume_flag:
    emotions = emotions.drop('filename', axis=1)
    emotions.columns = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
           'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa',
           'train', 'tv/monitor', 'red', 'n_red', 'yellow',
           'n_yellow', 'green', 'n_green', 'cyan',
           'n_cyan', 'blue', 'n_blue', 'magenta',
           'n_magenta', 'emotion']
else:
    emotions = emotions.drop("Unnamed: 0", axis=1)
emotions

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,8,0.002906,1,0.000258,3,0.000000,0,0.000026,2,AMBIGUOUS
1,0.999616,0.0,0.0,0.000384,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.003154,1,0.000048,1,0.000115,2,0.093388,7,AMBIGUOUS
2,0.995435,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,4,0.523868,7,0.050700,10,0.000000,0,0.001069,6,AMBIGUOUS
3,0.997730,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.005760,3,0.003157,1,0.001407,5,0.030759,7,AMBIGUOUS
4,0.996623,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,3,0.224839,5,0.305411,5,0.008561,4,0.031441,3,joy
5,0.730527,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.003894,0.000000,...,4,0.003192,1,0.000000,0,0.004544,3,0.009315,4,disgust
6,0.971356,0.0,0.0,0.000762,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,2,0.053292,4,0.098694,7,0.000021,1,0.007800,6,joy
7,0.980808,0.0,0.0,0.000000,0.016949,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.177660,5,0.000494,1,0.027814,5,0.124649,5,AMBIGUOUS
8,0.861151,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,trust
9,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.000000,0,0.000051,1,AMBIGUOUS


#### Show dataframe

In [83]:
# emotions = emotions[:616]
# emotions['emotion'][100:len(emotions)] = np.nan
# emotions_c = pd.DataFrame()
emotions_c = emotions.copy()
emotions_c['emotion'] = ""
emotions

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,8,0.002906,1,0.000258,3,0.000000,0,0.000026,2,AMBIGUOUS
1,0.999616,0.0,0.0,0.000384,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.003154,1,0.000048,1,0.000115,2,0.093388,7,AMBIGUOUS
2,0.995435,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,4,0.523868,7,0.050700,10,0.000000,0,0.001069,6,AMBIGUOUS
3,0.997730,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.005760,3,0.003157,1,0.001407,5,0.030759,7,AMBIGUOUS
4,0.996623,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,3,0.224839,5,0.305411,5,0.008561,4,0.031441,3,joy
5,0.730527,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.003894,0.000000,...,4,0.003192,1,0.000000,0,0.004544,3,0.009315,4,disgust
6,0.971356,0.0,0.0,0.000762,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,2,0.053292,4,0.098694,7,0.000021,1,0.007800,6,joy
7,0.980808,0.0,0.0,0.000000,0.016949,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.177660,5,0.000494,1,0.027814,5,0.124649,5,AMBIGUOUS
8,0.861151,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,trust
9,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.000000,0,0.000051,1,AMBIGUOUS


#### Drop rows with NaN values and segregate initial dataframes with labeled and unlabeled data

In [84]:
emotions_labeled = emotions.dropna()

# fix wrong spelling
emotions_labeled = emotions_labeled.replace(["aniticipation"], "anticipation")
# emotions_labeled.at[0, 'emotion'] = "joy"
# emotions_labeled.at[1, 'emotion'] = "joy"
# emotions_labeled.at[3, 'emotion'] = "anger"
# emotions_labeled.at[5, 'emotion'] = "disgust"
# emotions_labeled.at[6, 'emotion'] = "surprise"
# emotions_labeled.at[8, 'emotion'] = "trust"
# emotions_labeled.at[9, 'emotion'] = "trust"
# emotions_labeled.at[13, 'emotion'] = "disgust"
emotions_labeled

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,8,0.002906,1,0.000258,3,0.000000,0,0.000026,2,AMBIGUOUS
1,0.999616,0.000000,0.000000,0.000384,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,1,0.003154,1,0.000048,1,0.000115,2,0.093388,7,AMBIGUOUS
2,0.995435,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,4,0.523868,7,0.050700,10,0.000000,0,0.001069,6,AMBIGUOUS
3,0.997730,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0,0.005760,3,0.003157,1,0.001407,5,0.030759,7,AMBIGUOUS
4,0.996623,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,3,0.224839,5,0.305411,5,0.008561,4,0.031441,3,joy
5,0.730527,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.003894,0.000000,...,4,0.003192,1,0.000000,0,0.004544,3,0.009315,4,disgust
6,0.971356,0.000000,0.000000,0.000762,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,2,0.053292,4,0.098694,7,0.000021,1,0.007800,6,joy
7,0.980808,0.000000,0.000000,0.000000,0.016949,0.000000,0.000000,0.0,0.000000,0.000000,...,1,0.177660,5,0.000494,1,0.027814,5,0.124649,5,AMBIGUOUS
8,0.861151,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,trust
9,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.000000,0,0.000051,1,AMBIGUOUS


In [85]:
# replace NaN values with empty string
# reference: https://stackoverflow.com/questions/13851535/how-to-delete-rows-from-a-pandas-dataframe-based-on-a-conditional-expression
emotions_unlabeled = emotions.replace([np.nan], "")
emotions_unlabeled = emotions_unlabeled.drop(emotions_unlabeled[emotions_unlabeled['emotion'] != ""].index)

# adjust indices for the unlabeled set
emotions_unlabeled.index = range(len(emotions_unlabeled.index))
emotions_unlabeled

Unnamed: 0,background,aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,...,n_yellow,green,n_green,cyan,n_cyan,blue,n_blue,magenta,n_magenta,emotion
0,0.997992,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,4,0.003019,2,0.004224,6,0.000953,1,0.050311,7,
1,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.001648,1,0.000819,1,0.000015,1,0.052613,4,
2,0.994149,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.114650,6,0.068308,6,0.038603,2,0.061172,5,
3,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.000000,0,0.000000,0,0.001070,2,0.318375,7,
4,0.995185,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000392,0.000000,...,0,0.038219,3,0.002694,3,0.000000,0,0.000136,1,
5,0.956097,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,1,0.051943,6,0.073436,6,0.054765,2,0.143866,7,
6,0.998073,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,2,0.000033,1,0.000016,1,0.000041,1,0.001265,1,
7,1.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.004541,1,0.003202,1,0.000000,0,0.013530,3,
8,0.995663,0.0,0.0,0.000000,0.004337,0.000000,0.0,0.0,0.000000,0.000000,...,2,0.000014,1,0.000000,0,0.000040,1,0.003980,2,
9,0.989746,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,...,0,0.068705,6,0.068782,5,0.000159,2,0.006770,5,


#### Set data X and target y for the labeled data and the unlabeled data

In [86]:
X_labeled, y_labeled = emotions_labeled.drop('emotion', axis=1), emotions_labeled['emotion']
X_unlabeled, y_unlabeled = emotions_unlabeled.drop('emotion', axis=1), emotions_unlabeled['emotion']
y_labeled

0      AMBIGUOUS
1      AMBIGUOUS
2      AMBIGUOUS
3      AMBIGUOUS
4            joy
5        disgust
6            joy
7      AMBIGUOUS
8          trust
9      AMBIGUOUS
10         trust
11     AMBIGUOUS
12           joy
13       sadness
14       sadness
15     AMBIGUOUS
16     AMBIGUOUS
17          fear
18     AMBIGUOUS
19     AMBIGUOUS
20     AMBIGUOUS
21       sadness
22     AMBIGUOUS
23     AMBIGUOUS
24     AMBIGUOUS
25     AMBIGUOUS
26     AMBIGUOUS
27     AMBIGUOUS
28     AMBIGUOUS
29       sadness
         ...    
278        trust
279    AMBIGUOUS
280          joy
281      sadness
282          joy
283          joy
284         fear
285          joy
286          joy
287         fear
288    AMBIGUOUS
289          joy
290     surprise
291      sadness
292          joy
293         fear
294      sadness
295          joy
296          joy
297          joy
298    AMBIGUOUS
299    AMBIGUOUS
300    AMBIGUOUS
301          joy
302    AMBIGUOUS
303          joy
304      sadness
305    AMBIGUO

#### Split the dataset into training and test sets

In [87]:
from sklearn.model_selection import train_test_split
X_train_labeled, X_test_labeled, y_train_labeled, y_test_labeled = train_test_split(X_labeled, y_labeled, random_state=0)

#### Whole process of constructing labeled data from predicted labels (Semi-supervised learning)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from operator import itemgetter
from sklearn.model_selection import GridSearchCV
# only use first default hyperparameters for experimenting only

# set hyperparameters of classifier
param_grid = {'n_estimators' : [100, 200, 500],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [1, 2],
              'min_samples_leaf' : [1, 2, 3],
              'max_features' : ["auto", "sqrt", "log2", 0.9, 0.2],
              'oob_score' : [True],
              'n_jobs' : [-1],
              'random_state' : [42]}

# rfc = GridSearchCV(RandomForestClassifier(), param_grid, cv=10)
# rfc = RandomForestClassifier(n_estimators=500,
#                              criterion='entropy',
#                              max_leaf_nodes=16,
#                              n_jobs=-1,
#                              random_state=0)
time_file = "time.txt"
scores_file = "scores.txt

# time lists
time_labels = ["Elapsed Time Iteration", "Time Passed"]
initTimeData(time_labels)
time_labels, time_data = getTimeData(time_file)
elapsed_time_list = []
elapsed_time_iter_list = time_data[0]
time_passed_list = time_data[1]

# scores lists
score_labels = ["Best Test Scores", "Best Cross-Validation Scores", "Out-of-bag Scores"]
initScores(score_labels)
score_labels, scores = getScores(scores_file)
best_score_list = scores[0]
best_cross_val_score = scores[1]
oob_score_list = scores[2]

iteration_counter = 0
start_time = datetime.datetime.now()

# print previous session
log_file = "logs.log"
if isfile(log_file):
    with open(log_file, "r+") as file:
        lines = file.readlines()
        for line in lines:
            print(line)
            
delayPrint("---------- Start Time - {:s} ----------".format(str(start_time)), PRINT_SECONDS)

# define threshold parameter
threshold = 0.05

# compute number of iterations produced
emotions_labeled_len = len(emotions_labeled)
emotions_unlabeled_len = len(emotions_unlabeled)
while(emotions_labeled_len < len(emotions)):
    iteration_counter += 1
    estimated_rows = 1 if int((threshold*emotions_unlabeled_len)) < 1 else int((threshold*emotions_unlabeled_len))
    emotions_labeled_len += estimated_rows
    emotions_unlabeled_len -= estimated_rows

# show total number of models
num_models_settings = 1
for k, v in param_grid.items():
    num_models_settings *= len(v)
total_num_models = iteration_counter * num_models_settings
delayPrint("Total number of models: {}".format(total_num_models), PRINT_SECONDS)

# set default iteration_counter
iteration_counter = 0
    
# loop if not all target values have emotions
# loop until everything is labeled
while(emotions['emotion'].isnull().values.any()):
    start_time_iter = datetime.datetime.now()
    
    iteration_counter_file = "iteration_counter.txt"
    
    # check saved interation_counter
    if isfile(iteration_counter_file):
        with open(iteration_counter_file, "r+") as file:
            iteration_counter = int(file.read())
    
    # show remaining number of models to produce
    num_models_current_settings = 1
    for k, v in param_grid.items():
        num_models_current_settings *= len(v)
    current_num_models = iteration_counter * num_models_current_settings
    rem_num_models = total_num_models - current_num_models
    delayPrint("Current number of models: {}".format(current_num_models), PRINT_SECONDS)
    delayPrint("Remaining number of models: {}".format(rem_num_models), PRINT_SECONDS)
    
    # incrementing iteration_counter
    iteration_counter += 1
        
    # grid search for random forest with 2 standard cross-validation
    grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=2)
    grid_search.fit(X_train_labeled, y_train_labeled)
    best_params = grid_search.best_params_
    
    # use saved model if there is
    rfc = RandomForestClassifier()
    model_file = 'ssl_rf.pickle'
    if isfile(model_file):
        delayPrint("Loading model...", PRINT_SECONDS)
        rfc = pickle.load(open(model_file, 'rb'))
    
    # set best base parameters
    rfc.set_params(**best_params)
    
    # train the classifier
    rfc.fit(X_train_labeled, y_train_labeled)
    
    # save model per iteration
    with open(model_file, 'wb') as file:
        delayPrint("Saving model...", PRINT_SECONDS)
        pickle.dump(rfc, file)

    # gather class probabilities for each instance prediction
    y_pred_rfc = rfc.predict_proba(X_unlabeled)
    
    # show performance score per run
    y_pred_rfc_labeled = rfc.predict(X_test_labeled)
    # print("Accuracy score {}: {}".format(iteration_counter, accuracy_score(y_test_labeled, y_pred_rfc_labeled)))
    delayPrint("Test set score: {:.2f}".format(grid_search.score(X_test_labeled, y_test_labeled)), PRINT_SECONDS)
    delayPrint("Best parameters: {}".format(grid_search.best_params_), PRINT_SECONDS)
    delayPrint("Best cross-validation score: {:.2f}".format(grid_search.best_score_), PRINT_SECONDS)
    delayPrint("Out-of-Bag Prediction Score: {:.2f}".format(rfc.oob_score_), PRINT_SECONDS)
    
    # store scores in list
    best_score_list.append(grid_search.score(X_test_labeled, y_test_labeled))
    best_cross_val_score.append(grid_search.best_score_)
    oob_score_list.append(rfc.oob_score_)
    saveScores(score_labels, [best_score_list, best_cross_val_score, oob_score_list], score_file)

    # get highest class probability and pass it to a list
    probas = list(map((lambda x: x.max()), y_pred_rfc))
    yprfc = list(y_pred_rfc)

    # define threshold parameter
    # threshold = 0.05

    # tuple of predicted X_instances, y_instances, and score for each instance prediction
    probas_indices = list(map((lambda x: (X_unlabeled[x['index']:x['index']+1], rfc.classes_[list(x['probas']).index(max(list(x['probas'])))], max(list(x['probas'])))), [{'index' : i, 'probas' : list(p)} for i, p in enumerate(y_pred_rfc)]))

    # sort tuple according to its score value
    sorted_probas_indices = sorted(probas_indices, key=itemgetter(2), reverse=True)

    # get top emotions from the tuple based on its threshold value
    # print("Sorted probas indices: {}".format(len(sorted_probas_indices)))
    slice_quantity = int(len(sorted_probas_indices)*threshold)
    sorted_probas_indices_threshold = sorted_probas_indices[:1 if slice_quantity < 1 else slice_quantity]

    # serpate values for X and y instances of top emotions
    # len(sorted_probas_indices_threshold)
    topy_emotions = list(map((lambda x: x[1]), sorted_probas_indices_threshold))
    # print(y_pred_rfc)
    # print(sorted_probas_indices_threshold)
    delayPrint("{}".format(topy_emotions), PRINT_SECONDS)
    topx_emotions_list = sorted_probas_indices_threshold

    # instantiate new DataFrame for accumulating all X instances
    topx_emotions = pd.DataFrame()

    # gather all X predicted instances
    for x in topx_emotions_list:
        topx_emotions = topx_emotions.append(x[0], sort=False)
    
    # adjust indices for topx_emotions
    # topx_emotions.index = range(len(topx_emotions.index))
    
    # remove topx_emotions from the unlabeled data
    # print(list(X_unlabeled.index))
    emotions_unlabeled = X_unlabeled.drop(X_unlabeled.index[list(topx_emotions.index)])

    # add again the target column of the universal set of unlabeled data
    emotions_unlabeled['emotion'] = None

    # set proper indices for the unlabeled set
    # emotions_unlabeled.index = range(len(emotions_labeled.index), len(emotions_labeled.index) + len(emotions_unlabeled.index))
    emotions_unlabeled.index = range(len(emotions_unlabeled.index))
    
    # add target column from the newly instantiated DataFrame along with its instances
    topx_emotions['emotion'] = topy_emotions
    top_emotions = topx_emotions

    # add the predicted instances DataFrame to the universal set of labeled data
    emotions_labeled = pd.concat([emotions_labeled, top_emotions], axis=0, sort=False)

    # fix previous indices to its current position in DataFrame
    emotions_labeled.index = range(len(emotions_labeled.index))

    # combine universal labeled and unlabeled sets into one
    emotions = pd.concat([emotions_labeled, emotions_unlabeled], axis=0, sort=False)

    # adjust indices for universal emotions set
    emotions.index = range(len(emotions.index))
    
    # Set data X and target y for the labeled data and the unlabeled data
    X_labeled, y_labeled = emotions_labeled.drop('emotion', axis=1), emotions_labeled['emotion']
    X_unlabeled, y_unlabeled = emotions_unlabeled.drop('emotion', axis=1), emotions_unlabeled['emotion']
    X_train_labeled, X_test_labeled, y_train_labeled, y_test_labeled = train_test_split(X_labeled, y_labeled, random_state=0)
    # print("Len of emotions_labeled: {}".format(len(emotions_labeled)))
    # print("Len of emotions_unlabeled: {}".format(len(emotions_unlabeled)))
    # print("Len of emotions: {}".format(len(emotions)))
    # print("Len of x_labeled:{}".format(len(X_labeled)))
    # print("Len of y_labeled:{}".format(len(y_labeled)))
    # print("Len of x_unlabeled:{}".format(len(X_unlabeled)))
    # print("Len of y_unlabeled:{}".format(len(y_unlabeled)))
    
    # time computations
    end_time_iter = datetime.datetime.now()
    elapsed_time_iter = end_time_iter - start_time_iter
    elapsed_time_iter_list.append(elapsed_time_iter)
    time_passed = end_time_iter - start_time
    time_passed_list.append(time_passed)
    print("elapsed_time_iter_list: {}".format(elapsed_time_iter_list))
    print("time_passed_list: {}".format(time_passed_list))
    saveTimeData(time_labels, [elapsed_time_iter_list, time_passed_list], time_file)
    delayPrint("Accuracy {} elapsed time: {}".format(iteration_counter, str(elapsed_time_iter)), PRINT_SECONDS)
    delayPrint("Time passed: {}\n".format(str(time_passed)), PRINT_SECONDS)
               
    # saving interation_counter
    if isfile(iteration_counter_file):
        with open(iteration_counter_file, "w+") as file:
            file.write(iteration_counter)
    
    # save DataFrame per iteration
    emotions.to_csv("ssl_emotions.csv", encoding='utf-8')
    

end_time = datetime.datetime.now()
elapsed_time = end_time - start_time
elapsed_time_list.append(elapsed_time)
delayPrint("---------- End Time - {:s} ----------".format(str(start_time)), PRINT_SECONDS)
delayPrint("Elapsed time: {}".format(elapsed_time), PRINT_SECONDS)

# save final DataFrame
emotions.to_csv("ssl_emotions.csv", encoding='utf-8')

# show DataFrame
emotions

---------- Start Time - 2018-09-19 00:10:39.186118 ----------
Total number of models: 18000
Current number of models: 0
Remaining number of models: 18000
Saving model...
Test set score: 0.30
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 0.9, 'min_samples_leaf': 1, 'n_estimators': 200, 'n_jobs': -1, 'oob_score': True, 'random_state': 42}
Best cross-validation score: 0.42
Out-of-Bag Prediction Score: 0.39
['joy', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'joy', 'AMBIGUOUS', 'AMBIGUOUS', 'joy', 'AMBIGUOUS', 'AMBIGUOUS', 'joy', 'AMBIGUOUS', 'joy', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'joy', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'AMBIGUOUS', 'joy', 'AMBIGUOUS', 'AMBIGUOUS']
elapsed_time_iter_list: [datetime.timedelta(0, 351, 754359)]
time_passed_list: [datetime.timedelta(0, 351, 757557)]
Accuracy 1 elapsed time: 0:05:51.754359
Time passed

KeyboardInterrupt: 

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/home/butchersix/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/butchersix/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/butchersix/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/butchersix/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/butchersix/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 421, in execute_request
    self._abort_queues()
  File "/home/butchersix/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 636, in _abort_queues
    self._abort_queue(stream)
  File "/h

#### Check number of emotions in dataframe

In [None]:
emotions_list = rfc.classes_
for x in emotions_list:
    delayPrint("Number of {}: {}".format(x, len(emotions.loc[emotions['emotion'] == x])), PRINT_SECONDS)

#### Comparison check if all newly made labeled set is the same with the base labeled set

In [None]:
emotions_o = emotions.copy()
emotions_o['emotion'] = "" 
df = pd.concat([emotions_o, emotions_c])
df = df.reset_index(drop=True)
df_gpby = df.groupby(list(df.columns))
idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]
df.reindex(idx)

In [None]:
# y_pred_rfc_labeled = rfc.predict(X_labeled)
# type(y_pred_rfc_labeled)
# y_pred_rfc_labeled
# accuracy_score(np.array(y_labeled), y_pred_rfc_labeled)

In [None]:
# y_test = y_labeled.copy()
# y_test[0] = np.nan
# for x in range(len())

#### Define data X and target y for emotions data

In [None]:
X = emotions.drop('emotion', axis=1)
y = emotions['emotion']

#### Split the dataset into training and testing set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

#### Build final model from labeled data and show essential scores to evaluate the model

In [None]:
final_param_grid = {'n_estimators' : [100, 200, 500],
              'criterion' : ['gini', 'entropy'],
              'max_depth' : [1, 2],
              'min_samples_leaf' : [1, 2, 3],
              'max_features' : ["auto", "sqrt", "log2", 0.9, 0.2],
              'oob_score' : [True],
              'n_jobs' : [-1],
              'random_state' : [42]}

final_grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=2)
final_grid_search.fit(X_train, y_train)
final_best_params = final_grid_search.best_params_
final_rfc = RandomForestClassifier()
final_rfc.set_params(**final_best_params)
final_rfc.fit(X_train, y_train)
print("Test set score: {:.2f}".format(final_grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(final_grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(final_grid_search.best_score_))

# save final model
with open('final_ssl_rf.pickle', 'wb') as file:
    pickle.dump(rfc, file)

#### 10 Standard cross-fold validation

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(final_rfc, X, y, cv=10)
print("10 standard cross-fold validation mean score: {}".format(np.mean(scores)))

#### ROC Curve

In [None]:
# import matplotlib.pyplot as plt 
# from sklearn.metrics import roc_curve
# from sklearn.model_selection import cross_val_predict
# y_probas_forest = cross_val_predict(final_rfc, X_train, y_train, cv=10, method="predict_proba")
# y_train_2 = (y == "sadness")
# y_train_2 = cross_val_predict(final_rfc, X_train, y_train_2, cv=10, method="predict_proba")
# y_scores_forest = y_probas_forest[:, 1]
# fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_2, y_scores_forest)

# def plot_roc_curve(fpr, tpr, label=None):
#     plt.plot(fpr, tpr, linewidth=2, label=label)
#     plt.plot([0,1],[0,1],'k--')
#     plt.axis([0,1,0,1])
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')

# plot_roc_curve(fpr_forest, tpr_forest)
# plt.show()

In [None]:
emotions_unlabeled

#### Do the 10 cross fold validation

In [None]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(rfc, X, y, cv=10)