In [1]:
import csv
from collections import defaultdict
import pandas as pd
import numpy as np
from random import shuffle
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression, SGDClassifier, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix

import copy
import dill


import warnings
warnings.filterwarnings('ignore')

In [2]:
## in script mode, read filenames from arguments
import sys

# print('Number of arguments:', len(sys.argv), 'arguments.')
# print('Argument List:', str(sys.argv))
# INPUT_FILE = str(sys.argv[1])
# OUTPUT_FILE = str(sys.argv[2])

# comment the next two lines for Notebook mode
INPUT_FILE = "F4_T12_data.csv"
OUTPUT_FILE = "F4_improved.db"



In [52]:
dill.load_session("F4_report.db")

print("Reading data in from the csv...")
personalities = pd.read_csv("data/F4_T4_data.csv")
uniq_data = []
uniq_handle = {}
for index, row in personalities.iterrows():
    if row[0] in uniq_handle:
        continue
    else:
        uniq_handle[row[0]] = 1
        uniq_data.append(row)

print("Done reading data from csv")
        
data = pd.DataFrame(uniq_data)
data = data.drop(data.columns[[0]], axis=1)  # df.columns is zero-based pd.Index

all_features = data.iloc[:,0:(len(data.columns) - 1)].values.tolist()
all_labels = data.iloc[:,(len(data.columns) - 1)].values.tolist()

# print(all_features)

against = 0
brexit = 0
neutral = 0

for i in range(0, 10):
    predicted_labels = trained_clfs[i]["xgboost"].predict(np.array(all_features))
    unique, counts = np.unique(predicted_labels, return_counts=True)
    ddd = (dict(zip(unique, counts)))
    against += ddd[0]
    brexit += ddd[1]
    neutral += ddd[2]
    print(ddd)

print(against / 10)
print(brexit / 10)
print(neutral / 10)

Reading data in from the csv...
Done reading data from csv
{0.0: 7, 1.0: 2}
{0.0: 6, 1.0: 3}
{0.0: 7, 1.0: 2}
{0.0: 7, 1.0: 2}
{0.0: 8, 1.0: 1}
{0.0: 7, 1.0: 2}
{0.0: 9}
{0.0: 6, 1.0: 3}
{0.0: 7, 1.0: 2}
{0.0: 7, 1.0: 2}
7.1
0.0
0.0


In [3]:
print("Reading data in from the csv...")
personalities = pd.read_csv(INPUT_FILE)
uniq_data = []
uniq_handle = {}
for index, row in personalities.iterrows():
    if row[0] in uniq_handle:
        continue
    else:
        uniq_handle[row[0]] = 1
        uniq_data.append(row)

print("Done reading data from csv")
        
data = pd.DataFrame(uniq_data)
data = data.drop(data.columns[[0]], axis=1)  # df.columns is zero-based pd.Index 


NUM_FOLDS=10
print("Running with", NUM_FOLDS,"folds")


skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=20)
TRAIN_DATA=0
TRAIN_LABELS=1
TEST_DATA=2
TEST_LABELS=3

TRUE_LABEL = 0
PREDICTED_LABEL = 1

all_features = data.iloc[:,0:(len(data.columns) - 1)].values.tolist()
all_labels = data.iloc[:,(len(data.columns) - 1)].values.tolist()

index = 0

print("Splitting data into training and testing sets")
datasets = [[[],[],[],[]] for _ in range(NUM_FOLDS)]
for train, test in skf.split(all_features, all_labels):
    print(len(train), len(test))
    datasets[index][TRAIN_DATA] = [all_features[i] for i in train]
    datasets[index][TRAIN_LABELS] = [all_labels[i] for i in train]
    datasets[index][TEST_DATA] = [all_features[i] for i in test]
    datasets[index][TEST_LABELS] = [all_labels[i] for i in test]

    
    for li1 in datasets[index][TEST_DATA]:
        for li2 in datasets[index][TRAIN_DATA]:
            if li1 == li2:
                print("ERROR: (DUPLICATE FOUND)", li1)
                  
    index += 1
    
no_features = len(pd.DataFrame(all_features).columns)

Reading data in from the csv...
Done reading data from csv
Running with 10 folds
Splitting data into training and testing sets
1577 176
1577 176
1577 176
1578 175
1578 175
1578 175
1578 175
1578 175
1578 175
1578 175


In [13]:
tuned_parameters = {}
# Only for randomised hyper parameter tuning
tuned_parameters['KNN'] = {'n_neighbors' : list(range(2,200,5)), 
                            'leaf_size' : list(range(2, 200, 5)),
                            'p' : [1,2]
                           } 
tuned_parameters['random_forest'] = {'n_estimators' : list(range(5, 200, 5)),
                                     'min_samples_leaf' : list(range(5, 200, 3)), 
                                     'criterion' : ['gini', 'entropy'], 
                                     'min_samples_split' : list(range(2, 200, 5)),
                                     'class_weight':["balanced"]
                                    }
# tuned_parameters['gradient_boosting'] = {'learning_rate' : [0.0001, 0.001, 0.1], 
#                                          'n_estimators' : list(range(20, 200, 2)), 
#                                          'max_features' : list(range(2, no_features))
#                                         }
# tuned_parameters['xgboost'] = {'n_estimators' : list(range(2, 300, 2)), 
#                                'max_depth' : list(range(2, 100, 2)), 
#                                'learning_rate' : [0.0001, 0.001, 0.01, 0.1],
#                                'subsample':[0.6,0.7,0.75,0.8,0.85,0.9,0.95,1.0]
#                               }

clfs = defaultdict(list)
clfs = {}
confusion_matrices = defaultdict(list)
clfs['random_forest'] = RandomizedSearchCV(RandomForestClassifier(), 
                                           tuned_parameters['random_forest'], 
                                           n_jobs = -1, 
                                           n_iter=200,
                                           cv = 5,
                                           verbose = 1)
# clfs['gradient_boosting'] = RandomizedSearchCV(GradientBoostingClassifier(), 
#                                               tuned_parameters['gradient_boosting'], 
#                                               n_jobs = -1,
#                                               n_iter=500,
#                                               cv = 5,
#                                               verbose = 1)
# clfs['xgboost'] = RandomizedSearchCV(estimator = XGBClassifier(), 
#                                      param_distributions = tuned_parameters['xgboost'], 
#                                      n_jobs = -1,
#                                      n_iter=500,
#                                      cv = 5,
#                                      verbose = 1)
# clfs['logistic_regression'] = LogisticRegressionCV(Cs = [0.0001, 0.001, 0.1, 1.0, 10.0, 100, 1000],
#                                                    fit_intercept=True, 
#                                                    n_jobs = -1,
#                                                    max_iter = 500,
#                                                    cv = 5,
#                                                    verbose = 1)
#clfs['KNN'] = RandomizedSearchCV(KNeighborsClassifier(), 
#                                  tuned_parameters['KNN'], 
#                                  n_jobs = -1, 
#                                  n_iter=500,
#                                  cv = 5,
#                                  verbose = 1)

true_labels = {}
predicted_labels = {}
scores = {}
for classifier in clfs:
    scores[classifier] = [[] for _ in range(NUM_FOLDS)]
    true_labels[classifier] = [[] for _ in range(NUM_FOLDS)]
    predicted_labels[classifier] = [[] for _ in range(NUM_FOLDS)]

In [14]:
print("Training and predicting for all models: ")
trained_clfs = [[] for _ in range(NUM_FOLDS)]
for i, dataset in enumerate(datasets):
    training_data = dataset[TRAIN_DATA]
    training_labels = dataset[TRAIN_LABELS]
    testing_data = dataset[TEST_DATA]
    testing_labels = dataset[TEST_LABELS]
    
    ## initialise classifiers from list
    trained_clfs[i] = copy.deepcopy(clfs)

    for classifier in trained_clfs[i]:
        print("--> Training", classifier,", fold: ",i)
        trained_clfs[i][classifier].fit(np.array(training_data), training_labels)
        predicted_labels[classifier][i] = trained_clfs[i][classifier].predict(np.array(testing_data))
        true_labels[classifier][i] = testing_labels
        prec, rec, f1, _ = precision_recall_fscore_support(testing_labels, predicted_labels[classifier][i], average='macro')
        scores[classifier][i] = [prec, rec, f1]

Training and predicting for all models: 
--> Training random_forest , fold:  0
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 960 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   42.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  1
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 744 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   37.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  2
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 732 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   41.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  3
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 703 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   40.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  4
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 662 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   41.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  5
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 647 tasks      | elapsed:   25.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   40.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  6
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 263 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 740 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   39.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  7
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done 255 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 705 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 993 out of 1000 | elapsed:   39.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   39.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  8
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 876 tasks      | elapsed:   39.2s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   44.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


--> Training random_forest , fold:  9
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 314 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done 814 tasks      | elapsed:   36.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   44.4s finished


In [15]:
all_scores = {}
names = []
for classifier in scores:
    score = np.array([0,0,0])
    for i in range(len(scores[classifier])):
        score = score + np.array(scores[classifier][i])/NUM_FOLDS
    all_scores[classifier] = score  

for classifier in all_scores:
    print(classifier, all_scores[classifier])

random_forest [0.24881643 0.39814815 0.27078832]


In [6]:
dill.dump_session(OUTPUT_FILE)
print("Successfully completed everything!")

Successfully completed everything!


10