In [1]:
import csv
from collections import defaultdict
import pandas as pd
import numpy as np
from random import shuffle
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression, SGDClassifier, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix

import copy
import dill


# In[603]:

import warnings
# Suppress every warning for the rest of the run. Note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [18]:
## In script mode the file names come from the command line:
##     python <script> INPUT_FILE OUTPUT_FILE
## In notebook mode the kernel launcher owns sys.argv (argv[1] is '-f' and
## argv[2] is the kernel connection file), so the defaults below are used
## instead of treating those values as data paths.
import sys

DEFAULT_INPUT_FILE = "../uniq_data.csv"
DEFAULT_OUTPUT_FILE = "rand_optimised_all_R.db"


def resolve_io_paths(argv, default_input=DEFAULT_INPUT_FILE, default_output=DEFAULT_OUTPUT_FILE):
    """Pick the input and output file names for this run.

    Parameters
    ----------
    argv : sequence of str
        Argument vector, normally ``sys.argv``.
    default_input, default_output : str
        Paths used when ``argv`` does not carry usable file names.

    Returns
    -------
    tuple of (str, str)
        ``(input_file, output_file)``. ``argv[1]`` and ``argv[2]`` are used
        when the process was started as a plain script with at least two
        arguments; otherwise the defaults are returned.
    """
    # A Jupyter kernel is started through ipykernel_launcher.py and its own
    # arguments must never be interpreted as data paths: the second one is the
    # kernel connection file, which would otherwise be overwritten on save.
    launched_by_kernel = bool(argv) and str(argv[0]).endswith("ipykernel_launcher.py")
    if launched_by_kernel or len(argv) < 3:
        return default_input, default_output
    return str(argv[1]), str(argv[2])


print('Number of arguments:', len(sys.argv), 'arguments.')
print('Argument List:', str(sys.argv))
INPUT_FILE, OUTPUT_FILE = resolve_io_paths(sys.argv)
print('Input file:', INPUT_FILE)
print('Output file:', OUTPUT_FILE)

Number of arguments: 3 arguments.
Argument List: ['/home/andrei/anaconda2/envs/vocationcompass/lib/python3.7/site-packages/ipykernel_launcher.py', '-f', '/run/user/1000/jupyter/kernel-17731f65-6dc1-446e-bdac-325804645e5d.json']


'/home/andrei/anaconda2/envs/vocationcompass/lib/python3.7/site-packages/ipykernel_launcher.py'

In [13]:
print("Reading data in from the csv...")
personalities = pd.read_csv(INPUT_FILE)

# Keep only the first row seen for each value of the first column (the
# per-row handle). drop_duplicates does this in one vectorised pass and
# avoids positional `row[0]` access on a label-indexed Series, which pandas
# has deprecated. The original row index and column order are preserved.
handle_column = personalities.columns[0]
personalities = personalities.drop_duplicates(subset=[handle_column], keep="first")

print("Done reading data from csv")
#personalities.head()


# Now, let's look at the different types of professions that we have and the number of samples that we have for each.

# In[607]:

professions = personalities.groupby('Profession')
professions.size()
#with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
#    print(professions.size().order(ascending=False))


# In[608]:

# Drop the first column so each remaining row starts with the label that is
# used as the grouping key below, followed by the numeric features.
anonymised_data = np.array(personalities)[:, 1:]


# In[610]:

# Map every label to the list of its feature vectors (each a list of floats).
data = defaultdict(list)
for label, *features in anonymised_data:
    data[label].append(list(map(float, features)))


# In[612]:

# Experiment configuration.
TOP = 10                         # how many of the most frequent professions to model
NUM_FOLDS = 10                   # number of stratified cross-validation folds
TRAIN_CAP = 955                  # rows sampled per profession
HYPER_PARAMETER_TUNING_ITER = 1  # NOTE(review): not referenced by the code visible here

print("Number of professions to run models on:", TOP)
print("Running with", NUM_FOLDS,"folds")


# In[614]:

# Number of samples available for each profession.
counts = {key: len(samples) for key, samples in data.items()}

# Professions ordered from most to least samples, truncated to the TOP largest.
sorted_keys = sorted(counts, key=counts.get, reverse=True)[:TOP]

print("Professions we are using now: ")
for prof in sorted_keys:
    print(prof)
# Random uniform variable -> Sample from uniform distribution of parameters min 0 and max 1
#                            Sample u from that
#                            If sample is lower than 950/sizeofclass then keep sample otherwise throw sample away
# sorted_keys = ['Athletics Director', 'Executive Chef', 'Data Scientist', 'Software Engineer', 'Event Coordinator', 'Health Manager', 'Literary Agent', 'Research Associate', 'Club Manager', 'Teacher']


# In[617]:

print(sorted_keys)

# In[619]:

# Stratified, shuffled fold generator with a fixed random_state.
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=20)

# Positions of the four parts stored for each fold.
TRAIN_DATA, TRAIN_LABELS, TEST_DATA, TEST_LABELS = range(4)

# Positions used when pairing a true label with a predicted one.
TRUE_LABEL, PREDICTED_LABEL = 0, 1

# In[620]:

def split(a, n):
    """Lazily cut ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices receive one extra element, so slice
    lengths differ by at most one. A generator of slices is returned; each
    slice is taken from ``a`` only when the generator is advanced.
    """
    size, remainder = divmod(len(a), n)

    def chunk(position):
        # Every earlier slice that received an extra element shifts the start.
        start = position * size + min(position, remainder)
        stop = start + size + (1 if position < remainder else 0)
        return a[start:stop]

    return (chunk(position) for position in range(n))


# In[621]:

all_features = []
all_labels = []

# Draw a random sample of at most TRAIN_CAP rows from every selected profession.
for profession in sorted_keys:
    profession_rows = data[profession]
    # np.random.choice(..., replace=False) raises ValueError when asked for
    # more items than exist, so cap the request at the rows actually available.
    sample_size = min(TRAIN_CAP, len(profession_rows))
    choices = np.random.choice(len(profession_rows), sample_size, replace=False)

    rand_data = [profession_rows[i] for i in choices]
    all_features += rand_data
    all_labels += [profession for _ in range(len(rand_data))]

# Shuffle features and labels together with one shared permutation.
choices = list(range(len(all_features)))
shuffle(choices)

all_features = [all_features[i] for i in choices]
all_labels = [all_labels[i] for i in choices]
index = 0
print("Total number of samples:", len(all_features))

print("Splitting data into training and testing sets")
# One [train_data, train_labels, test_data, test_labels] entry per fold.
datasets = [[[],[],[],[]] for _ in range(NUM_FOLDS)]
for train, test in skf.split(all_features, all_labels):
    print(len(train), len(test))
    datasets[index][TRAIN_DATA] = [all_features[i] for i in train]
    datasets[index][TRAIN_LABELS] = [all_labels[i] for i in train]
    datasets[index][TEST_DATA] = [all_features[i] for i in test]
    datasets[index][TEST_LABELS] = [all_labels[i] for i in test]

    # Leakage check: report every test row whose feature vector also occurs in
    # the training part of the same fold. Hashing the training rows once keeps
    # this linear instead of comparing every test row with every training row.
    training_rows = {tuple(features) for features in datasets[index][TRAIN_DATA]}
    for li1 in datasets[index][TEST_DATA]:
        if tuple(li1) in training_rows:
            print("ERROR: (DUPLICATE FOUND)", li1)

    index += 1

no_features = len(pd.DataFrame(all_features).columns)

Reading data in from the csv...
Done reading data from csv
Number of professions to run models on: 10
Running with 10 folds
Professions we are using now: 
Superintendent
Manufacturer
Campaigner
School Principal
Athletics Director
Agent
Data Scientist
Teacher
Software Engineer
Executive Chef
['Superintendent', 'Manufacturer', 'Campaigner', 'School Principal', 'Athletics Director', 'Agent', 'Data Scientist', 'Teacher', 'Software Engineer', 'Executive Chef']
Total number of samples: 9550
Splitting data into training and testing sets
8590 960
8590 960
8590 960
8590 960
8590 960
8600 950
8600 950
8600 950
8600 950
8600 950


In [3]:
# Candidate values sampled by the randomised hyper-parameter searches.
tuned_parameters = {
    'KNN': {
        'n_neighbors': list(range(10, 200, 5)),
        'leaf_size': list(range(2, 200, 5)),
        'p': [1, 2],
    },
    'random_forest': {
        'n_estimators': list(range(10, 200, 5)),
        'min_samples_leaf': list(range(10, 200, 3)),
        'criterion': ['gini', 'entropy'],
        'min_samples_split': list(range(2, 200, 5)),
    },
    'gradient_boosting': {
        'learning_rate': [0.0001, 0.001, 0.1],
        'n_estimators': list(range(20, 200, 2)),
        'max_features': list(range(2, no_features)),
    },
    'xgboost': {
        'n_estimators': list(range(2, 300, 2)),
        'max_depth': list(range(2, 100, 2)),
        'learning_rate': [0.0001, 0.001, 0.01, 0.1],
        'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0],
    },
}

# Settings shared by every RandomizedSearchCV wrapper below.
search_settings = dict(n_jobs=-1, n_iter=500, cv=5, verbose=1)

confusion_matrices = defaultdict(list)

# Untrained model templates, keyed by name; insertion order is the order in
# which the models are later iterated.
clfs = {
    'random_forest': RandomizedSearchCV(
        RandomForestClassifier(),
        tuned_parameters['random_forest'],
        **search_settings
    ),
    'gradient_boosting': RandomizedSearchCV(
        GradientBoostingClassifier(),
        tuned_parameters['gradient_boosting'],
        **search_settings
    ),
    'xgboost': RandomizedSearchCV(
        XGBClassifier(),
        tuned_parameters['xgboost'],
        **search_settings
    ),
    # Logistic regression tunes its regularisation strength through its own
    # built-in cross-validation instead of a randomised search.
    'logistic_regression': LogisticRegressionCV(
        Cs=[0.0001, 0.001, 0.1, 1.0, 10.0, 100, 1000],
        fit_intercept=True,
        n_jobs=-1,
        max_iter=500,
        cv=5,
        verbose=1,
    ),
    'KNN': RandomizedSearchCV(
        KNeighborsClassifier(),
        tuned_parameters['KNN'],
        **search_settings
    ),
}

# Per-model result holders with one slot per fold.
true_labels = {name: [[] for _ in range(NUM_FOLDS)] for name in clfs}
predicted_labels = {name: [[] for _ in range(NUM_FOLDS)] for name in clfs}
scores = {name: [[] for _ in range(NUM_FOLDS)] for name in clfs}

In [4]:
print("Training and predicting for all models: ")
trained_clfs = [[] for _ in range(NUM_FOLDS)]
for i, dataset in enumerate(datasets):
    training_data, training_labels, testing_data, testing_labels = (
        dataset[part] for part in (TRAIN_DATA, TRAIN_LABELS, TEST_DATA, TEST_LABELS)
    )

    # Each fold gets its own untouched copy of every model template.
    fold_models = copy.deepcopy(clfs)
    trained_clfs[i] = fold_models

    for classifier, model in fold_models.items():
        print("--> Training", classifier,", fold: ",i)
        model.fit(np.array(training_data), training_labels)
        fold_predictions = model.predict(np.array(testing_data))

        predicted_labels[classifier][i] = fold_predictions
        true_labels[classifier][i] = testing_labels

        # Macro-averaged precision, recall and F1 for this model on this fold.
        prec, rec, f1, _ = precision_recall_fscore_support(
            testing_labels, fold_predictions, average='macro'
        )
        scores[classifier][i] = [prec, rec, f1]

Training and predicting for all models: 
Training lr and in iteration:  0
Training KNN and in iteration:  0
Training lr and in iteration:  1
Training KNN and in iteration:  1
Training lr and in iteration:  2
Training KNN and in iteration:  2
Training lr and in iteration:  3
Training KNN and in iteration:  3
Training lr and in iteration:  4
Training KNN and in iteration:  4
Training lr and in iteration:  5
Training KNN and in iteration:  5
Training lr and in iteration:  6
Training KNN and in iteration:  6
Training lr and in iteration:  7
Training KNN and in iteration:  7
Training lr and in iteration:  8
Training KNN and in iteration:  8
Training lr and in iteration:  9
Training KNN and in iteration:  9


In [5]:
# Average the per-fold [precision, recall, f1] triples for every model.
all_scores = {}
names = []
for classifier, fold_scores in scores.items():
    score = np.array([0,0,0])
    for fold_score in fold_scores:
        # Adding each fold's share directly yields the mean over NUM_FOLDS.
        score = score + np.array(fold_score)/NUM_FOLDS
    all_scores[classifier] = score

for classifier, mean_score in all_scores.items():
    print(classifier, mean_score)

lr [0.46676021 0.48598355 0.45449771]
KNN [0.48219529 0.4890011  0.47432454]


In [6]:
# Save the current interpreter session to OUTPUT_FILE with dill so the run can
# be restored later. Anything already stored at that path is overwritten.
dill.dump_session(OUTPUT_FILE)
print("Successfully completed everything!")

Successfully completed everything!


10