In [1]:
import random
import numpy as np
import pandas as pd
from collections import defaultdict
from statistics import mean

from sklearn.linear_model import LogisticRegression as Linear
from sklearn.svm import SVC as SVM
from sklearn.tree import DecisionTreeClassifier as DTree
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

SEED = 1031
random.seed(SEED)
np.random.seed(SEED)

In [2]:
data = pd.read_csv('standard.csv')
data = data.set_index('SRCID')
data

Unnamed: 0_level_0,Mode,A1,A2,undecided_voter,A3,A4a,A4b,A4c,A4d,A4e,...,F2a,F2b,F2c,p_gender_sdc,p_age_group_sdc,p_education_sdc,p_state_sdc,StateMap,A4F2_agg,opinionated
SRCID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1,1,0,False,8,1,1,1,1,1,...,1,2,1,2,4,3,1,1,-0.611111,False
3,1,1,13,False,7,1,2,1,0,1,...,4,3,3,2,4,1,4,4,-0.388889,True
4,1,1,0,False,7,1,1,1,0,1,...,1,1,1,2,4,3,5,5,-0.527778,False
6,1,1,2,False,6,2,2,1,1,1,...,5,1,1,2,4,3,4,4,7.750000,False
7,1,1,0,False,5,1,2,1,1,0,...,0,0,0,2,4,3,1,1,-0.277778,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3581,0,1,0,False,4,0,1,0,0,0,...,1,0,1,2,2,1,2,2,-0.277778,True
3583,0,1,2,False,9,0,1,0,0,0,...,1,2,1,2,4,3,5,5,-0.444444,True
3585,0,0,24,True,10,0,2,0,0,0,...,1,1,2,1,3,1,3,3,-0.388889,True
3587,0,1,0,False,9,1,1,0,1,0,...,1,1,1,2,3,2,5,5,-0.416667,True


In [3]:
target = 'opinionated'

In [4]:
cols = [i for i in data.keys() if i not in (target)]
inputs = data[cols]
inputs

Unnamed: 0_level_0,Mode,A1,A2,undecided_voter,A3,A4a,A4b,A4c,A4d,A4e,...,F1h,F2a,F2b,F2c,p_gender_sdc,p_age_group_sdc,p_education_sdc,p_state_sdc,StateMap,A4F2_agg
SRCID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1,1,0,False,8,1,1,1,1,1,...,2,1,2,1,2,4,3,1,1,-0.611111
3,1,1,13,False,7,1,2,1,0,1,...,1,4,3,3,2,4,1,4,4,-0.388889
4,1,1,0,False,7,1,1,1,0,1,...,2,1,1,1,2,4,3,5,5,-0.527778
6,1,1,2,False,6,2,2,1,1,1,...,0,5,1,1,2,4,3,4,4,7.750000
7,1,1,0,False,5,1,2,1,1,0,...,0,0,0,0,2,4,3,1,1,-0.277778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3581,0,1,0,False,4,0,1,0,0,0,...,1,1,0,1,2,2,1,2,2,-0.277778
3583,0,1,2,False,9,0,1,0,0,0,...,0,1,2,1,2,4,3,5,5,-0.444444
3585,0,0,24,True,10,0,2,0,0,0,...,1,1,1,2,1,3,1,3,3,-0.388889
3587,0,1,0,False,9,1,1,0,1,0,...,0,1,1,1,2,3,2,5,5,-0.416667


In [5]:
cols = [target]
targets = data[cols]
targets

Unnamed: 0_level_0,opinionated
SRCID,Unnamed: 1_level_1
2,False
3,True
4,False
6,False
7,True
...,...
3581,True
3583,True
3585,True
3587,True


In [6]:
def train(model, train_idx, test_idx):
    model.fit(inputs.iloc[train_idx], targets.iloc[train_idx].values.ravel())
    outputs = model.predict(inputs.iloc[test_idx])
    auroc = roc_auc_score(targets.iloc[test_idx], outputs)
    f1 = f1_score(targets.iloc[test_idx], outputs)
    cm = confusion_matrix(targets.iloc[test_idx], outputs)
    return auroc, f1, cm

In [7]:
kf = KFold(n_splits=10, shuffle=True)
ret = defaultdict(list)
for train_idx, test_idx in kf.split(data.values):
    ret['linear'].append(train(Linear(max_iter=100000), train_idx, test_idx))
    ret['svm'].append(train(SVM(max_iter=100000), train_idx, test_idx))
    ret['dtree'].append(train(DTree(), train_idx, test_idx))
    ret['mlp'].append(train(MLP(max_iter=100000), train_idx, test_idx))
result = {}
for k, v in ret.items():
    v = tuple(zip(*v))
    result[k] = (mean(v[0]), mean(v[1]), np.stack(v[2]).sum(0))
result

In [9]:
result = {}
for k, v in ret.items():
    v = tuple(zip(*v))
    result[k] = (mean(v[0]), mean(v[1]), np.stack(v[2]).sum(0))
result

{'linear': (0.7985620465830157,
  0.7473747832744667,
  array([[1699,  199],
         [ 344,  819]])),
 'svm': (0.8154012351099321,
  0.7716086584817999,
  array([[1820,   78],
         [ 378,  785]])),
 'dtree': (1.0,
  1.0,
  array([[1898,    0],
         [   0, 1163]])),
 'mlp': (0.9042147113761796,
  0.8847996283277143,
  array([[1804,   94],
         [ 164,  999]]))}