In [1]:
import random
import numpy as np
import pandas as pd
from collections import defaultdict
from statistics import mean

from sklearn.svm import SVC as SVM
from sklearn.tree import DecisionTreeClassifier as DTree
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix

SEED = 1031
random.seed(SEED)
np.random.seed(SEED)

In [2]:
data = pd.read_csv('3425.csv')
data = data.set_index('SRCID')
data

Unnamed: 0_level_0,Mode,A1,A2,undecided_voter,A3,A4a,A4b,A4c,A4d,A4e,...,F2a,F2b,F2c,p_gender_sdc,p_age_group_sdc,p_education_sdc,p_state_sdc,StateMap,A4F2_agg,opinionated
SRCID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2,2,1,False,8,2,2,2,2,2,...,2,3,2,2,4,3,1,1,0.388889,False
3,2,2,24,False,7,2,3,2,1,2,...,5,4,4,2,4,1,4,4,0.611111,True
4,2,2,1,False,7,2,2,2,1,2,...,2,2,2,2,4,3,5,5,0.472222,False
6,2,2,3,False,6,3,3,2,2,2,...,-98,2,2,2,4,3,4,4,8.750000,False
7,2,2,1,False,5,2,3,2,2,1,...,1,1,1,2,4,3,1,1,0.722222,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3581,1,2,1,False,4,1,2,1,1,1,...,2,1,2,2,2,1,2,2,0.722222,True
3583,1,2,3,False,9,1,2,1,1,1,...,2,3,2,2,4,3,5,5,0.555556,True
3585,1,1,-98,True,10,1,3,1,1,1,...,2,2,3,1,3,1,3,3,0.611111,True
3587,1,2,1,False,9,2,2,1,2,1,...,2,2,2,2,3,2,5,5,0.583333,True


In [3]:
target = 'undecided_voter'

In [4]:
cols = [i for i in data.keys() if i not in ((target, 'A2'))]
inputs = data[cols]
inputs

Unnamed: 0_level_0,Mode,A1,A3,A4a,A4b,A4c,A4d,A4e,A4f,B1a,...,F2a,F2b,F2c,p_gender_sdc,p_age_group_sdc,p_education_sdc,p_state_sdc,StateMap,A4F2_agg,opinionated
SRCID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2,2,8,2,2,2,2,2,1,2,...,2,3,2,2,4,3,1,1,0.388889,False
3,2,2,7,2,3,2,1,2,1,2,...,5,4,4,2,4,1,4,4,0.611111,True
4,2,2,7,2,2,2,1,2,2,2,...,2,2,2,2,4,3,5,5,0.472222,False
6,2,2,6,3,3,2,2,2,2,2,...,-98,2,2,2,4,3,4,4,8.750000,False
7,2,2,5,2,3,2,2,1,2,2,...,1,1,1,2,4,3,1,1,0.722222,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3581,1,2,4,1,2,1,1,1,2,2,...,2,1,2,2,2,1,2,2,0.722222,True
3583,1,2,9,1,2,1,1,1,3,2,...,2,3,2,2,4,3,5,5,0.555556,True
3585,1,1,10,1,3,1,1,1,1,1,...,2,2,3,1,3,1,3,3,0.611111,True
3587,1,2,9,2,2,1,2,1,1,2,...,2,2,2,2,3,2,5,5,0.583333,True


In [5]:
cols = [target]
targets = data[cols]
targets

Unnamed: 0_level_0,undecided_voter
SRCID,Unnamed: 1_level_1
2,False
3,False
4,False
6,False
7,False
...,...
3581,False
3583,False
3585,True
3587,False


In [6]:
def train(model, train_idx, test_idx):
    model.fit(inputs.iloc[train_idx], targets.iloc[train_idx].values.ravel())
    outputs = model.predict(inputs.iloc[test_idx])
    auroc = roc_auc_score(targets.iloc[test_idx], outputs)
    f1 = f1_score(targets.iloc[test_idx], outputs)
    cm = confusion_matrix(targets.iloc[test_idx], outputs)
    return auroc, f1, cm

In [7]:
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
ret = defaultdict(list)
for train_idx, test_idx in kf.split(data.values):
    ret['ref'].append(train(SVM(max_iter=100000,cache_size=8192), train_idx, test_idx))
    ret['rbf'].append(train(SVM(max_iter=100000,cache_size=8192,class_weight={True:20}), train_idx, test_idx))
    # ret['svm-poly3'].append(train(SVM(max_iter=100000,cache_size=8192,class_weight={True:20},kernel='poly',degree=3), train_idx, test_idx))
    # ret['svm-poly4'].append(train(SVM(max_iter=100000,cache_size=8192,class_weight='balanced',kernel='poly',degree=9), train_idx, test_idx))
    # ret['svm-poly5'].append(train(SVM(max_iter=100000,cache_size=8192,class_weight='balanced',kernel='poly',degree=27), train_idx, test_idx))
    # ret['svm-rbfb'].append(train(SVM(max_iter=100000,cache_size=8192,class_weight={True:20},kernel='rbf'), train_idx, test_idx))
    # ret['svm-rbf'].append(train(SVM(max_iter=100000,cache_size=8192,class_weight='balanced',kernel='rbf'), train_idx, test_idx))
    # ret['dtree'].append(train(DTree(), train_idx, test_idx))
    # ret['mlp'].append(train(MLP(max_iter=100000), train_idx, test_idx))
result = {}
for k, v in ret.items():
    v = tuple(zip(*v))
    result[k] = (mean(v[0]), mean(v[1]), np.stack(v[2]).sum(0))
result

{'ref': (0.4996515298688684,
  0.0,
  array([[2898,    2],
         [ 161,    0]])),
 'rbf': (0.5925262692685235,
  0.1299938401962938,
  array([[1764, 1136],
         [  71,   90]]))}

In [8]:
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
ret = defaultdict(list)
for train_idx, test_idx in kf.split(data.values):
    ret['ref'].append(train(DTree(random_state=SEED,), train_idx, test_idx))
    ret['div'].append(train(DTree(random_state=SEED,class_weight={True:3061/161}, splitter='random',ccp_alpha=0.01), train_idx, test_idx))
    ret['a'].append(train(DTree(random_state=SEED,class_weight={True:3061/161}, splitter='random',ccp_alpha=0.01,min_weight_fraction_leaf=0.1), train_idx, test_idx))
    ret['b'].append(train(DTree(random_state=SEED,class_weight={True:3061/161}, splitter='random',ccp_alpha=0.01,min_weight_fraction_leaf=0.01), train_idx, test_idx))
    ret['c'].append(train(DTree(random_state=SEED,class_weight={True:3061/161}, splitter='random',ccp_alpha=0.01,min_weight_fraction_leaf=0.5), train_idx, test_idx))
    # ret['dtree'].append(train(DTree(), train_idx, test_idx))
    # ret['mlp'].append(train(MLP(max_iter=100000), train_idx, test_idx))
result = {}
for k, v in ret.items():
    v = tuple(zip(*v))
    result[k] = (mean(v[0]), mean(v[1]), np.stack(v[2]).sum(0))
result

{'ref': (0.5000815939677964,
  0.06042417323726098,
  array([[2689,  211],
         [ 149,   12]])),
 'div': (0.6171673404245954,
  0.13663607431927624,
  array([[1290, 1610],
         [  37,  124]])),
 'a': (0.5140364916773368,
  0.1066494650905851,
  array([[ 183, 2717],
         [   8,  153]])),
 'b': (0.6082568157537181,
  0.13099774258498834,
  array([[1351, 1549],
         [  41,  120]])),
 'c': (0.5,
  0.099626143032547,
  array([[   0, 2900],
         [   0,  161]]))}

In [9]:
kf = KFold(n_splits=10, shuffle=True, random_state=SEED)
ret = defaultdict(list)
for train_idx, test_idx in kf.split(data.values):
    ret['ref'].append(train(MLP(max_iter=100000, random_state=SEED), train_idx, test_idx))
    ret['adam'].append(train(MLP(max_iter=100000, solver='adam',alpha=0.001, random_state=SEED,hidden_layer_sizes=20), train_idx, test_idx))
    ret['a'].append(train(MLP(max_iter=100000, solver='adam',alpha=0.001, random_state=SEED,hidden_layer_sizes=20,learning_rate='invscaling'), train_idx, test_idx))
    ret['b'].append(train(MLP(max_iter=100000, solver='adam',alpha=0.001, random_state=SEED,hidden_layer_sizes=20,learning_rate='adaptive'), train_idx, test_idx))
result = {}
for k, v in ret.items():
    v = tuple(zip(*v))
    result[k] = (mean(v[0]), mean(v[1]), np.stack(v[2]).sum(0))
result

{'ref': (0.5133781642689002,
  0.061230454133679946,
  array([[2856,   44],
         [ 154,    7]])),
 'adam': (0.53450247126925,
  0.121497890997891,
  array([[2869,   31],
         [ 148,   13]])),
 'a': (0.53450247126925,
  0.121497890997891,
  array([[2869,   31],
         [ 148,   13]])),
 'b': (0.53450247126925,
  0.121497890997891,
  array([[2869,   31],
         [ 148,   13]]))}