/
solution5.py
104 lines (77 loc) · 2.5 KB
/
solution5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
from csv import writer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
import six
import utilities
# Text-mode flags differ across Python versions: Python 2 takes plain
# 'r'/'w', while Python 3 accepts the explicit text-mode 'rt'/'wt'.
if six.PY2:
    read_mode, write_mode = 'r', 'w'
else:
    read_mode, write_mode = 'rt', 'wt'
# Working directory that holds the consolidated data files.
path = '/Users/chrysovalantis/Documents/UCY/EPL451/Project'
os.chdir(path)

# Input/output file names.
ftrain = 'train_consolidation.txt'
ftest = 'test_consolidation.txt'
flabel = 'trainLabels.csv'
fsubmission = 'submission.csv'

labels = utilities.read_labels(flabel)

# Train-set dimensions: 16**2 two-byte-code counts, one question-mark
# count, plus the class label column.
ntrain = 10868
nfeature = 16 ** 2 + 1 + 1

train = utilities.read_train(ntrain, nfeature, labels, ftrain)

# Split the consolidated matrix into features / target, then release
# the large intermediates.
X, y = train[:, :-1], train[:, -1]
del labels, train
# Parameters for the extra-trees model.
random_state = 5342
n_jobs = 8
verbose = 1
n_estimators = 89
# Importance threshold for feature selection. BUGFIX: the original code
# reduced the train matrix with '0.5*median' but the test matrix with
# '1.25*median'; different thresholds select different column subsets,
# so train and test features disagreed. Both must use the SAME value.
selection_threshold = '0.5*median'

# ExtraTreesClassifier - fit on the full feature set, then keep only the
# features whose importance clears the threshold.
# NOTE(review): estimator.transform(X, threshold) is the pre-0.19
# scikit-learn API (matching the sklearn.cross_validation import below);
# on modern scikit-learn this pipeline needs SelectFromModel instead.
clf1 = ExtraTreesClassifier(criterion='gini', random_state=random_state,
                            n_jobs=n_jobs, verbose=verbose,
                            n_estimators=n_estimators, max_features=None)
clf1.fit(X, y)
X = clf1.transform(X, selection_threshold)

# K-nearest-neighbours classifier on the reduced features
# (p=1 -> Manhattan distance).
clf = KNeighborsClassifier(n_neighbors=20, p=1)
# Start training
print('training started')
############################
# Estimate generalization error with 4-fold cross-validated log loss.
print('computing log loss')
kf = cross_validation.KFold(ntrain, n_folds=4)
_logloss = 0.0
for trainIndex, testIndex in kf:
    print("TRAIN:", trainIndex, "TEST:", testIndex)
    X_train, X_test = X[trainIndex], X[testIndex]
    y_train, y_test = y[trainIndex], y[testIndex]
    clf.fit(X_train, y_train)
    pred = clf.predict_proba(X_test)
    _logloss += utilities.log_loss(pred, y_test)
print('log loss = ', _logloss/len(kf))
############################
# Refit on the whole training set for the final model.
clf.fit(X, y)
print('training completed')
del X
del y

# Dimensions for the test set (no label column here).
ntest = 10873
nfeature = 16 ** 2 + 1  # For two_byte_codes, no_que_marks
test, Ids = utilities.read_test(ntest, nfeature, ftest)
# Apply the SAME feature selection as in training (was '1.25*median').
test = clf1.transform(test, selection_threshold)
# Class-probability predictions for the whole test set.
final_pred = clf.predict_proba(test)
del test
# Dump the final class probabilities to the submission CSV:
# one header row, then one row of probabilities per test Id.
with open(fsubmission, write_mode) as out:
    csv_out = writer(out)
    # Header: Id followed by one probability column per class (1..9).
    csv_out.writerow(['Id'] + ['Prediction' + str(c) for c in range(1, 10)])
    row_count = 0
    for Id, probs in zip(Ids, final_pred.tolist()):
        csv_out.writerow([Id] + probs)
        row_count += 1
        # Progress report every 1000 rows.
        if row_count % 1000 == 0:
            print(row_count, 'prediction written')
print('all done!')