In [1]:
from sklearn import svm, metrics
from sklearn.neighbors import KNeighborsClassifier
import sklearn.model_selection as ms
import pandas as pd

## Importing and formatting data

### Importing data

In [2]:
df = pd.read_csv("./data/dr/train.csv")
df_test = pd.read_csv("./data/dr/test.csv")
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Subsetting data to train faster
Don't run this cell if you want to train on all the data.

In [3]:
df = df.sample(frac=1)
df = df.head(5000)

### Creating data and labels

In [4]:
x = df.drop("label", axis=1)
y = df["label"]
test = df_test.as_matrix()

### Break into train and validation sets

In [5]:
x_train, x_val, y_train, y_val = ms.train_test_split(x, y, test_size=0.2, random_state=0)

## Model building

### Train KNearestNeighbors model

In [6]:
classifier = KNeighborsClassifier()
classifier.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

### Evaluate model

In [7]:
predicted = classifier.predict(x_val)
print (metrics.classification_report(predicted, y_val))

             precision    recall  f1-score   support

          0       0.99      0.94      0.96        96
          1       1.00      0.84      0.92       141
          2       0.84      0.98      0.91        83
          3       0.93      0.91      0.92       106
          4       0.92      0.93      0.93       100
          5       0.88      0.91      0.89        75
          6       0.98      0.98      0.98       110
          7       0.95      0.93      0.94       113
          8       0.79      0.96      0.87        80
          9       0.93      0.93      0.93        96

avg / total       0.93      0.93      0.93      1000



### Train and evaluate support vector machine (SVM)

In [8]:
classifier = svm.SVC(gamma=0.001, kernel='linear')
#Todo: put in lines to train model, classify validation set, and print out metrics

### Perform cross validation of the svm model

In [9]:
from sklearn.model_selection import cross_val_predict

preds = cross_val_predict(classifier, X=x, y=y, cv=3)
print (metrics.classification_report(y, preds, digits=4))
print (metrics.confusion_matrix(y, preds))

             precision    recall  f1-score   support

          0     0.9287    0.9788    0.9531       519
          1     0.9091    0.9774    0.9420       532
          2     0.8779    0.8882    0.8830       510
          3     0.8497    0.8497    0.8497       519
          4     0.8612    0.9170    0.8882       494
          5     0.8614    0.8460    0.8536       448
          6     0.9549    0.9376    0.9462       497
          7     0.9183    0.9084    0.9134       557
          8     0.9328    0.8094    0.8667       446
          9     0.8789    0.8347    0.8562       478

avg / total     0.8976    0.8972    0.8966      5000

[[508   0   3   1   2   3   1   0   1   0]
 [  0 520   4   2   0   1   0   0   3   2]
 [  6   6 453  10  13   3   6   8   5   0]
 [  2   8  16 441   1  29   4   5   8   5]
 [  1   6   5   1 453   0   4   6   1  17]
 [ 10   7   2  29   6 379   5   1   4   5]
 [  9   1   7   1   6   5 466   0   1   1]
 [  1   5  14   3   8   1   0 506   2  17]
 [  2  15   8  23

## Write submission files
Output path may need to be modified before running this cell.

In [None]:
predict_test = classifier.predict(test)
out = zip(range(len(test)), predict_test)
with open("./data/solution.csv", 'w') as g:
    g.write("ImageId,Label\n")
    for id, cat in out:
        g.write(str(id + 1) + "," + str(cat) + "\n")