In [1]:
%matplotlib inline

from pathlib import Path

import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
# 3.8 removed the old name; use whichever one this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
import pandas as pd
from skimage import io
from tqdm import tqdm_notebook as tqdm

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

In [2]:
# Data from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#mnist

def load_xy(folder, pbar=True, n_features=780):
    """Load a LIBSVM-format dataset from the file ``folder / 'data'``.

    Every non-blank line has the form ``<label> <index>:<value> ...`` with
    1-based feature indices; features that are not listed stay 0.

    Parameters
    ----------
    folder : pathlib.Path
        Directory that contains a text file named ``data``.
    pbar : bool, default True
        If true, wrap the line iteration in a ``tqdm`` progress bar.
    n_features : int, default 780
        Number of columns of the dense feature matrix (780 is the width
        this notebook uses for the MNIST files).

    Returns
    -------
    xs : numpy.ndarray of float32, shape (n_samples, n_features)
        Dense feature matrix.
    ys : numpy.ndarray of int32, shape (n_samples,)
        Integer class labels.

    Raises
    ------
    IndexError
        If a feature index is larger than ``n_features``.
    ValueError
        If a label or an ``index:value`` token cannot be parsed.
    """
    with open(folder / 'data', 'r') as f:
        # Skip blank lines (e.g. a trailing empty line at end of file) so
        # they neither crash the parser nor create all-zero samples.
        lines = [line for line in f if line.strip()]

    xs = np.zeros((len(lines), n_features), dtype=np.float32)
    ys = np.zeros(len(lines), dtype=np.int32)

    rows = tqdm(lines) if pbar else lines
    for i, line in enumerate(rows):
        label, *pairs = line.split()
        ys[i] = int(label)
        for pair in pairs:
            index, _, value = pair.partition(':')
            # Parse as float so scaled (non-integer) feature files load too;
            # integer values are stored identically in the float32 matrix.
            xs[i, int(index) - 1] = float(value)

    return xs, ys

# Root directory of the datasets. load_xy opens the file named 'data' inside
# the folder it receives, i.e. ../data/mnist/data and ../data/mnist.t/data.
data = Path('../data/')
# Training split: feature matrix and integer labels.
x_train, y_train = load_xy(data / 'mnist')
# Held-out split used for scoring the models below
# (presumably the LIBSVM "mnist.t" test file -- confirm).
x_val, y_val = load_xy(data / 'mnist.t')







In [7]:
%%time

# Reduce the pixel features to their top 20 principal components. The basis
# is fit on the training split only and then applied to both splits.
pca = PCA(n_components=20)
pca.fit(x_train)
# Cumulative fraction of variance captured by the first k components.
print(pca.explained_variance_ratio_.cumsum())
xt = pca.transform(x_train)
xv = pca.transform(x_val)

# Polynomial-kernel SVM on the projected features. The labels are the
# y_train / y_val arrays returned by load_xy; the previous `yt` / `yv` names
# are not defined anywhere in the notebook and raise NameError on a fresh
# kernel.
svc = SVC(C=1.0, kernel='poly', verbose=True)
svc.fit(xt, y_train)
print(svc.score(xv, y_val))

[ 0.09704798  0.16800828  0.22970007  0.28359506  0.33228377  0.37540674
  0.40812653  0.43696591  0.46458662  0.48815697  0.50924915  0.52947932
  0.54663777  0.56355911  0.5793457   0.59417546  0.60742122  0.62019002
  0.63206053  0.64358664]
[LibSVM]0.9698
CPU times: user 30.3 s, sys: 18 s, total: 48.3 s
Wall time: 31.5 s


In [8]:
%%time
# Baseline: random forest trained on the raw (un-projected) features.
# random_state pins the forest's internal randomness so the printed score is
# reproducible across Restart & Run All.
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)
print(rf.score(x_val, y_val))

0.9465
CPU times: user 4.9 s, sys: 2.73 ms, total: 4.9 s
Wall time: 4.93 s
