In [1]:
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
# Input CSV, and the .npy path (same base name, under ../results/) that the
# per-fold scores are written to at the end of the notebook.
in_file = "../../Features Extracted/UCMerced/Finetune/Single/UCM-VGGCaffe-refc8.csv"
out_file = "../results/{}.npy".format(in_file.rsplit("/", 1)[-1].split(".csv")[0])
df = pd.read_csv(in_file)

In [3]:
# Preview the first rows of the frame as loaded from in_file.
df.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,334,335,336,337,338,339,340,341,342,343
0,agricultural00.tif,0.946235,1.030498,0.898138,1.150152,1.065596,0.917944,0.939433,0.953776,1.08603,...,0.957576,1.089319,1.065341,0.973886,1.045439,1.025247,1.174775,0.954651,1.043507,1.057201
1,agricultural01.tif,0.946507,1.060595,0.878008,1.159675,1.050147,0.923151,0.949911,0.973858,1.089424,...,0.91949,1.086226,1.079319,0.951046,1.031299,0.998456,1.189871,0.943276,1.035779,1.063265
2,agricultural02.tif,0.961766,1.055779,0.903289,1.110943,1.05843,0.941561,0.931594,0.969891,1.132722,...,0.911914,1.067518,1.107916,0.934373,1.036289,1.054119,1.203782,0.946681,1.064551,1.066189
3,agricultural03.tif,0.948614,1.051583,0.876057,1.138046,1.042528,0.919195,0.939893,0.97252,1.07892,...,0.947991,1.070118,1.08218,0.970282,1.033854,1.004663,1.179627,0.948583,1.038661,1.050219
4,agricultural04.tif,0.966402,0.994531,1.029453,1.106425,1.094501,0.932725,0.950367,0.94433,1.036466,...,0.952698,0.987855,1.022561,0.990287,0.97654,1.071111,1.052742,1.051731,0.872228,0.919054


In [4]:
# Shuffle the rows. A dedicated, seeded RandomState is used instead of the
# global np.random state so the row order -- and therefore everything derived
# from it further down -- is the same on every Restart & Run All.
df = df.reindex(np.random.RandomState(0).permutation(df.index))

df.head()

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,334,335,336,337,338,339,340,341,342,343
1017,harbor17.tif,0.974355,1.014932,1.00708,0.942941,1.047291,0.949828,0.919423,0.868298,1.019491,...,0.920576,0.934521,0.979613,1.028444,1.049613,1.069313,1.099031,1.025011,1.084233,0.941343
436,buildings36.tif,1.017726,0.885048,1.02472,1.008001,1.030148,0.971964,0.891728,1.033487,0.999824,...,0.996999,0.891553,0.976763,0.880175,1.064472,0.846469,1.048461,1.026634,1.134158,1.013339
743,forest43.tif,0.930541,1.139114,0.931241,1.138047,1.044001,0.927611,1.05828,0.964141,1.016266,...,0.967311,1.058897,0.980935,1.001424,1.063016,0.893706,1.040087,0.903487,1.046708,1.030759
1264,mediumresidential64.tif,0.988911,0.926869,1.082446,0.92508,1.088902,0.928442,0.85425,1.044131,0.897101,...,1.050075,0.891576,1.028022,1.013089,1.106627,0.768938,0.99866,0.985087,1.104827,1.001942
486,buildings86.tif,0.994447,1.001471,0.983422,0.98601,1.021182,0.967756,0.861515,1.032596,0.971749,...,0.987197,0.933525,0.964067,0.973818,0.996211,0.892257,1.0737,0.952329,1.060963,1.02771


In [5]:
# Feature matrix: every column except the first one (the first column holds
# the image file name, as shown in the preview above).
X = np.array(df.iloc[:,1:])

import re
def getdigit(text):
    """Return every maximal run of decimal digits found in ``text``, in order.

    The result is a list of strings; it is empty when ``text`` has no digits.
    """
    # Raw string literal: '\d' inside a plain string is an invalid escape
    # sequence that newer Python versions warn about.
    return re.findall(r'\d+', text)
# Class name = the part of each file name that precedes its first run of
# digits (e.g. "agricultural00.tif" -> "agricultural").
labels = [i.split(getdigit(i)[0])[0] for i in df.iloc[:,0]]

from sklearn import preprocessing
# Map the class-name strings to integer codes.
le = preprocessing.LabelEncoder()
y = le.fit_transform(labels)

# Single-argument print(...) calls produce the same text on Python 2 and on
# Python 3, unlike the print statement, which is a SyntaxError on Python 3.
print("feature data is shape {}".format(X.shape))
print("label data is shape {}".format(y.shape))

feature data is shape (2100, 344)
label data is shape (2100,)


In [6]:
# Stand-alone scaler (separate from the one inside the pipeline); the
# evaluation loop re-fits it on each fold's training split.
sc = StandardScaler()

In [7]:
# Candidate values for the SVC penalty C: 100 log-spaced points from 1e-10 to 1e1.
param_range = np.logspace(-10, 1, 100)
param_grid = [{'clf__C': param_range}]

# Standardize the features, then classify with a linear-kernel SVC.
pipe_svc = Pipeline(steps=[
    ('sc1', StandardScaler()),
    ('clf', SVC(kernel='linear')),
])

# Grid search over C with cv=10, spread over 14 parallel jobs.
gs = GridSearchCV(pipe_svc, param_grid, cv=10, n_jobs=14)

In [8]:
# Outer cross-validation splitter: 5 stratified folds built from the encoded
# labels y. It is iterated directly to obtain (train_index, test_index) pairs.
skf = StratifiedKFold(y, n_folds=5)


In [9]:
# Outer evaluation loop: for every train/test split yielded by skf, tune C with
# the grid search on the training split only, then fit a linear SVC with that C
# on the standardized training split and score it on the held-out split.
scores = []
for train_index, test_index in skf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    gs.fit(X_train, y_train)
    # Look the tuned value up by its parameter name. `.values()[0]` only works
    # on Python 2 and relies on the grid containing exactly one parameter.
    clf = SVC(kernel='linear', C=gs.best_params_['clf__C'])
    # The scaler is fitted on the training split and only applied to the test
    # split, so no test-set statistics enter the model.
    X_trainsc = sc.fit_transform(X_train)
    X_testsc = sc.transform(X_test)
    clf.fit(X_trainsc, y_train)
    s = clf.score(X_testsc, y_test)
    # Single-argument print(...) gives the same text on Python 2 and Python 3.
    print("Score is {}".format(s))
    scores.append(s)

Score is 0.902380952381
Score is 0.930952380952
Score is 0.888095238095
Score is 0.92619047619
Score is 0.933333333333


In [10]:
# Average of the per-fold scores collected by the evaluation loop.
np.mean(scores)

0.91619047619047633

In [10]:
# Show the individual per-fold scores.
scores

[0.86904761904761907,
 0.87380952380952381,
 0.88571428571428568,
 0.8666666666666667,
 0.88571428571428568]

In [11]:
# C chosen by the grid search's most recent fit (i.e. the last fold processed).
# Keyed lookup instead of `.values()[0]`, which fails on Python 3 and depends
# on the grid containing a single parameter.
gs.best_params_['clf__C']

0.00278255940220712

In [13]:
# Persist the per-fold scores as a .npy array at out_file.
np.save(file=out_file, arr=scores)