# Data Science London - SK-Learn

<hr/>

This is a synthetic data set of 40 features, representing objects from two classes (labeled as 0 or 1). The training set has 1000 samples and the testing set has 9000.

In [71]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
from sklearn import svm, preprocessing, neighbors
from sklearn.model_selection import train_test_split


In [2]:
path = "./data"

In [3]:
train_data_features = os.path.join(path, 'train.csv')
train_data_labels = os.path.join(path, 'trainLabels.csv')
test_data = os.path.join(path, 'test.csv')

In [4]:
df_train = pd.read_csv(train_data_features)
df_train.head()

Unnamed: 0,0.29940251144353242,-1.2266241875260637,1.4984250500215328,-1.1761503610375272,5.2898525545597037,0.20829711393323402,2.4044983672405826,1.5945062220589785,-0.051608163273514231,0.66323431039687908,...,-0.85046544625016463,-0.62298999638261954,-1.8330573433160038,0.29302438506869571,3.5526813410266507,0.71761099417552265,3.3059719748508889,-2.7155588147154619,-2.6824085866346223,0.10105047232890663
0,-1.174176,0.332157,0.949919,-1.285328,2.199061,-0.151268,-0.427039,2.619246,-0.765884,-0.09378,...,-0.81975,0.012037,2.038836,0.468579,-0.517657,0.422326,0.803699,1.213219,1.382932,-1.817761
1,1.192222,-0.414371,0.067054,-2.233568,3.658881,0.089007,0.203439,-4.219054,-1.184919,-1.24031,...,-0.604501,0.750054,-3.360521,0.856988,-2.751451,-1.582735,1.672246,0.656438,-0.932473,2.987436
2,1.57327,-0.580318,-0.866332,-0.603812,3.125716,0.870321,-0.161992,4.499666,1.038741,-1.092716,...,1.022959,1.275598,-3.48011,-1.065252,2.153133,1.563539,2.767117,0.215748,0.619645,1.883397
3,-0.613071,-0.644204,1.112558,-0.032397,3.490142,-0.011935,1.443521,-4.290282,-1.761308,0.807652,...,0.513906,-1.803473,0.518579,-0.205029,-4.744566,-1.520015,1.830651,0.870772,-1.894609,0.408332
4,-0.773247,-0.123227,0.047423,-0.210266,10.377793,0.526604,-2.751616,0.315541,0.608603,-0.043421,...,-1.487714,0.79279,-0.540711,0.114115,-0.277477,-0.896411,-2.805207,0.469162,3.614157,0.081689


In [7]:
print("Data Size: ", len(df_train))

Data Size:  999


In [11]:
df_labels = pd.read_csv(train_data_labels)
df_labels.head()

Unnamed: 0,1
0,0
1,0
2,1
3,0
4,1


## Processing Data

In [12]:
df_train.fillna(value=-99999, inplace=True)
df_labels.fillna(value=-99999, inplace=True)

In [13]:
df_train

Unnamed: 0,0.29940251144353242,-1.2266241875260637,1.4984250500215328,-1.1761503610375272,5.2898525545597037,0.20829711393323402,2.4044983672405826,1.5945062220589785,-0.051608163273514231,0.66323431039687908,...,-0.85046544625016463,-0.62298999638261954,-1.8330573433160038,0.29302438506869571,3.5526813410266507,0.71761099417552265,3.3059719748508889,-2.7155588147154619,-2.6824085866346223,0.10105047232890663
0,-1.174176,0.332157,0.949919,-1.285328,2.199061,-0.151268,-0.427039,2.619246,-0.765884,-0.093780,...,-0.819750,0.012037,2.038836,0.468579,-0.517657,0.422326,0.803699,1.213219,1.382932,-1.817761
1,1.192222,-0.414371,0.067054,-2.233568,3.658881,0.089007,0.203439,-4.219054,-1.184919,-1.240310,...,-0.604501,0.750054,-3.360521,0.856988,-2.751451,-1.582735,1.672246,0.656438,-0.932473,2.987436
2,1.573270,-0.580318,-0.866332,-0.603812,3.125716,0.870321,-0.161992,4.499666,1.038741,-1.092716,...,1.022959,1.275598,-3.480110,-1.065252,2.153133,1.563539,2.767117,0.215748,0.619645,1.883397
3,-0.613071,-0.644204,1.112558,-0.032397,3.490142,-0.011935,1.443521,-4.290282,-1.761308,0.807652,...,0.513906,-1.803473,0.518579,-0.205029,-4.744566,-1.520015,1.830651,0.870772,-1.894609,0.408332
4,-0.773247,-0.123227,0.047423,-0.210266,10.377793,0.526604,-2.751616,0.315541,0.608603,-0.043421,...,-1.487714,0.792790,-0.540711,0.114115,-0.277477,-0.896411,-2.805207,0.469162,3.614157,0.081689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,-0.310429,0.826811,-0.952245,0.768850,1.877520,1.320646,1.944609,1.191420,-0.127724,0.070937,...,-0.600411,-0.383792,0.745596,-0.698598,-2.729937,-0.431535,0.372873,1.019092,-2.672811,-0.295141
995,-1.853879,0.246726,0.459921,-2.074267,7.599220,-0.138355,-4.501900,0.630634,-1.590533,-1.112949,...,0.361736,0.240052,-0.856196,-0.072481,-2.935896,0.582411,-2.613407,0.036687,2.809310,4.412567
996,0.912748,-1.734039,-1.047035,0.217573,13.457812,0.162771,-2.250521,2.216161,-0.378326,0.642114,...,1.195896,-1.073806,-2.754369,1.814864,-4.190105,-1.116441,-2.100125,0.061513,0.895536,0.813686
997,2.439780,-0.735511,-0.902426,1.365036,-10.430299,-0.856859,2.686474,0.292035,0.585388,-0.876965,...,2.262326,-0.039488,0.773876,-0.916066,2.604827,-0.649874,-3.423674,0.229748,-2.311088,-3.422217


In [64]:
X = np.array(df_train)
#X = preprocessing.scale(X)
X.shape

(999, 40)

In [65]:
y = np.array(df_labels)
y.shape

(999, 1)

### Split train/test data

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)


## Train Model
We will be experimenting with SVM and KNN. 
For SVM, kernels used: 
- linear
- poly
- rbf
- sigmoid

### SVM

In [67]:
clf = svm.SVR(kernel='rbf')

In [68]:
clf.fit(X_train, y_train.ravel())

SVR()

In [69]:
confidence = clf.score(X_test, y_test)
confidence

0.7120472729165127

### KNN


In [124]:
X = np.array(df_train)
#X = preprocessing.scale(X)
X.shape

(999, 40)

In [125]:
y = np.array(df_labels)
y.shape

(999, 1)

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [127]:
clf_2 = neighbors.KNeighborsClassifier() 

In [128]:
clf_2.fit(X_train, y_train.ravel())


KNeighborsClassifier()

In [129]:
confidence = clf_2.score(X_test, y_test)
confidence

0.94

## Testing 


In [203]:
df_test = pd.read_csv("./data/test.csv")

In [217]:
len(df_test)

8999

In [161]:
df_test

Unnamed: 0,2.8089094884322816,-0.2428941541280098,-0.54642134078742799,0.25516185655651813,1.7497359401586778,-0.030457614747139938,-1.3220705648016891,3.5780706827705329,-0.66757846469285065,-0.88425685382194696,...,-0.26168811207868709,-0.22437538915802702,-1.6756055243798111,-0.47958360985977283,-0.24438814509352236,-0.67235457360991246,0.51786002310577695,0.010664676497809677,-0.41921432660528302,2.8183870993565581
0,-0.374101,0.537669,0.081063,0.756773,0.915231,2.557282,3.703187,1.673835,-0.764122,-1.228040,...,-0.969463,0.574154,-2.200519,-1.612240,0.179031,-2.924596,0.643610,-1.470939,-0.067408,-0.976265
1,-0.088370,0.154743,0.380716,-1.176126,1.699867,-0.258627,-1.384999,1.093584,1.596633,0.230631,...,-0.769885,-0.005143,1.467490,0.483803,-3.542981,0.814561,-1.652948,1.265866,-1.749248,1.773784
2,-0.685635,0.501283,1.873375,0.215224,-3.983468,-0.103637,4.136113,-0.225431,-1.515015,-1.071763,...,0.968609,2.386412,-0.131219,0.285646,2.302069,1.255588,-1.563090,-0.125258,-1.030761,-2.945329
3,0.350867,0.721897,-0.477104,-1.748776,-2.627405,1.075433,4.954253,-3.293501,-0.760369,0.204360,...,0.260553,-2.045650,-2.173227,0.372992,0.450700,-0.211657,1.301359,-0.522164,2.484883,0.039213
4,-0.424627,1.536196,-1.037752,-0.156466,-2.945038,-0.471607,3.494966,-2.763629,0.819540,0.209529,...,-0.727066,-0.809620,4.504230,-0.481077,-2.923646,-0.468034,-0.846214,1.197350,-5.615563,2.049134
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8994,0.171644,-0.806952,-2.045671,0.021156,2.258491,0.429469,0.857187,0.972600,1.707492,1.676370,...,-1.366312,0.276543,-0.732764,0.243930,-1.151233,-0.274298,0.573013,1.109814,-1.905965,1.457601
8995,1.168564,-0.911253,1.685492,0.867183,3.606170,-0.673875,-1.889365,0.411385,-0.206817,-0.705771,...,0.557757,0.379841,-1.474198,-0.322943,1.964519,0.122384,0.678023,2.024129,0.386542,1.104493
8996,0.052274,-1.736558,-0.263699,-0.219329,8.918393,-1.258320,-3.361146,0.893366,-0.631669,1.887286,...,2.117847,-1.050824,0.182872,0.242725,0.670161,0.112752,-3.006949,1.179606,1.156340,-1.218561
8997,1.443659,0.651892,0.550724,-1.146664,2.621641,-0.867143,0.312742,1.078004,-1.212524,-0.028143,...,0.631480,1.186236,-1.098508,1.159658,-1.957241,0.482533,3.777669,-0.424954,1.333374,2.325271


In [164]:
prediction = clf_2.predict(test_data)

In [166]:
prediction[:10]

array([0, 1, 0, 0, 0, 0, 1, 0, 0, 1], dtype=int64)