In [96]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [97]:
train_path =  os.path.abspath('data/iris_train.csv')
test_path =  os.path.abspath('data/iris_test.csv')

In [98]:
train_pd = pd.read_csv(train_path , usecols=[1, 2, 3, 4, 5])
test_pd = pd.read_csv(test_path, usecols=[1, 2, 3, 4])

In [99]:
train_pd.shape


(110, 5)

In [100]:
train_pd.info


<bound method DataFrame.info of        0    1    2    3           labels
0    6.3  2.9  5.6  1.8   Iris-virginica
1    5.6  3.0  4.5  1.5  Iris-versicolor
2    7.9  3.8  6.4  2.0   Iris-virginica
3    6.5  2.8  4.6  1.5  Iris-versicolor
4    5.0  2.0  3.5  1.0  Iris-versicolor
5    5.0  3.6  1.4  0.2      Iris-setosa
6    5.5  2.5  4.0  1.3  Iris-versicolor
7    7.7  3.0  6.1  2.3   Iris-virginica
8    4.8  3.4  1.6  0.2      Iris-setosa
9    5.1  3.5  1.4  0.2      Iris-setosa
10   6.0  2.7  5.1  1.6  Iris-versicolor
11   6.7  3.1  4.4  1.4  Iris-versicolor
12   6.7  3.0  5.0  1.7  Iris-versicolor
13   5.5  2.6  4.4  1.2  Iris-versicolor
14   5.2  3.5  1.5  0.2      Iris-setosa
15   7.2  3.0  5.8  1.6   Iris-virginica
16   6.6  3.0  4.4  1.4  Iris-versicolor
17   7.1  3.0  5.9  2.1   Iris-virginica
18   5.6  2.7  4.2  1.3  Iris-versicolor
19   5.4  3.0  4.5  1.5  Iris-versicolor
20   4.9  3.0  1.4  0.2      Iris-setosa
21   5.1  3.5  1.4  0.3      Iris-setosa
22   5.0  2.3  3.3  1.0  

In [101]:
train_pd.head()


Unnamed: 0,0,1,2,3,labels
0,6.3,2.9,5.6,1.8,Iris-virginica
1,5.6,3.0,4.5,1.5,Iris-versicolor
2,7.9,3.8,6.4,2.0,Iris-virginica
3,6.5,2.8,4.6,1.5,Iris-versicolor
4,5.0,2.0,3.5,1.0,Iris-versicolor


In [102]:
train_pd["labels"]

0       Iris-virginica
1      Iris-versicolor
2       Iris-virginica
3      Iris-versicolor
4      Iris-versicolor
5          Iris-setosa
6      Iris-versicolor
7       Iris-virginica
8          Iris-setosa
9          Iris-setosa
10     Iris-versicolor
11     Iris-versicolor
12     Iris-versicolor
13     Iris-versicolor
14         Iris-setosa
15      Iris-virginica
16     Iris-versicolor
17      Iris-virginica
18     Iris-versicolor
19     Iris-versicolor
20         Iris-setosa
21         Iris-setosa
22     Iris-versicolor
23         Iris-setosa
24      Iris-virginica
25      Iris-virginica
26         Iris-setosa
27         Iris-setosa
28         Iris-setosa
29         Iris-setosa
            ...       
80         Iris-setosa
81      Iris-virginica
82     Iris-versicolor
83      Iris-virginica
84      Iris-virginica
85     Iris-versicolor
86         Iris-setosa
87         Iris-setosa
88      Iris-virginica
89     Iris-versicolor
90      Iris-virginica
91      Iris-virginica
92      Iri

In [103]:
y_train = train_pd['labels']
y_train.value_counts()

Iris-versicolor    38
Iris-setosa        37
Iris-virginica     35
Name: labels, dtype: int64

In [104]:
y, mapping = pd.factorize(y_train)

In [105]:
y

array([0, 1, 0, 1, 1, 2, 1, 0, 2, 2, 1, 1, 1, 1, 2, 0, 1, 0, 1, 1, 2, 2,
       1, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 0, 2, 0, 2, 1,
       2, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 1, 0, 2, 0, 1, 0, 0, 1, 2, 2,
       0, 1, 0, 0, 0, 0, 2, 1, 2, 0, 1, 0, 1, 0, 2, 2, 2, 0, 1, 2, 0, 2])

In [106]:
mapping

Index([u'Iris-virginica', u'Iris-versicolor', u'Iris-setosa'], dtype='object')

In [107]:
X_train = train_pd.drop(labels = ["labels"],axis = 1) 

In [108]:
X_train

Unnamed: 0,0,1,2,3
0,6.3,2.9,5.6,1.8
1,5.6,3.0,4.5,1.5
2,7.9,3.8,6.4,2.0
3,6.5,2.8,4.6,1.5
4,5.0,2.0,3.5,1.0
5,5.0,3.6,1.4,0.2
6,5.5,2.5,4.0,1.3
7,7.7,3.0,6.1,2.3
8,4.8,3.4,1.6,0.2
9,5.1,3.5,1.4,0.2


In [109]:
X = np.array(X_train)

In [110]:
X

array([[6.3, 2.9, 5.6, 1.8],
       [5.6, 3. , 4.5, 1.5],
       [7.9, 3.8, 6.4, 2. ],
       [6.5, 2.8, 4.6, 1.5],
       [5. , 2. , 3.5, 1. ],
       [5. , 3.6, 1.4, 0.2],
       [5.5, 2.5, 4. , 1.3],
       [7.7, 3. , 6.1, 2.3],
       [4.8, 3.4, 1.6, 0.2],
       [5.1, 3.5, 1.4, 0.2],
       [6. , 2.7, 5.1, 1.6],
       [6.7, 3.1, 4.4, 1.4],
       [6.7, 3. , 5. , 1.7],
       [5.5, 2.6, 4.4, 1.2],
       [5.2, 3.5, 1.5, 0.2],
       [7.2, 3. , 5.8, 1.6],
       [6.6, 3. , 4.4, 1.4],
       [7.1, 3. , 5.9, 2.1],
       [5.6, 2.7, 4.2, 1.3],
       [5.4, 3. , 4.5, 1.5],
       [4.9, 3. , 1.4, 0.2],
       [5.1, 3.5, 1.4, 0.3],
       [5. , 2.3, 3.3, 1. ],
       [5.2, 3.4, 1.4, 0.2],
       [6.4, 2.8, 5.6, 2.1],
       [5.9, 3. , 5.1, 1.8],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5. , 3.5, 1.3, 0.3],
       [5.8, 4. , 1.2, 0.2],
       [5.1, 3.8, 1.6, 0.2],
       [5.4, 3.7, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [5.5, 2.3, 4. , 1.3],
       [5.1, 2

In [111]:
svc = SVC(kernel='linear', C=1E10)
svc.fit(X, y)

SVC(C=10000000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [112]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print 'Accuracy of svc on training set: {:.2f}'.format(svc.score(X_train, y_train))
print('Accuracy of svc on test set: {:.2f}'.format(svc.score(X_test, y_test)))

Accuracy of svc on training set: 1.00
Accuracy of svc on test set: 1.00


In [113]:
X_test = np.array(test_pd)

In [114]:
X_test 

array([[7.2, 3.6, 6.1, 2.5],
       [4.7, 3.2, 1.3, 0.2],
       [4.4, 3.2, 1.3, 0.2],
       [4.5, 2.3, 1.3, 0.3],
       [6.4, 3.2, 4.5, 1.5],
       [6.3, 2.5, 4.9, 1.5],
       [4.6, 3.1, 1.5, 0.2],
       [5.9, 3.2, 4.8, 1.8],
       [5.9, 3. , 4.2, 1.5],
       [6.2, 3.4, 5.4, 2.3],
       [6.1, 2.8, 4. , 1.3],
       [6. , 3. , 4.8, 1.8],
       [6.4, 2.8, 5.6, 2.2],
       [6.4, 2.9, 4.3, 1.3],
       [5.6, 2.9, 3.6, 1.3],
       [4.3, 3. , 1.1, 0.1],
       [6.5, 3. , 5.8, 2.2],
       [6.7, 2.5, 5.8, 1.8],
       [5.8, 2.8, 5.1, 2.4],
       [6.7, 3.1, 5.6, 2.4],
       [5.4, 3.4, 1.7, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [4.8, 3. , 1.4, 0.1],
       [6.3, 2.8, 5.1, 1.5],
       [5.4, 3.9, 1.7, 0.4],
       [5.8, 2.7, 5.1, 1.9],
       [5. , 3.4, 1.5, 0.2],
       [5.8, 2.7, 3.9, 1.2],
       [6.1, 2.6, 5.6, 1.4],
       [6.2, 2.8, 4.8, 1.8],
       [6.1, 2.8, 4.7, 1.2],
       [6.9, 3.1, 5.4, 2.1],
       [4.6, 3.2, 1.4, 0.2],
       [5.4, 3.9, 1.3, 0.4],
       [4.9, 2

In [115]:
predicted = svc.predict(X_test)

In [116]:
predicted

array([0, 2, 2, 2, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 2, 0, 0, 0, 0, 2, 2,
       2, 1, 2, 0, 2, 1, 1, 0, 1, 0, 2, 2, 1, 1, 1, 0, 1, 2])

In [117]:
real_prediction = dict(zip(range(3), mapping))
y_pred = np.vectorize(real_prediction.get)(predicted)

In [118]:
y_pred

array(['Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa'], dtype='|S15')

In [123]:
test_predictions = pd.DataFrame()
test_predictions['predictions'] = y_pred
test_predictions.to_csv('submissions/Iris_submission.csv', index=True)