In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

Use SVM to classify data from "SVM_data_dna.csv" file. Optimize hyperparameters using grid search.

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('./../../Data/SVM_data_dna.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,142,143,144,145,146,147,148,149,150,151
0,0,G,T,C,A,T,G,A,T,C,...,A,T,G,A,A,T,G,C,C,0.0
1,1,T,C,G,G,A,G,G,A,A,...,T,A,A,A,C,T,T,C,T,1.0
2,2,G,T,C,A,C,A,T,G,A,...,A,T,A,A,C,A,G,G,C,0.0
3,3,T,T,T,T,C,A,A,G,C,...,A,A,T,C,C,T,G,A,A,0.0
4,4,T,T,A,C,A,T,T,C,T,...,T,A,A,G,G,A,A,A,T,1.0


In [5]:
data.drop('Unnamed: 0',inplace=True,axis = 1)

In [6]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,142,143,144,145,146,147,148,149,150,151
0,G,T,C,A,T,G,A,T,C,C,...,A,T,G,A,A,T,G,C,C,0.0
1,T,C,G,G,A,G,G,A,A,C,...,T,A,A,A,C,T,T,C,T,1.0
2,G,T,C,A,C,A,T,G,A,T,...,A,T,A,A,C,A,G,G,C,0.0
3,T,T,T,T,C,A,A,G,C,T,...,A,A,T,C,C,T,G,A,A,0.0
4,T,T,A,C,A,T,T,C,T,T,...,T,A,A,G,G,A,A,A,T,1.0


In [7]:
data = data.head(3000)

In [8]:
Y = data['151']

In [9]:
X = data.drop('151',axis=1)

In [10]:
X = pd.get_dummies(X,columns=None,drop_first=True)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=58)

In [13]:
svr = SVC(gamma='auto')

In [14]:
svr.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [15]:
svr.score(X_test,y_test)

0.6494949494949495

In [16]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

In [17]:
clf = GridSearchCV(svr, parameters)

In [18]:
clf.get_params()

{'cv': 'warn',
 'error_score': 'raise-deprecating',
 'estimator__C': 1.0,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'auto',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False),
 'iid': 'warn',
 'n_jobs': None,
 'param_grid': {'kernel': ('linear', 'rbf'), 'C': [1, 10]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [19]:
clf.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [20]:
clf.score(X_test,y_test)

0.6494949494949495

# Count vectorizer 

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
data_test = data.drop('151',axis=1)

In [23]:
text = [' '.join(data_test.loc[i].values) for i in range(data_test.shape[0])]

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np

def tokenizer(s):
    width = 1
    return [s[i:i+width] for i in range(len(s)-width+1)]

def count_chunks(sequence_list):
    vectorizer = CountVectorizer(tokenizer=tokenizer)
    X = vectorizer.fit_transform(sequence_list)
    #     counts = (X.toarray()>0).astype(int).sum(axis=0)
    counts  = X.toarray()
    return vectorizer.get_feature_names(), counts
def count_chunks_tfidf(sequence_list):
    vectorizer = TfidfVectorizer(tokenizer=tokenizer)
    X = vectorizer.fit_transform(sequence_list)
    #     counts = (X.toarray()>0).astype(int).sum(axis=0)
    counts  = X.toarray()
    return vectorizer.get_feature_names(), counts

In [34]:
down_names, down_counts = count_chunks(text)
# not_down_names, not_down_counts = count_chunks(not_down_list)

In [35]:
down_counts

array([[150,  53,  35,  28,  35],
       [150,  47,  31,  25,  48],
       [150,  47,  32,  35,  37],
       ...,
       [150,  58,  25,  30,  38],
       [150,  48,  39,  26,  38],
       [150,  40,  33,  37,  41]])

In [36]:
down_names

[' ', 'a', 'c', 'g', 't']

In [37]:
down_counts[:,1:]

array([[53, 35, 28, 35],
       [47, 31, 25, 48],
       [47, 32, 35, 37],
       ...,
       [58, 25, 30, 38],
       [48, 39, 26, 38],
       [40, 33, 37, 41]])

In [38]:
X_train, X_test, y_train, y_test = train_test_split(down_counts[:,1:], data['151'].head(3000), test_size=0.33, random_state=54)

In [39]:
clf.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [40]:
clf.score(X_test,y_test)

0.6777777777777778

#TfidfVectorizer

In [41]:
down_names, down_counts = count_chunks_tfidf(text)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(down_counts[:,1:], data['151'].head(3000), test_size=0.33, random_state=54)

In [43]:
clf.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [44]:
clf.score(X_test,y_test)

0.6777777777777778

In [45]:
clf = GridSearchCV(svr, parameters)

In [46]:
clf.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [47]:
clf.score(X_test,y_test)

0.6777777777777778

# I have took only the part of the data so the accuracy isn't so high, but as we see using countizers, or using our data rows as sentences and counting the number of each letter in it, and then using the data of numbers, we can some how fit our model occurding only numbers. In some sense it made accuracy better.