## SVM on Amazon Food Reviews

DataSet Source: https://www.kaggle.com/snap/amazon-fine-food-reviews

In [1]:
#Importing all Necessory Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn import cross_validation
import warnings
warnings.filterwarnings('ignore')



In [2]:
#Loding Bow data:
import pickle
with open('train_bow.pickle','rb') as handle:
    train_bow = pickle.load(handle)
with open('test_bow.pickle','rb') as handle:
    test_bow = pickle.load(handle)

In [3]:
#Loding Tfidf data:
import pickle
with open('train_tfidf.pickle','rb') as handle:
    train_tfidf = pickle.load(handle)
with open('test_tfidf.pickle','rb') as handle:
    test_tfidf = pickle.load(handle)

In [4]:
#Loding Avg Word2Vec data:
import pickle
with open('train_avg_word2vec.pickle','rb') as handle:
    train_avg_word2vec = pickle.load(handle)
with open('test_avg_word2vec.pickle','rb') as handle:
    test_avg_word2vec = pickle.load(handle)

In [5]:
#Loding Avg Word2Vec data:
import pickle
with open('train_tfidf_word2vec.pickle','rb') as handle:
    train_tfidf_word2vec = pickle.load(handle)
with open('test_tfidf_word2vec.pickle','rb') as handle:
    test_tfidf_word2vec = pickle.load(handle)

In [6]:
with open('y_train.pickle','rb') as handle:
    y_train = pickle.load(handle)
with open('y_test.pickle','rb') as handle:
    y_test = pickle.load(handle)

In [7]:
with open('y_train_w.pickle','rb') as handle:
    y_train_w = pickle.load(handle)
with open('y_test_w.pickle','rb') as handle:
    y_test_w = pickle.load(handle)

In [8]:
from sklearn.preprocessing import normalize
#Normalizing each feature.
#After normalization,all the feature values lies between 0 and 1.
#Bow features
train_bow_normalize = normalize(train_bow, axis=0)
test_bow_normalize = normalize(test_bow, axis=0)
#Tfidf features
train_tfidf_normalize = normalize(train_tfidf, axis=0)
test_tfidf_normalize = normalize(test_tfidf, axis=0)
#Avg word2vec features
train_avgw2v_normalize = normalize(train_avg_word2vec, axis=0)
test_avgw2v_normalize = normalize(test_avg_word2vec, axis=0)
#Tfidf weighted word2vec features
train_tfidfw2v_normalize = normalize(train_tfidf_word2vec, axis=0)
test_tfidfw2v_normalize = normalize(test_tfidf_word2vec, axis=0)

### Featurization:Bag of Words

#### GridSearchCV

In [9]:
#Hyper Parameter tuning with GridSearchCV
from sklearn.grid_search import GridSearchCV
tuned_parameters = [{'C':[10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3]}, {'gamma':[10**-2, 10**-1, 10**0, 10**1, 10**2]}]
#Default RBF kernal.
model = GridSearchCV(SVC(), tuned_parameters, scoring='accuracy',n_jobs=-1)
model.fit(train_bow_normalize, y_train)
print(model.best_estimator_)
print(model.score(test_bow_normalize, y_test))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.8343333333333334


#### RandomizedSearchCV

In [10]:
#Hyper Parameter tuning with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
tuned_parameters = {'C':randint(10**-3, 10**3), 'gamma':randint(10**-2, 10**2)}
#Default RBF Kernal.
model = RandomizedSearchCV(SVC(), param_distributions = tuned_parameters, scoring='accuracy',n_jobs=-1)
model.fit(train_bow_normalize, y_train)
print(model.best_estimator_)
print(model.score(test_bow_normalize, y_test))

SVC(C=547, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.833


### Featurization:Tfidf

#### GridSearchCV

In [11]:
#Hyper Parameter tuning with GridSearchCV
from sklearn.grid_search import GridSearchCV
tuned_parameters = [{'C':[10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3]}, {'gamma':[10**-2, 10**-1, 10**0, 10**1, 10**2]}]
#Default RBF kernal.
model = GridSearchCV(SVC(), tuned_parameters, scoring='accuracy',n_jobs=-1)
model.fit(train_tfidf_normalize, y_train)
print(model.best_estimator_)
print(model.score(test_tfidf_normalize, y_test))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.832


#### RandomizedSearchCV

In [12]:
#Hyper Parameter tuning with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
tuned_parameters = {'C':randint(10**-3, 10**3), 'gamma':randint(10**-2, 10**2)}
#Default RBF Kernal.
model = RandomizedSearchCV(SVC(), param_distributions = tuned_parameters, scoring='accuracy',n_jobs=-1)
model.fit(train_tfidf_normalize, y_train)
print(model.best_estimator_)
print(model.score(test_tfidf_normalize, y_test))

SVC(C=296, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=53, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.8296666666666667


### Featurization:Avg Word2Vec

#### GridSearchCV

In [13]:
#Hyper Parameter tuning with GridSearchCV
from sklearn.grid_search import GridSearchCV
tuned_parameters = [{'C':[10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3]}, {'gamma':[10**-2, 10**-1, 10**0, 10**1, 10**2]}]
#Default RBF kernal.
model = GridSearchCV(SVC(), tuned_parameters, scoring='accuracy',n_jobs=-1)
model.fit(train_avgw2v_normalize, y_train_w)
print(model.best_estimator_)
print(model.score(test_avgw2v_normalize, y_test_w))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=100, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.8296666666666667


#### RandomizedSearchCV

In [14]:
#Hyper Parameter tuning with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
tuned_parameters = {'C':randint(10**-3, 10**3), 'gamma':randint(10**-2, 10**2)}
#Default RBF Kernal.
model = RandomizedSearchCV(SVC(), param_distributions = tuned_parameters, scoring='accuracy',n_jobs=-1)
model.fit(train_avgw2v_normalize, y_train_w)
print(model.best_estimator_)
print(model.score(test_avgw2v_normalize, y_test_w))

SVC(C=91, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=83, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.8296666666666667


### Featurization:Tfidf Weighted Word2Vec

#### GridSearchCV

In [15]:
#Hyper Parameter tuning with GridSearchCV
from sklearn.grid_search import GridSearchCV
tuned_parameters = [{'C':[10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3]}, {'gamma':[10**-2, 10**-1, 10**0, 10**1, 10**2]}]
#Default RBF kernal.
model = GridSearchCV(SVC(), tuned_parameters, scoring='accuracy',n_jobs=-1)
model.fit(train_tfidfw2v_normalize, y_train_w)
print(model.best_estimator_)
print(model.score(test_avgw2v_normalize, y_test_w))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=100, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.8296666666666667


#### RandomizedSearchCV

In [16]:
#Hyper Parameter tuning with RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
tuned_parameters = {'C':randint(10**-3, 10**3), 'gamma':randint(10**-2, 10**2)}
#Default RBF Kernal.
model = RandomizedSearchCV(SVC(), param_distributions = tuned_parameters, scoring='accuracy',n_jobs=-1)
model.fit(train_tfidfw2v_normalize, y_train_w)
print(model.best_estimator_)
print(model.score(test_tfidfw2v_normalize, y_test_w))

SVC(C=11, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=6, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
0.8296666666666667


#### Results: 

* Bag of Words(BOW)------------->C(HyperParameter)=1 ------->Acc=83.4%
* Term Frequency_Inverse Document frequency(TFIDF)----->C=1 ------->Acc=83.2%
* Average Word2Vec---------------->C=1 ------------->Acc=82.9%
* Tfidf Weighted Word2Vec------------>C=1 -------->Acc=82.9%
* Bag of words featurization gives best Accuracy.