## Initialization

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
model_accuracies = {'LogReg':1, 'DT':1, 'RF':1, 'LinearSVC':1, 'KernelSVC':1, 'NB':1, 'KNN':1}

## Importing the data

In [4]:
dataset = pd.read_csv('amazon_cells_labelled.txt', header = None, delimiter = '\t', quoting = 3)
dataset.shape

(1000, 2)

In [5]:
dataset.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


## Preprocess the Data

In [6]:
corpus = []

In [9]:
for i in range(0, dataset.shape[0]):
    sent = re.sub('[^a-zA-Z]', ' ', dataset[0][i])
    sent = sent.lower()
    sent = sent.split()
    ps = PorterStemmer()
    sent = [ps.stem(word) for word in sent if not word in set(stopwords.words('english'))]
    sent = ' '.join(sent)
    corpus.append(sent)

In [10]:
corpus

['way plug us unless go convert',
 'good case excel valu',
 'great jawbon',
 'tie charger convers last minut major problem',
 'mic great',
 'jiggl plug get line right get decent volum',
 'sever dozen sever hundr contact imagin fun send one one',
 'razr owner must',
 'needless say wast money',
 'wast money time',
 'sound qualiti great',
 'impress go origin batteri extend batteri',
 'two seper mere ft start notic excess static garbl sound headset',
 'good qualiti though',
 'design odd ear clip comfort',
 'highli recommend one blue tooth phone',
 'advis everyon fool',
 'far good',
 'work great',
 'click place way make wonder long mechan would last',
 'went motorola websit follow direct could get pair',
 'bought use kindl fire absolut love',
 'commerci mislead',
 'yet run new batteri two bar three day without charg',
 'bought mother problem batteri',
 'great pocket pc phone combin',
 'own phone month say best mobil phone',
 'think instruct provid help',
 'peopl couldnt hear talk pull earph

## Create Bag of Words Model

In [17]:
cv = CountVectorizer(max_features = 1300)

In [18]:
X = cv.fit_transform(corpus).toarray()

In [19]:
Y = dataset.iloc[:, 1].values

In [20]:
X.shape

(1000, 1300)

In [21]:
Y.shape

(1000,)

## Create Train and Test Data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [23]:
X_train.shape

(800, 1300)

In [24]:
X_test.shape

(200, 1300)

In [25]:
Y_train.shape

(800,)

In [26]:
Y_test.shape

(200,)

In [27]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
0,413
1,387


In [28]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
1,113
0,87


## DecisionTree

In [29]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [30]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [31]:
Y_pred_dt = clf_dt.predict(X_test)

In [32]:
confusion_matrix(Y_pred_dt, Y_test)

array([[66, 34],
       [21, 79]], dtype=int64)

## Random Forest

In [33]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [34]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
Y_pred_rf = clf_rf.predict(X_test)

In [36]:
confusion_matrix(Y_pred_rf, Y_test)

array([[77, 40],
       [10, 73]], dtype=int64)

## Naive Bayes

In [37]:
clf_nb = GaussianNB()

In [38]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [39]:
Y_pred_nb = clf_nb.predict(X_test)

In [40]:
confusion_matrix(Y_pred_nb, Y_test)

array([[54, 20],
       [33, 93]], dtype=int64)

## KNN

In [41]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [42]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [43]:
Y_pred_knn = clf_knn.predict(X_test)

In [44]:
confusion_matrix(Y_pred_knn, Y_test)

array([[55, 31],
       [32, 82]], dtype=int64)

## Logistic Regression

In [45]:
clf_lr = LogisticRegression()

In [46]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [47]:
Y_pred_lr = clf_lr.predict(X_test)

In [48]:
confusion_matrix(Y_pred_lr, Y_test)

array([[74, 32],
       [13, 81]], dtype=int64)

## Linear SVC

In [49]:
clf_lsvc = SVC(kernel = 'linear')

In [50]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [51]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [52]:
confusion_matrix(Y_pred_lsvc, Y_test)

array([[67, 31],
       [20, 82]], dtype=int64)

## Kernel SVC

In [53]:
clf_ksvc = SVC(kernel = 'rbf')

In [54]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [55]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [56]:
confusion_matrix(Y_pred_ksvc, Y_test)

array([[ 87, 113],
       [  0,   0]], dtype=int64)

## Accuracy of Various Models

In [57]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.72499999999999998,
 'KNN': 0.68500000000000005,
 'KernelSVC': 0.435,
 'LinearSVC': 0.745,
 'LogReg': 0.77500000000000002,
 'NB': 0.73499999999999999,
 'RF': 0.75}