In [1]:
import numpy as np 
import pandas as pd 
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

import nltk
import string
import re

# Fetch the NLTK stop-word lists that stopwords.words('english') reads during preprocessing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# TASK 1

In [2]:
# Load the labelled car reviews from the CSV file in the working directory.
df = pd.read_csv('car_reviews.csv')

In [3]:
# Display the frame as loaded, before the labels are recoded.
df

Unnamed: 0,Sentiment,Review
0,Neg,In 1992 we bought a new Taurus and we really ...
1,Neg,The last business trip I drove to San Franci...
2,Neg,My husband and I purchased a 1990 Ford F250 a...
3,Neg,I feel I have a thorough opinion of this truc...
4,Neg,AS a mother of 3 all of whom are still in ca...
...,...,...
1377,Pos,In June we bought the Sony Limited Edition Fo...
1378,Pos,After 140 000 miles we decided to replace my...
1379,Pos,The Ford Focus is a great little record setti...
1380,Pos,I needed a new car because my hyundai excel 9...


In [4]:
# Recode the text labels as 0 (negative) / 1 (positive).
# A single .loc[mask, column] assignment writes into df itself. The chained
# form df[col][mask] = value may write to a temporary copy instead, which
# raises SettingWithCopyWarning and is a silent no-op under copy-on-write.
df.loc[df['Sentiment'] == 'Neg', 'Sentiment'] = 0
df.loc[df['Sentiment'] == 'Pos', 'Sentiment'] = 1

In [5]:
# Show the dtype of each column.
df.dtypes

Sentiment    object
Review       object
dtype: object

In [6]:
# Cast the recoded labels to an integer dtype; the Review column is left as-is.
df = df.astype({'Sentiment': 'int'})

In [7]:
# Display the frame again to check the recoded Sentiment column.
df

Unnamed: 0,Sentiment,Review
0,0,In 1992 we bought a new Taurus and we really ...
1,0,The last business trip I drove to San Franci...
2,0,My husband and I purchased a 1990 Ford F250 a...
3,0,I feel I have a thorough opinion of this truc...
4,0,AS a mother of 3 all of whom are still in ca...
...,...,...
1377,1,In June we bought the Sony Limited Edition Fo...
1378,1,After 140 000 miles we decided to replace my...
1379,1,The Ford Focus is a great little record setti...
1380,1,I needed a new car because my hyundai excel 9...


In [8]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

Sentiment    False
Review       False
dtype: bool

In [9]:
# Show the dtype of each column again, after the integer cast.
df.dtypes

Sentiment     int32
Review       object
dtype: object

In [10]:
# Remove exact duplicate rows, then renumber the index 0..n-1.
# Without the reset, every dropped row leaves a gap in the index labels, and
# any later lookup of the form df['Review'][i] for i in range(len(df)) would
# hit a missing label. Rebinding instead of inplace=True also keeps the cell
# a plain "new value from old value" step.
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(1382, 2)

In [11]:
# Preprocessing and cleaning.
# For every review: replace each non-letter character (punctuation, digits,
# symbols) with a space, lowercase, split on whitespace, drop English stop
# words, Porter-stem the remaining words and re-join them with single spaces.

ps = PorterStemmer()

# Read the stop-word list once and keep it as a set. The list does not change
# between words, so re-reading it for every token is pure overhead, and set
# membership avoids scanning the whole list each time.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly. df['Review'][i] is a lookup by
# index *label*, which raises KeyError as soon as the index is not exactly
# 0..n-1 (for example after rows have been dropped).
for raw_review in df['Review']:
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Stop words are filtered before stemming, on the lowercased token.
    stemmed = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stemmed))

In [12]:
# Preview the first two cleaned reviews to show the effect of punctuation
# removal and stemming. Displaying the whole list would write every review
# into the notebook output.

corpus[:2]

['bought new tauru realli love decid tri new tauru care style newer version bought anyway like new car half much like one thee dash much deep take lot room find seat comfort way side stick strip protect card dent drive nice good pick see hood driver seat judg park difficult small ga tank would buy tauru would rather back think style nice mistak chang style less month dead batteri flat tire',
 'last busi trip drove san francisco went hertz rental got ford tauru think look comfort profession found seat uncomfort well passeng seat comfort may import fuel usag fine car get us problem uncomfort ride us though hard fit car weigh pound usual quit comfort car tauru seem especi uncomfort anyon think rent car long trip would suggest ford tauru list sure get larg trunk allow us pack lot materi need busi end trip drive breez everyth work right complaint standard make car except comfort area import comfort long trip uncomfort driver distract made trip seem unnecessarili longer felt make trip lot sa

In [21]:
# Bag-of-words features: one row per cleaned review, one column per unigram,
# each cell holding the number of times that unigram occurs in the review.
# NOTE(review): the vocabulary is learned from the full corpus, i.e. before
# the train/test split that follows.

vectorizer = CountVectorizer(ngram_range=(1, 1))
bow1 = vectorizer.fit_transform(corpus).toarray()  # dense array instead of a sparse matrix
print(bow1)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 2]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [22]:
# Hold out 20% of the reviews for testing; random_state fixes the shuffle so
# the same split is produced on every run.

X_train, X_test, y_train, y_test = train_test_split(
    bow1,
    df['Sentiment'],
    test_size=0.2,
    random_state=0,
)

In [23]:
# Sanity-check the split sizes: train features, test features, train labels, test labels.

for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1105, 9855)
(277, 9855)
(1105,)
(277,)


In [24]:
# Multinomial Naive Bayes baseline trained on the bag-of-words counts.

mnclassifier = MultinomialNB().fit(X_train, y_train)
y_pred = mnclassifier.predict(X_test)

# Disabled alternative: mean accuracy over 10-fold cross-validation on the full data.
#print(cross_val_score(mnclassifier, bow1, df['Sentiment'], cv=10, scoring ='accuracy').mean())

# Fraction of held-out reviews whose predicted label matches the true label.
score = accuracy_score(y_test, y_pred)
print(score)

0.7509025270758123


In [25]:
# Confusion matrix for the current predictions
# (rows: true label 0/1, columns: predicted label 0/1).

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[104  46]
 [ 23 104]]


# TASK 2

#### For task 2, I have decided to use an SVM classifier instead of Naive Bayes. In this use case, SVM is a better fit than Naive Bayes for the reasons described below.


1. Naive Bayes ignores the correlation between features, as it treats the features as independent, whereas SVM takes the interaction between them into account to a certain degree.

2. Because of point 1 above, SVM is a safer bet, as it is not prone to the catastrophic failures that Naive Bayes can suffer.

3. Naive Bayes is a generalized algorithm that works better when we want to classify a small corpus of data with a relatively small number of input features, and when we don’t expect the inputs to be meaningfully correlated. Because our corpus is a very large set, SVM will do a better job at classification.

(go to conclusion)

https://medium.com/analytics-vidhya/na%C3%AFve-bayes-vs-svm-for-text-classification-c63478229c33

(4.3 section of below pdf illustrates my point)

https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf

I will use the stochastic gradient descent technique via scikit-learn's `SGDClassifier` with `loss='hinge'`, which gives an SVM classifier. I use this instead of a standard SVM because stochastic gradient descent works better for large datasets, as it can process the data in batches.

(go to conclusion)

https://towardsdatascience.com/using-stochastic-gradient-descent-to-train-linear-classifiers-c80f6aeaff76

In [26]:
# Linear SVM trained with stochastic gradient descent (SGDClassifier).

from sklearn.linear_model import SGDClassifier

# loss='hinge' selects the SVM objective; per the author's note the remaining
# values were hand-tuned for best accuracy.
# NOTE(review): confirm the tuning was not done against the held-out test split.
sgd_params = dict(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,  # fixes the shuffling so the run is repeatable
    max_iter=5,
    tol=None,
)
sgd = SGDClassifier(**sgd_params).fit(X_train, y_train)

# Note: y_pred and score are reused, replacing any values previously stored
# under those names.
y_pred = sgd.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(score)

0.7725631768953068


In [27]:
# Confusion matrix recomputed for the current (SGD / SVM) predictions
# (rows: true label 0/1, columns: predicted label 0/1).

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[117  33]
 [ 30  97]]


#### We can see that accuracy increases by approximately 2.2 percentage points (from about 75.1% to about 77.3%) by using SVM classification with stochastic gradient descent.

1. This improvement is due to the fact that SVM is accounting for the correlation between features of the corpus which is bound to exist as it is a very large corpus.

2. On the positive side, the number of true negatives increased from 104 to 117, and false positives decreased from 46 to 33. On the other hand, true positives decreased from 104 to 97 and false negatives increased from 23 to 30.

3. Overall this is a good thing, as we can see that Naive Bayes had a bias towards giving more false positives than false negatives (46 FP, 23 FN). With SVM, the numbers of false positives and false negatives are more even (33 FP, 30 FN), so we have minimised the bias in the predictions.