In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
df.head()
df = df.iloc[0:50000,:]
df.shape

(50000, 6)

In [8]:
df=df[['Reviews','Rating']]
df=df.dropna()
df.head()

In [5]:
#Removing the neutral reviews
df=df[df['Rating']!=3]

In [6]:
df=df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850 entries, 0 to 1849
Data columns (total 2 columns):
Reviews    1850 non-null object
Rating     1850 non-null int64
dtypes: int64(1), object(1)
memory usage: 29.0+ KB


In [24]:
df['sentiment']=np.where(df['Rating'] > 3, 1, 0)
df.head()

Unnamed: 0,Reviews,Rating,sentiment
0,I feel so LUCKY to have found this used (phone...,5,1
1,"nice phone, nice up grade from my pantach revu...",4,1
2,Very pleased,5,1
3,It works good but it goes slow sometimes but i...,4,1
4,Great phone to replace my lost phone. The only...,4,1


In [8]:
#importing libraries needed for NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [9]:
#The preprocessing Step and creation of BagOfWords
texts=[]
for i in df.Reviews:
    text=re.sub("[^a-zA-Z]"," ",i)
    text=text.lower()
    text=nltk.word_tokenize(text)
    lemma=nltk.WordNetLemmatizer()
    text=[lemma.lemmatize(word) for word in text if not word in set(stopwords.words('english'))]
    text=" ".join(text)
    texts.append(text)
    
texts

['feel lucky found used phone u used hard phone line someone upgraded sold one son liked old one finally fell apart year want upgrade thank seller really appreciate honesty said used phone recommend seller highly would',
 'nice phone nice grade pantach revue clean set easy set never android phone fantastic say least perfect size surfing social medium great phone samsung',
 'pleased',
 'work good go slow sometimes good phone love',
 'great phone replace lost phone thing volume button work still go setting adjust job eligible upgrade phone thaanks',
 'already phone problem know stated used dang state charge wish would read comment would purchased item cracked side damaged good trying charge another way work requesting money back get money back signed unhappy customer',
 'charging port loose got soldered needed new battery well later including cost purchase usable phone phone sold state',
 'phone look good stay charged buy new battery still stay charged long trashed money lost never buy p

In [10]:
#Cleaning BagOfWords
from sklearn.feature_extraction.text import CountVectorizer

max_f=1000
CntVec = CountVectorizer(max_features = max_f)
sparse_mat = CntVec.fit_transform(texts).toarray()
words = CntVec.get_feature_names()
words

['ability',
 'able',
 'absolutely',
 'access',
 'accessory',
 'acer',
 'across',
 'activate',
 'activated',
 'actual',
 'actually',
 'add',
 'added',
 'additional',
 'advertised',
 'affect',
 'affordable',
 'afraid',
 'ago',
 'alarm',
 'alcatel',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'alot',
 'already',
 'also',
 'although',
 'always',
 'amazing',
 'amazon',
 'amount',
 'android',
 'angle',
 'another',
 'answer',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'apn',
 'app',
 'apple',
 'application',
 'apps',
 'area',
 'around',
 'arrived',
 'asha',
 'ask',
 'asked',
 'att',
 'audio',
 'automatically',
 'available',
 'average',
 'away',
 'awesome',
 'awful',
 'back',
 'bad',
 'band',
 'bar',
 'barely',
 'based',
 'basic',
 'basically',
 'battery',
 'beat',
 'beautiful',
 'beep',
 'began',
 'believe',
 'benefit',
 'best',
 'better',
 'big',
 'bigger',
 'bill',
 'bit',
 'black',
 'blue',
 'bluetooth',
 'bother',
 'bottom',
 'bought',
 'box',
 'brand',
 'break',
 'bright',


In [15]:
#creating the feature matrix and target vector
X =sparse_mat
y = df.iloc[:,1].values

In [16]:
#splitting the training and testing data
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X,y,test_size=0.2,random_state=1)

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
classifier = MultinomialNB()
classifier.fit(xtrain, ytrain)

y_pred = classifier.predict(xtest)
print("Accuracy_score: ", accuracy_score(ytest,y_pred))

Accuracy_score:  0.7162162162162162


In [20]:
#fitting data and predicting using RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=30)
model.fit(xtrain,ytrain)

y_pred = model.predict(xtest)

from sklearn.metrics import accuracy_score
print("Accuracy_score: ", accuracy_score(ytest,y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, y_pred)
print(cm)

Accuracy_score:  0.7486486486486487
[[ 66   2   1  18]
 [ 14  12   2   5]
 [  7   3  21  20]
 [ 10   0  11 178]]


In [123]:
#for better accuracy trying LogisyicRegression
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(xtrain,ytrain)

y_pred_lr = lr.predict(xtest)
print("Accuracy_score: ",accuracy_score(ytest,y_pred_lr))



Accuracy_score:  0.7764833378488215


In [124]:
#Fitting SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(xtrain, ytrain)

y_pred1 = classifier.predict(xtest)
print("Accuracy_score: ",accuracy_score(ytest,y_pred1))

Accuracy_score:  0.7946356001083718


In [110]:
#Fitting DTC o the training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 1)
classifier.fit(xtrain, ytrain)

y_pred2 = classifier.predict(xtest)
print("Accuracy_score: ",accuracy_score(ytest,y_pred2)) 

Accuracy_score:  0.6864864864864865
