In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')
df.head()
df = df.iloc[0:50000,:]
df.shape

(50000, 6)

In [3]:
df=df[['Reviews','Rating']]
df=df.dropna()
df.head()

Unnamed: 0,Reviews,Rating
0,I feel so LUCKY to have found this used (phone...,5
1,"nice phone, nice up grade from my pantach revu...",4
2,Very pleased,5
3,It works good but it goes slow sometimes but i...,4
4,Great phone to replace my lost phone. The only...,4


In [4]:
df=df[df['Rating']!=3]
df=df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46815 entries, 0 to 46814
Data columns (total 2 columns):
Reviews    46815 non-null object
Rating     46815 non-null int64
dtypes: int64(1), object(1)
memory usage: 731.6+ KB


In [5]:
df['sentiment']=np.where(df['Rating'] > 3, 1, 0)
df.head()

Unnamed: 0,Reviews,Rating,sentiment
0,I feel so LUCKY to have found this used (phone...,5,1
1,"nice phone, nice up grade from my pantach revu...",4,1
2,Very pleased,5,1
3,It works good but it goes slow sometimes but i...,4,1
4,Great phone to replace my lost phone. The only...,4,1


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['sentiment'], test_size=0.2, random_state=0)

In [8]:
import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag

In [14]:
def cleanText(raw_text, remove_stopwords=True, stemming=False, split_text=False):
    
    letters_only = re.sub("[^a-zA-Z]", " ", raw_text)  
    words = letters_only.lower().split()  
    
    if remove_stopwords: 
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        
    if stemming==True: 
        stemmer = SnowballStemmer('english') 
        words = [stemmer.stem(w) for w in words]
        
    if split_text==True: 
        return (words)
    
    return( " ".join(words))

In [15]:
X_train_cleaned = []
X_test_cleaned = []

for d in X_train:
    X_train_cleaned.append(cleanText(d))
    
for d in X_test:
    X_test_cleaned.append(cleanText(d))

In [16]:
X_train_cleaned[10]

'battery good become low fast'

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
countVect = CountVectorizer() 
X_train_countVect = countVect.fit_transform(X_train_cleaned)
print("Number of features : %d \n" %len(countVect.get_feature_names())) #6378 
print("Show some feature names : \n", countVect.get_feature_names()[::1000])

Number of features : 11718 

Show some feature names : 
 ['aa', 'belgium', 'comparisons', 'disinfectant', 'fixes', 'husbands', 'located', 'oneit', 'punctures', 'seamless', 'sunlight', 'untethered']


In [19]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_countVect, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline

In [21]:
def modelEvaluation(predictions):
    print ("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("\nAUC score : {:.4f}".format(roc_auc_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))

In [22]:
predictions = mnb.predict(countVect.transform(X_test_cleaned))
modelEvaluation(predictions)


Accuracy on validation set: 0.9428

AUC score : 0.9207

Classification report : 
               precision    recall  f1-score   support

           0       0.92      0.87      0.89      2587
           1       0.95      0.97      0.96      6776

   micro avg       0.94      0.94      0.94      9363
   macro avg       0.93      0.92      0.93      9363
weighted avg       0.94      0.94      0.94      9363


Confusion Matrix : 
 [[2254  333]
 [ 203 6573]]


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
tfidf = TfidfVectorizer(min_df=5)
X_train_tfidf = tfidf.fit_transform(X_train)

In [27]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)

print("Number of features : %d \n" %len(tfidf.get_feature_names())) 
print("Show some feature names : \n", tfidf.get_feature_names()[::1000])

Number of features : 5190 

Show some feature names : 
 ['00', 'colors', 'gem', 'moving', 'screensaver', 'warm']


In [28]:
predictions = lr.predict(tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)


Accuracy on validation set: 0.9401

AUC score : 0.9034

Classification report : 
               precision    recall  f1-score   support

           0       0.96      0.82      0.88      2587
           1       0.94      0.99      0.96      6776

   micro avg       0.94      0.94      0.94      9363
   macro avg       0.95      0.90      0.92      9363
weighted avg       0.94      0.94      0.94      9363


Confusion Matrix : 
 [[2125  462]
 [  99 6677]]
