In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
import re

In [27]:
df = pd.read_csv('amazon_review_ipsa.csv')

In [28]:
df.head()

Unnamed: 0.1,Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_date,review,review_category
0,35311,UK,4652511,R2PF9XCB2AOY7C,B00MNYTWMI,174964168,Midnight Castle – A Free Hidden Object Mystery...,Mobile_Apps,5,0,1,N,Y,2014-09-16,"Addictive game Very good game, the graphics ar...",1
1,44590,UK,8778070,R2OVRHVTCC7U18,B00I8Q77Y0,370420344,Flappy Wings (not Flappy Bird),Mobile_Apps,2,1,2,N,Y,2014-02-16,Where to start? I came on here because I wante...,0
2,90295,UK,8476187,RRPSNSVU2RNE4,B008PEHK6U,173475712,Man of Steel [DVD],Video DVD,3,1,5,N,N,2013-12-28,Really Disappointed in this Film I have to say...,0
3,38798,UK,4329864,R1WK44Y81XH276,B00NWLIW0E,13526192,Goat Simulator,Mobile_Apps,1,3,5,N,N,2014-12-26,"Rubbish game When I tried to open it, it said ...",0
4,43364,UK,8080547,R2UL591NPU40SN,B00HRC40T6,887316735,Shadow Blade,Mobile_Apps,1,0,0,N,Y,2015-06-29,One Star Come on,0


In [29]:
df.dropna(inplace=True)

In [30]:
x = df['review']
y = df['review_category']

In [6]:
x

0       Addictive game Very good game, the graphics ar...
1       Where to start? I came on here because I wante...
2       Really Disappointed in this Film I have to say...
3       Rubbish game When I tried to open it, it said ...
4                                        One Star Come on
                              ...                        
9995                                    It's great😆
9996    Urrgh This really is appalling.  Somewhere on ...
9997    Quite disappointed After waiting over a year, ...
9998    Rate Despicable minion rush is cool<br />So co...
9999                               Five Stars Great story
Name: review, Length: 9999, dtype: object

In [33]:
x_new = []


for w in x:
    w = re.sub('[^a-zA-Z ]', '', w)
    w = w.lower()
    x_new.append(w)

In [None]:
# performing stemming also here

In [None]:
# stop word removal also

In [34]:
x_new

['addictive game very good game the graphics are brilliant a great game if you like hidden object games',
 'where to start i came on here because i wanted to get flappy bird but when i saw this i thought it was basically the same thing but its even hardermore frustrating and annoyingdoesnt react straight away and i dont get whats with the poo i want flappy bird',
 'really disappointed in this film i have to say i was so looking forward to this remake of superman the trailer for the film made it look fantastic talk about disappointed when i watched it on christmas day there were good points for the film that i didnt feel were really properly realised amy adams as lois lane was one she actually made me like the character which was a first for me i also did like the fact that lois found out who superman was early on in the film i always found it daft that she didnt work it out in the original film some reporter i also thought russell crowe played the part of jor el to perfection the young

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x_new,y, test_size = 0.2, random_state = 123)

## Now Generating Bag of Words

In [10]:
# Using Count Vectorizer
vector = CountVectorizer(stop_words='english').fit(x_train)

In [11]:
vector.get_feature_names()[::30]

['aaaexcellentoldpicturethereissomegrainonthescreenthesoundisverygoodtopssobyetopa',
 'abrilliantbookpraguefataleisprobablymyfavouriteberniegunthertitleofthewholeseriesmanyoftheotherreviewershavementionedthefastpaceandthedarkandoccasionallyexplosiveatmospherecreatediparticularlylikethesarcasticsardonicstylethatkerrgiveshismaincharacterandthoughithoughtthiswasalittleforcedandoverwroughtinsomeoftheearlierbooksitworksperfectlyherelikealloftheseriesthisbookmeldsfactandfictionwonderfullyanditcanbeenjoyedbyitselforasapartofawhole',
 'acethisgameisthebestgamethatyoucanplayonline',
 'addictivegamefabgamereallyaddictivefun',
 'afabgameitsamazingthereisnowordwhyisntitsreallyfunandyoucangetbrcoolstuffonitandthemorecoinsyougetonitthemoreyouplayitandafterafewhoursyougetmorecoinsiwouldsayitwouldbe',
 'agoodcdfromoneofthebestcurrentartistsofagoodcdfromoneofthebestcurrentartistsofthisgenerationthatshowsofhissuperbwritingskills',
 'agreatrepetitiveyawnathononeofthegreatyawnathonsofmyreadinglifeiboughti

In [12]:
len(vector.get_feature_names()[::30])

248

In [13]:
x_train_vector = vector.transform(x_train)
x_train_vector

<7999x7435 sparse matrix of type '<class 'numpy.int64'>'
	with 7997 stored elements in Compressed Sparse Row format>

In [14]:
x_train_vector.toarray()[1]

array([0, 0, 0, ..., 0, 0, 0])

In [15]:
model = LogisticRegression()
model.fit(x_train_vector, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
predictions = model.predict(vector.transform(x_test))
print(confusion_matrix(predictions, y_test))
print(classification_report(predictions, y_test))

[[993 882]
 [  0 125]]
              precision    recall  f1-score   support

           0       1.00      0.53      0.69      1875
           1       0.12      1.00      0.22       125

    accuracy                           0.56      2000
   macro avg       0.56      0.76      0.46      2000
weighted avg       0.95      0.56      0.66      2000



In [17]:
print(model.predict(vector.transform(['The packaging of the product was not up to the level'])))

[0]


In [18]:
print(model.predict(vector.transform(['The product was delivered on time !! Thanks a lot amazon'])))

[0]


In [19]:
print(model.predict(vector.transform(['What a stupid product. Doesn"t even work'])))

[0]


## Now using TF-IDF

In [20]:
# Generating feature vectors using tf-idf vector
vector_tf = TfidfVectorizer(min_df = 6).fit(x_train)
vector_tf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=6, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [21]:
x_train_vector = vector_tf.transform(x_train)
model_tf = LogisticRegression()
model_tf.fit(x_train_vector, y_train)
predictions_tf = model_tf.predict(vector_tf.transform(x_test))

In [22]:
print(confusion_matrix(predictions_tf, y_test))
print(classification_report(predictions_tf, y_test))


[[993 932]
 [  0  75]]
              precision    recall  f1-score   support

           0       1.00      0.52      0.68      1925
           1       0.07      1.00      0.14        75

    accuracy                           0.53      2000
   macro avg       0.54      0.76      0.41      2000
weighted avg       0.97      0.53      0.66      2000



In [1]:
# Checking our prediction on new real reviews 

print(model.predict(vector.transform(['product found in bad condition '])))

NameError: name 'model' is not defined

In [2]:
import numpy as np

In [12]:
a = np.arange(40).reshape(4,10)
a.reshape(-1,4,10)

array([[[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
        [30, 31, 32, 33, 34, 35, 36, 37, 38, 39]]])