In [1]:
import pandas as pd
import numpy as np

# Read the zipped CSV and keep a reproducible 10% random sample of its rows
# (random_state pins the sample so re-runs select the same rows).
df = (
    pd.read_csv('Amazon_Unlocked_Mobile.zip', compression='zip')
      .sample(frac=0.1, random_state=10)
)

df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [2]:
# Cleaning: discard every row that has a missing value in any column.
df = df.dropna()

# Neutral reviews (Rating exactly 3) carry no clear sentiment; mask them out.
neutral = df['Rating'].eq(3)
df = df.loc[~neutral]

In [3]:
# Binary target column: 1 when Rating > 3, otherwise 0.
df['Positively Rated'] = (df['Rating'] > 3).astype(int)
df.head(6)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0,1


In [4]:
from sklearn.model_selection import train_test_split

X_train, y_train, X_test, y_test = train_test_split(df['Reviews'], df['Positively Rated'], random_state = 0)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary from the training reviews only.
vect = CountVectorizer().fit(X_train)
vect

In [6]:
# Examples of some words: every 10,000th term, in feature-index order.
# Read from the fitted `vocabulary_` mapping (term -> column index) rather than
# get_feature_names(), which was removed in scikit-learn 1.2.
sorted(vect.vocabulary_, key=vect.vocabulary_.get)[::10000]

['00', 'lands']

In [7]:
len(vect.get_feature_names())

19601

In [8]:
X_train_vectorized = vect.transform(X_train)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace the vectorizer with a tf-idf one; min_df=5 ignores terms that
# occur in fewer than 5 training documents.
vect = TfidfVectorizer(min_df=5).fit(X_train)
vect

In [10]:
# Fewer features than before: the reduction comes from min_df=5 pruning rare
# terms, not from tf-idf weighting itself.
# `vocabulary_` is used because get_feature_names() was removed in scikit-learn 1.2.
len(vect.vocabulary_)

5442

In [11]:
# Minimum document frequency of 5, extracting both 1-grams and 2-grams.
vect = CountVectorizer(min_df=5, ngram_range=(1, 2))

# fit_transform learns the vocabulary and encodes the training reviews in a
# single pass instead of tokenizing the corpus twice (fit, then transform).
X_train_vectorized = vect.fit_transform(X_train)

# Largest feature set so far: bigrams such as "not bad" become their own
# features, so a model can tell them apart from "bad".
# NOTE(review): the original note claims this gives the best accuracy; no
# model is evaluated in this section, so confirm against the later cells.
# `vocabulary_` replaces get_feature_names(), which was removed in scikit-learn 1.2.
len(vect.vocabulary_)

29072