# Amazon book review

### 1) Import libraries


In [9]:
import os
import pickle
import re
import string
from collections import Counter
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

### 2) Data Loading

In [10]:
# Load the pre-processed review data from CSV and preview the first rows
reviews_csv = r'Data/Amazon_reviews_processed.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,reviewText,Score
0,book first bookmobile book bought school book ...,1
1,normally buy mystery novels like however time ...,1
2,kind book normally read although try limit cer...,1
3,bought book loved cover try read civil war rom...,0
4,book thoroughly enjoyed beginning end story li...,1


### 3) Feature Extraction

In [11]:
# Separate the model input (review text) from the target (score)
X, y = df['reviewText'], df['Score']
y

0        1
1        1
2        1
3        0
4        1
        ..
11127    0
11128    0
11129    1
11130    1
11131    1
Name: Score, Length: 11132, dtype: int64

In [12]:
# Split the dataset into training and test sets (80% / 20%).
# Only two sets are produced here; no separate validation set is created.
# A fixed random_state makes the split -- and everything derived from it --
# reproducible across kernel restarts.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [13]:
# Extract text features with scikit-learn's TfidfVectorizer.
#
# NOTE: use_idf=False switches the IDF re-weighting off, so the features below
# are sublinear term frequencies (1 + log(tf)) with L2 row normalisation rather
# than full TF-IDF weights. Set use_idf=True if IDF weighting is intended.
# NOTE: the Bag-of-Words cell that follows rebinds tf_vec, train_features and
# test_features, so it is those count features (not these) that get pickled.

tf_vec = TfidfVectorizer(tokenizer=None, stop_words=None, max_df=0.75, max_features=2000, lowercase=False,
                         ngram_range=(1,2), use_idf=False, sublinear_tf=True, min_df=5, norm='l2',
                         encoding='latin-1')


# Learn the vocabulary on the training split only and vectorise it in a single
# pass; the test split is then transformed with that same fitted vocabulary.
train_features = tf_vec.fit_transform(X_train)
test_features = tf_vec.transform(X_test)


print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of train_vectors:',train_features.shape)
print('Shape of test_vectors:',test_features.shape)

Shape of X_train: (8905,)
Shape of X_test: (2227,)
Shape of train_vectors: (8905, 2000)
Shape of test_vectors: (2227, 2000)


In [14]:
## Vectorization of data
## Vectorize the data using Bag of Words (BoW) term counts.
## NOTE: this cell rebinds tf_vec, train_features and test_features, replacing
## the results of the TfidfVectorizer cell above; the count features built
## here are the ones written to disk by the pickling cell that follows.

# Tokens are runs of word characters; the English stop-word list comes from
# NLTK and requires the 'stopwords' corpus to be available locally.
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stop_words = nltk.corpus.stopwords.words("english")
tf_vec = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)

# Fit the vocabulary on the training split and vectorise it in one pass, then
# reuse the fitted vocabulary for the test split.
train_features = tf_vec.fit_transform(X_train)
test_features = tf_vec.transform(X_test)



In [15]:
# Save the vectorised train/test feature matrices to disk.
# NOTE: train_features / test_features hold whatever the most recent
# vectoriser cell produced, and only the matrices are stored here -- the
# fitted vectoriser object itself is not pickled.

# Create the output folder first so a fresh checkout does not fail in open().
os.makedirs('Vectors', exist_ok=True)

with open('Vectors/train_vector.pkl', 'wb') as handle:
    pickle.dump(train_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('Vectors/test_vector.pkl', 'wb') as handle:
    pickle.dump(test_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Save the target variable (labels) for the train and test splits

label_outputs = (
    ('Vectors/train_label.pkl', y_train),
    ('Vectors/test_label.pkl', y_test),
)

for label_path, labels in label_outputs:
    with open(label_path, 'wb') as label_file:
        pickle.dump(labels, label_file, protocol=pickle.HIGHEST_PROTOCOL)