In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import gc

In [2]:
gc.get_threshold()

(700, 10, 10)

In [3]:
gc.set_threshold(100,4,4)

In [4]:
book_reviews=pd.read_csv('./data/pre_processed_data.csv')  #loading pickle file

In [5]:
book_reviews.head()  #taking a quick look at the data

Unnamed: 0,asin,overall,verified,reviewText,reviewTime
0,0007444117,1,False,I was a huge fan when the Divergent series sta...,2016-09-03
1,0007177437,4,True,Excellent read.,2015-09-16
2,0007327064,5,True,From the very beginning.. I have loved Odd Tho...,2016-07-17
3,0030352401,3,False,"A good, easy, light-hearted read--however ther...",2016-03-02
4,000755236X,5,True,"As a fan of Daniel Silva's books, especially ...",2017-03-07


In [6]:
book_reviews.drop_duplicates(inplace=True) #drop any duplicate row

In [7]:
#Find the number of reviews for each asin number and then sort them in descending order
review_counts=book_reviews.groupby(by='asin').count()
review_counts.sort_values(by='overall',ascending=False,inplace=True)

In [8]:
#selecting the most reviews books for the period
most_reviewed=review_counts[['overall']].head(300).copy(deep=True)

In [9]:
#review_counts is a large dataframe. deleting it and then triggering garbage collection.
del review_counts
gc.collect()

22

---

In [10]:
#creating a list of most reviewed books
most_reviewed=most_reviewed.index.to_list()

In [11]:
most_reviewed

['038568231X',
 '0312577222',
 '1683247353',
 '0553418025',
 '1503943372',
 '1410472922',
 '0007548672',
 '1503934713',
 '0099911701',
 '0751565350',
 '1503935310',
 '0062409883',
 '1101946342',
 '030788743X',
 '0147520762',
 '0091955106',
 '0143109464',
 '0385539436',
 'B00SB123IM',
 '1503953386',
 '0099740915',
 'B017WJ5PR4',
 '0316228532',
 '1250158060',
 '0545582881',
 'B017V4IPPO',
 '1503950255',
 '1477848665',
 '1910751774',
 '0091944244',
 '1410493547',
 '081298840X',
 '0143125478',
 '0804192898',
 '0451412656',
 '0008220565',
 '1491598778',
 '0345544978',
 '0385541198',
 '1477829407',
 '0340979496',
 '0385543026',
 'B000X1MX7E',
 '0375434542',
 '0529120887',
 '0425284689',
 '1542046599',
 '0802407692',
 '1503936864',
 '0399167064',
 '0984502203',
 '0099664313',
 '1455541516',
 '0375969020',
 '0439023521',
 '0349407762',
 '0316225940',
 '1517153158',
 '1477825576',
 '8184776217',
 '0525492127',
 '1455568872',
 '0439249546',
 '0141378247',
 '1477828737',
 '000755236X',
 '14472879

In [12]:
#Let's check how unbalanced the data is
book_reviews[book_reviews['asin'].isin(most_reviewed)].overall.value_counts(normalize=True)

5    0.658635
4    0.191666
3    0.077627
1    0.036180
2    0.035889
0    0.000003
Name: overall, dtype: float64

It appears very unbalanced

In [13]:
#creating multi-index pandas dataframe showing the number of reviews for each rating (1 to 5) for 'asin' numbers in the most_reviewed list
rating_dist=pd.DataFrame(book_reviews[book_reviews['asin'].isin(most_reviewed)].groupby(by='asin')['overall'].value_counts()).rename(columns={'overall':'count'})
rating_dist

Unnamed: 0_level_0,Unnamed: 1_level_0,count
asin,overall,Unnamed: 2_level_1
000711835X,5,888
000711835X,4,85
000711835X,3,44
000711835X,1,32
000711835X,2,24
...,...,...
B01CIYURJE,5,352
B01CIYURJE,4,113
B01CIYURJE,3,90
B01CIYURJE,2,52


In [14]:
book_ratings=pd.DataFrame()
for i in most_reviewed:
    weighted_sum=0
    count=0
    for j in range(1,6):
        try:
            weighted_sum+=rating_dist.loc[i,j]*j  #multiplying each rating by number of times it was rated as such
            count+=rating_dist.loc[i,j]   #total count
        except:
            pass
    if (weighted_sum/count)[0]<3.5:   #calculating weighted average
        print([i,weighted_sum/count])


['0062409883', count    3.453767
Name: (0062409883, 1), dtype: float64]
['034551162X', count    2.682788
Name: (034551162X, 1), dtype: float64]


In [15]:
book_reviews[(book_reviews['asin']=='034551162X')|(book_reviews['asin']=='0062409883')]

Unnamed: 0,asin,overall,verified,reviewText,reviewTime
138779,0062409883,4,False,"This book was a fascinating read. I re-read ""T...",2015-07-27
138859,0062409883,4,True,An important read.,2015-08-08
138884,0062409883,3,True,Enjoyable. Not up to mockingbird standards but...,2015-08-14
138892,0062409883,1,False,Save your money! This work was never meant to ...,2015-11-29
138914,0062409883,5,False,We are all so smug in our hatred of racism but...,2015-08-22
...,...,...,...,...,...
722002,034551162X,3,True,Just an ok read. A bit of a struggle to get th...,2016-05-20
722156,034551162X,2,False,"First off, let me say that I have read a coupl...",2015-09-18
722268,034551162X,4,True,It's been awhile since I read a Star Wars nove...,2015-09-18
3909376,0062409883,5,True,If you loved To Kill a Mockingbird this is a m...,2018-04-24


In [16]:
#We can also try to guess who the reviewer is based on writing style.

---

### Classification Task

In [17]:
book_reviews

Unnamed: 0,asin,overall,verified,reviewText,reviewTime
0,0007444117,1,False,I was a huge fan when the Divergent series sta...,2016-09-03
1,0007177437,4,True,Excellent read.,2015-09-16
2,0007327064,5,True,From the very beginning.. I have loved Odd Tho...,2016-07-17
3,0030352401,3,False,"A good, easy, light-hearted read--however ther...",2016-03-02
4,000755236X,5,True,"As a fan of Daniel Silva's books, especially ...",2017-03-07
...,...,...,...,...,...
5538557,1547843152,2,True,Returned...disappointed in content.,2018-06-27
5538558,B01E6OZH5S,5,True,good story book,2017-02-15
5538559,B00QSFOJ9E,5,True,Great book,2015-09-01
5538560,1537644513,4,True,Good read - I was hoping for the bad guys! Not...,2016-12-11


In [18]:
gc.collect()

22

In [19]:
reviews_ratings=book_reviews[['overall','reviewText']].copy(deep=True)    #selecting ratings and review text
reviews_ratings.to_csv('./data/reviews_and_ratings.csv',index=0)

In [20]:
#free-up memory by removing dataframes from memory and reloading necessary data from saved pickle file
del book_reviews
del reviews_ratings
gc.collect()

22

In [21]:
reviews_ratings=pd.read_csv('./data/reviews_and_ratings.csv')

In [22]:
reviews_ratings

Unnamed: 0,overall,reviewText
0,1,I was a huge fan when the Divergent series sta...
1,4,Excellent read.
2,5,From the very beginning.. I have loved Odd Tho...
3,3,"A good, easy, light-hearted read--however ther..."
4,5,"As a fan of Daniel Silva's books, especially ..."
...,...,...
5535440,2,Returned...disappointed in content.
5535441,5,good story book
5535442,5,Great book
5535443,4,Good read - I was hoping for the bad guys! Not...
