## Workbook seeks to create a list of users who usually leave ratings less than 2.5 stars

In [None]:
from pathlib import Path
import pandas as pd

In [2]:
#create empty dataframe
column_names = {"user_id":'',"stars":int(),"text":'',"business_id":''}
review_df = pd.DataFrame(column_names, index = [])
review_df["stars"] = review_df["stars"].astype("int8")
review_df.dtypes

user_id        object
stars            int8
text           object
business_id    object
dtype: object

In [3]:
#Load data from JSON file using nrows and chunksize to limit amount of data in memory at any particular time
review_path = Path(r"D:\yelp_data\yelp_academic_dataset_review.json")
review_holder = pd.read_json(review_path,lines = True, orient ="records", nrows = 4000000, chunksize=10000)
for i in review_holder:
    i = i[["user_id","stars","text","business_id"]]
    review_df = review_df.append(i, ignore_index = True)
    review_df["stars"] = review_df["stars"].astype("int8")
print(review_df.shape)

(4000000, 4)


In [4]:
review_df.head()

Unnamed: 0,user_id,stars,text,business_id
0,ak0TdVmGKo4pwqdJSTLwWw,4,Apparently Prides Osteria had a rough summer a...,buF9druCkbuXLX526sGELQ
1,YoVfDbnISlW0f7abNQACIg,4,This store is pretty good. Not as great as Wal...,RA4V8pr014UyUbDvI-LW2A
2,eC5evKn1TWDyHCyQAwguUw,5,I called WVM on the recommendation of a couple...,_sS2LBIGNT5NQb6PD1Vtjw
3,SFQ1jcnGguO0LYWnbbftAA,2,I've stayed at many Marriott and Renaissance M...,0AzLzHfOJgL7ROwhdww2ew
4,0kA0PAJ8QFMeveQWHFqz2A,4,The food is always great here. The service fro...,8zehGz9jnxPqXtOc7KaJxA


In [5]:
review_df.dtypes

user_id        object
stars            int8
text           object
business_id    object
dtype: object

In [6]:
#Explore the memory usage of the data frame
review_df.memory_usage()

Index               128
user_id        32000000
stars           4000000
text           32000000
business_id    32000000
dtype: int64

In [7]:
review_df.memory_usage().sum()

100000128

In [8]:
#review number of unique users, find those who have 5 or more reviews
user_summary = review_df.groupby("user_id").agg({'stars':['sum','count']}).droplevel(level=0, axis =1)
user_summary= user_summary.loc[user_summary['count'] > 4]
user_summary["average_score"] = user_summary["sum"]/user_summary["count"]
print(user_summary.shape)
user_summary.head()

(164351, 3)


Unnamed: 0_level_0,sum,count,average_score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
--1ZsAsSdoqgoZZTYjsuiw,34.0,8,4.25
--1orhUoGFSdHXsoxqQc8g,28.0,7,4.0
--2PnhMMH7EYoY3wywOvgQ,29.0,6,4.833333
--2vR0DIsmQ6WfcSzKWigw,69.0,16,4.3125
--3HptO9LVPn1yTS973M_Q,25.0,7,3.571429


In [9]:
#modify main DF to get those with more than 5 reviews
review_df = review_df.join(user_summary["average_score"], how = "inner", on = "user_id")
review_df.shape

(2151428, 5)

In [10]:
#look at low average score reviewers
low_reviewers = user_summary.loc[user_summary["average_score"] < 2.5]
print(low_reviewers.shape)
low_reviewers.head()

(9042, 3)


Unnamed: 0_level_0,sum,count,average_score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
--xfUH1hLFKLmOBt7M6iOA,20.0,9,2.222222
-0PvPYV-Eph7oRQLgWCfGQ,24.0,10,2.4
-1XzBP763xlH0-D-cyyTgw,14.0,6,2.333333
-1hn92IDutY6EXdlJW1urQ,22.0,11,2.0
-1sghQnxvMPUlmIBIosKOQ,9.0,5,1.8
