# Filter and alter reviews dataset

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
#Load the raw Yelp reviews csv file into a dataframe
reviews = pd.read_csv('yelp_review.csv')

In [3]:
#Load the businesses that are included in the analysis
business = pd.read_csv('filteredbusiness.csv')

In [4]:
#Peek at the first five rows to see the columns of the yelp_reviews csv file
reviews.iloc[:5]

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [5]:
#Number of rows in the full reviews dataset
reviews.shape[0]

5261668

In [6]:
#Number of rows in the business dataset
business.shape[0]

10598

## Keep only the reviews that match the businesses we are using.
Reviews will only be kept if the business_id matches a business that we are including in our analysis.

In [7]:
#Collect each distinct business_id that appears in the business dataset
business_id_column = business['business_id']
ids = business_id_column.unique()

In [8]:
#Check what kind of container unique() returned
type(ids)

numpy.ndarray

In [9]:
#Verify all business ids are in 1d array (shape should be a one-element tuple)
ids.shape

(10598,)

In [10]:
#Mark the reviews whose business_id is one of the businesses we are analyzing,
#then keep only those rows in a new dataframe
in_analysis = reviews['business_id'].isin(ids)
reviews1 = reviews[in_analysis]

In [11]:
#Compare row counts before and after filtering
for frame in (reviews, reviews1):
    print(len(frame))

5261668
920258


In [12]:
#Count the distinct business ids left in the filtered reviews; this should
#equal the number of businesses we are analyzing
filtered_business_ids = reviews1['business_id']
filtered_business_ids.nunique()

10598

In [13]:
#save filtered reviews to new csv file
#NOTE(review): to_csv writes the row index by default, so the file gets an
#extra unnamed first column - confirm downstream readers expect it
reviews1.to_csv('filteredreviews.csv')

In [14]:
#Preview the first rows of the filtered reviews
reviews1.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
93,BF0ANB54sc_f-3_howQBCg,ssuXFjkH4neiBgwv-oN4IA,JlNeaOymdVbE6_bubqjohg,1,2014-08-09,We always go to the chevo's in chandler which ...,3,0,0
94,DbLUpPT61ykLTakknCF9CQ,ssuXFjkH4neiBgwv-oN4IA,0Rni7ocMC_Lg2UH0lDeKMQ,1,2014-08-09,"This place is always so dirty and grimy, been ...",6,0,0
355,z_mVLygzPn8uHp63SSCErw,MzEnYCyZlRYQRISNMXTWIg,S-oLPRdhlyL5HAknBKTUcQ,4,2017-11-30,Holy portion sizes! You get a lot of bang for ...,0,0,0
357,xatycgntu_F_Ioyny3iflw,vaXJ7-xLrnD6FAEhUqYKwQ,iIjVO7cLD1UEmIO7G05Ujw,4,2016-06-11,Flavor was actually pretty good. Not used to e...,0,0,0
358,Z7U7MMef6Tbj_ZbSFzLRUw,vaXJ7-xLrnD6FAEhUqYKwQ,1JF9TbJ2d5hH8xsQvvklHg,5,2016-06-18,This is place very great flavor. Server was on...,1,0,0


# Add 'is_open' attribute to reviews
Match on business_id, then add the 'is_open' attribute from the business dataset using a join.

In [15]:
#Keep only the join key and the attribute we want to attach to the reviews
business2 = business.loc[:, ['business_id', 'is_open']]

In [16]:
#Display the two-column lookup table (pandas truncates the middle rows)
business2

Unnamed: 0,business_id,is_open
0,rDMptJYWtnMhpQu_rRXHng,1
1,1WBkAuQg81kokZIPMpn9Zg,1
2,iPa__LOhse-hobC2Xmp-Kw,1
3,kKx8iCJkomVQBdWHnmmOiA,1
4,YhV93k9uiMdr3FlV4FHjwA,1
...,...,...
10593,AEYNihHmGIjmUciRFo3qwA,1
10594,vGDhK2Lc4Np5iZYZ7FG0QA,0
10595,5zva2MTtB5IX6TaoVLL-NA,1
10596,Gr-2oBg4XyduSKbvnE-i9g,1


In [17]:
#Join based on business id (left join: every review keeps its row and gains is_open).
#A duplicated business_id in business2 would silently duplicate reviews in the
#join, so check the key is unique before joining.
assert business2['business_id'].is_unique, 'duplicate business_id in business2'
reviews2 = reviews1.set_index('business_id').join(business2.set_index('business_id'))
#The join must not add or drop reviews, and every review must have found its business
assert len(reviews2) == len(reviews1), 'join changed the number of reviews'
assert reviews2['is_open'].notna().all(), 'some reviews have no is_open value'

In [18]:
#Check that the joined dataframe now carries the is_open column
reviews2.iloc[:5]

Unnamed: 0_level_0,review_id,user_id,stars,date,text,useful,funny,cool,is_open
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
--g-a85VwrdZJNf0R95GcQ,YZNj0R3_BrwRwhYECtwx_Q,etKVjwicBlzBLydl9t-cew,5,2014-05-31,"My Daughter, grandsons and I were looking for ...",0,0,0,0
--g-a85VwrdZJNf0R95GcQ,_HpIujp_Yvbyk8U6s8nW3Q,8S4b4Adn1laE--nVsu8Udw,5,2013-12-05,"Great food, and great prices. The family and ...",1,0,1,0
--g-a85VwrdZJNf0R95GcQ,eXPxJG3vHrVmL2D6Djio6w,bcmcnSWyDY5FephDIdZa5w,5,2014-01-13,Great Food! Good Service! Very friendly family...,0,0,0,0
--g-a85VwrdZJNf0R95GcQ,GwymU3qC1Ppatuwm6TE23w,1YRDt0Bqmxsp6DHIm0SuwQ,4,2014-09-13,This probably one of the better Middle Eastern...,0,0,0,0
--g-a85VwrdZJNf0R95GcQ,4PnhRL8g53tTxdTMh0qLkg,IBkNDCG9zKDbzUdbFFKpxQ,5,2014-04-09,"What a wonderful surprise, this restaurant was...",0,0,0,0


In [19]:
#Row count after the join
reviews2.shape[0]

920258

In [20]:
#Row count before the join, for comparison
reviews1.shape[0]

920258

In [21]:
#Save the joined reviews (business_id is the index, so it is written as the first column).
#Write to a temporary file first and rename it afterwards, so an interrupted
#run never leaves a truncated 'filteredreviews_joined.csv' behind.
tmp_path = 'filteredreviews_joined.csv.tmp'
reviews2.to_csv(tmp_path)
os.replace(tmp_path, 'filteredreviews_joined.csv')

KeyboardInterrupt: 