# Filter/alter business dataset
The purpose of this script is to filter the business dataset down to restaurants only, and to limit the data to a single metro area: whichever one has the most data.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the business table from the CSV file
business = pd.read_csv(filepath_or_buffer='yelp_business.csv')

In [3]:
# Look at the first five rows of the loaded table
business.head(n=5)

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.33069,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",,"""3101 Washington Rd""",McMurray,PA,15317,40.291685,-80.1049,3.0,11,1,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",,"""6025 N 27th Ave, Ste 1""",Phoenix,AZ,85017,33.524903,-112.11531,1.5,18,1,Departments of Motor Vehicles;Public Services ...
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",,"""5000 Arizona Mills Cr, Ste 435""",Tempe,AZ,85282,33.383147,-111.964725,3.0,9,0,Sporting Goods;Shopping
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...


In [4]:
# Count businesses per state to find the area with the most data
business.loc[:, 'state'].value_counts()

AZ     52214
NV     33086
ON     30208
NC     12956
OH     12609
       ...  
WA         1
STG        1
B          1
30         1
PKN        1
Name: state, Length: 67, dtype: int64

In [5]:
# Total number of businesses in the table
business.shape[0]

174567

In [6]:
# Number of businesses located in Arizona
int(business['state'].eq('AZ').sum())

52214

In [7]:
# Restrict the data to Arizona (treated in this notebook as metro Phoenix)
business = business.loc[business['state'].eq('AZ')]

In [8]:
# Check the first rows after the state filter
business.head(n=5)

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.33069,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",,"""6025 N 27th Ave, Ste 1""",Phoenix,AZ,85017,33.524903,-112.11531,1.5,18,1,Departments of Motor Vehicles;Public Services ...
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",,"""5000 Arizona Mills Cr, Ste 435""",Tempe,AZ,85282,33.383147,-111.964725,3.0,9,0,Sporting Goods;Shopping
11,Y0eMNa5C-YU1RQOZf9XvVA,"""CubeSmart Self Storage""",,"""2414 South Gilbert Road""",Chandler,AZ,85286,33.27172,-111.791257,5.0,23,1,Local Services;Self Storage
16,IQSlT5jGE6CCDhSG0zG3xg,"""T & Y Nail Spa""",,"""8411 W Thunderbird Rd, Unit 101""",Peoria,AZ,85381,33.608654,-112.240012,3.0,20,1,Beauty & Spas;Nail Salons


In [9]:
# Count restaurants to make sure there will be sufficient data.
# regex=False matches "Restaurants" as literal text; na=False makes a missing
# categories value count as "not a restaurant" instead of leaving NaN in the
# mask, which would make the boolean indexing raise.
len(business[business['categories'].str.contains("Restaurants", regex=False, na=False)])

10598

In [10]:
# Keep only rows whose categories mention "Restaurants".
# regex=False matches the text literally; na=False treats a missing categories
# value as "not a restaurant" instead of leaving NaN in the mask, which would
# make the boolean indexing raise.
business = business[
    business['categories'].str.contains("Restaurants", regex=False, na=False)
]

In [11]:
# Row count after filtering; should equal the restaurant count computed earlier
business.shape[0]

10598

In [12]:
# Drop the unneeded "neighborhood" column.
# Reassign rather than use inplace=True: this matches how the filter cells
# above rebind `business`, and avoids mutating in place a frame that was
# derived from boolean selections.
business = business.drop(columns=["neighborhood"])

In [13]:
# Check the first rows to confirm the neighborhood column is gone
business.head(n=5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
45,rDMptJYWtnMhpQu_rRXHng,"""McDonald's""","""719 E Thunderbird Rd""",Phoenix,AZ,85022,33.60707,-112.064382,1.0,10,1,Fast Food;Burgers;Restaurants
46,1WBkAuQg81kokZIPMpn9Zg,"""Charr An American Burger Bar""","""777 E Thunderbird Rd, Ste 107""",Phoenix,AZ,85022,33.60731,-112.063404,3.0,232,1,Burgers;Restaurants
72,iPa__LOhse-hobC2Xmp-Kw,"""McDonald's""","""1635 E Camelback Rd""",Phoenix,AZ,85016,33.508765,-112.04624,3.0,34,1,Restaurants;Burgers;Fast Food
80,kKx8iCJkomVQBdWHnmmOiA,"""Little Caesars Pizza""","""10720 E Southern Ave""",Mesa,AZ,85209,33.394877,-111.600194,2.5,4,1,Restaurants;Pizza
88,YhV93k9uiMdr3FlV4FHjwA,"""Caviness Studio""","""""",Phoenix,AZ,85001,33.449967,-112.070223,5.0,4,1,Marketing;Men's Clothing;Restaurants;Graphic D...


In [14]:
# Check that the open/closed split (is_open) is large enough on both sides for comparison
business.loc[:, 'is_open'].value_counts()

1    7367
0    3231
Name: is_open, dtype: int64

In [15]:
# Write the filtered table to disk; index=True is the pandas default, so the
# row labels are written as the first (unnamed) column
business.to_csv(path_or_buf='filteredbusiness.csv', index=True)