# Feature Engineering

In this notebook, we generate additional features that will help us come up with a more predictive model.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

In [2]:
# Read unprocessed data (old and new datasets already merged)
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df_unprocessed = pd.read_pickle('./yelp_dataset_processed/yelp_df_all_final.pkl')
# The review file is read as JSON Lines (one JSON record per line, hence lines=True)
df_reviews = pd.read_json('./yelp_old/yelp_training_set/yelp_training_set_review.json',lines=True)

In [3]:
# Preview the first rows of the merged business table
df_unprocessed.head()

Unnamed: 0,business_id,categories,categories_new,city,coordinates.latitude,coordinates.longitude,display_phone,found,found2,full_address,...,photos,price,rating,review_count,review_count_new,stars,state,transactions,type,url
0,PzOqRohWw7F7YEPBz6AubA,"[Food, Bagels, Delis, Restaurants]","[{'alias': 'bagels', 'title': 'Bagels'}, {'ali...",Glendale,33.7136,-112.2,(623) 825-0355,1,1,"6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...",...,[https://s3-media4.fl.yelpcdn.com/bphoto/efZ-8...,$,3.5,14,77,3.5,AZ,[],business,https://www.yelp.com/biz/hot-bagels-and-deli-g...
1,qarobAbxGSHI7ygf1f7a_Q,"[Sandwiches, Restaurants]","[{'alias': 'sandwiches', 'title': 'Sandwiches'...",Gilbert,33.3787,-111.813,(480) 632-6453,1,1,"891 E Baseline Rd\nSuite 102\nGilbert, AZ 85233",...,[https://s3-media2.fl.yelpcdn.com/bphoto/eTDf8...,$,3.0,10,29,3.5,AZ,[pickup],business,https://www.yelp.com/biz/jersey-mikes-subs-gil...
2,JxVGJ9Nly2FFIs_WpJvkug,"[Pizza, Restaurants]","[{'alias': 'italian', 'title': 'Italian'}, {'a...",Scottsdale,33.6175,-111.926,(480) 321-8800,1,1,"14418 N Scottsdale Rd\nSuite 181\nScottsdale, ...",...,[https://s3-media2.fl.yelpcdn.com/bphoto/1lTHQ...,$$,3.5,55,163,4.0,AZ,[pickup],business,https://www.yelp.com/biz/sauce-pizza-and-wine-...
3,Jj7bcQ6NDfKoz4TXwvYfMg,"[Burgers, Restaurants]","[{'alias': 'burgers', 'title': 'Burgers'}, {'a...",Phoenix,33.567,-112.116,(602) 870-1111,1,1,"8941 N Black Canyon Hwy\nPhoenix, AZ 85021",...,[https://s3-media4.fl.yelpcdn.com/bphoto/gcmpO...,$$,3.5,23,81,4.0,AZ,[],business,https://www.yelp.com/biz/fuddruckers-phoenix-3...
4,4IAzFJ159GEaIGX1-y6Bmw,"[Burgers, Fast Food, Restaurants]","[{'alias': 'hotdogs', 'title': 'Fast Food'}, {...",Scottsdale,33.5819,-111.882,(480) 451-1803,1,1,"9251 E Shea Blvd\nScottsdale, AZ 85258",...,[https://s3-media4.fl.yelpcdn.com/bphoto/ofHjs...,$,2.5,3,16,2.5,AZ,[],business,https://www.yelp.com/biz/mcdonalds-scottsdale-...


In [4]:
# Preview the first rows of the review table
df_reviews.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,"{'funny': 0, 'useful': 5, 'cool': 2}"
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,"{'funny': 0, 'useful': 0, 'cool': 0}"
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,"{'funny': 0, 'useful': 1, 'cool': 0}"
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,"{'funny': 0, 'useful': 2, 'cool': 1}"
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,"{'funny': 0, 'useful': 0, 'cool': 0}"


Let's list all the columns to think about our options for feature engineering.

In [5]:
# Show every column name available in the business table
df_unprocessed.columns

Index(['business_id', 'categories', 'categories_new', 'city',
       'coordinates.latitude', 'coordinates.longitude', 'display_phone',
       'found', 'found2', 'full_address', 'hours', 'id', 'image_url', 'index',
       'is_claimed', 'is_closed', 'latitude', 'location.address1',
       'location.address2', 'location.address3', 'location.city',
       'location.country', 'location.cross_streets',
       'location.display_address', 'location.state', 'location.zip_code',
       'longitude', 'name', 'name_new', 'neighborhoods', 'open', 'phone',
       'photos', 'price', 'rating', 'review_count', 'review_count_new',
       'stars', 'state', 'transactions', 'type', 'url'],
      dtype='object')

Each encrypted business_id from the df_unprocessed dataframe has corresponding entries in the df_reviews dataframe with the same business_id.

In [6]:
# Select every review whose business_id matches the row labelled 0 in df_unprocessed
df_reviews.loc[
    df_reviews['business_id'] == df_unprocessed['business_id'][0]
]

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
44151,PzOqRohWw7F7YEPBz6AubA,2011-12-05,BNzJxKw6xPVQFDaVIaqIxg,4,FINALLY a decent place for breakfast and lunch...,review,YLE6GoJ61fb0QBF7JTkaJg,"{'funny': 5, 'useful': 6, 'cool': 6}"
58080,PzOqRohWw7F7YEPBz6AubA,2011-12-08,AeNG9-zB6SeozTFRx6cL-A,4,Stopped in to grab a couple coffees and a bage...,review,A7B8l4bO15tVkpD-5TD2zQ,"{'funny': 0, 'useful': 0, 'cool': 2}"
67427,PzOqRohWw7F7YEPBz6AubA,2012-07-09,BOpB0T-LbmW2g2D09L7ZGQ,5,Absolutely love this place. The bagels are ma...,review,s5VnR2vM9zZAqchGQiJtpQ,"{'funny': 0, 'useful': 0, 'cool': 0}"
82361,PzOqRohWw7F7YEPBz6AubA,2012-12-10,q2UHNFulCcXdATnn7XxnlA,5,"You can not go wrong with this place, any bage...",review,31OZmHva7Q0dTNqpM4TKug,"{'funny': 0, 'useful': 0, 'cool': 0}"
101541,PzOqRohWw7F7YEPBz6AubA,2012-03-21,0VJ8CyG-411PkKOm2Q0BEQ,5,Love this place! I'm so glad they opened up i...,review,4iyTyF7726rzcg-LhP73-w,"{'funny': 0, 'useful': 0, 'cool': 0}"
105139,PzOqRohWw7F7YEPBz6AubA,2011-12-20,dT6tQnOc5Rz9ibM6UjDLLw,4,Good stuff...tho will overcharge you on items ...,review,GU6rcPRADsnXvaMFShimhQ,"{'funny': 0, 'useful': 0, 'cool': 0}"
120885,PzOqRohWw7F7YEPBz6AubA,2012-12-30,GS1ImaW939GCETew_wm8Qw,3,Food was okay. Service was good.,review,skry53Pog0dkLcOVMS2Haw,"{'funny': 0, 'useful': 0, 'cool': 0}"
122698,PzOqRohWw7F7YEPBz6AubA,2012-09-19,Zlz3mLJ3DIZtmwiiOzz9sg,4,Can't believe that I have not tried this place...,review,HeKxpZpPd_WweAuZSBZ_qg,"{'funny': 0, 'useful': 1, 'cool': 0}"
130027,PzOqRohWw7F7YEPBz6AubA,2011-12-05,gEOGanMecyDrUA4VR-5SMw,4,Am I back in NJ??? I almost thought so after p...,review,2LdmDvRyrXgFsVL1KoRfRA,"{'funny': 0, 'useful': 1, 'cool': 0}"
170534,PzOqRohWw7F7YEPBz6AubA,2012-11-28,8t-xdqF9D5AwC3nqBMuXvg,1,Went in one time. Bagels are way too salty!! L...,review,-HKgcpX2B-vAYjBmnT__4Q,"{'funny': 0, 'useful': 1, 'cool': 0}"


Now we can generate features using the reviews metadata.

In [7]:
from datetime import datetime
from sklearn.linear_model import LinearRegression

regr = LinearRegression()

# Review-derived feature columns; a row keeps NaN when its business has no reviews
df_unprocessed['reviews_per_week'] = np.nan
df_unprocessed['oldest_review'] = np.nan # weeks between the oldest review and Jan 1 2014
df_unprocessed['std_of_stars'] = np.nan
df_unprocessed['median_of_stars'] = np.nan
df_unprocessed['stars_linear_coef'] = np.nan
df_unprocessed['reactions_per_week'] = np.nan

# Fixed reference date used to measure how old a business's first review is
# (the reviews are expected to all be older than this date)
reference_date = datetime.strptime('Jan 1 2014','%b %d %Y')

# Compute the review-based features one business at a time
for i, restaurant in df_unprocessed.iterrows():
    # progress indicator
    if i % 100 == 0:
        print(i)

    reviews = df_reviews[df_reviews['business_id'] == restaurant['business_id']]
    if reviews.empty:
        # nothing to aggregate: leave this row's features as NaN
        continue

    nreviews = len(reviews)
    # proxy for oldness: weeks elapsed between the oldest review and the reference date
    # (days are divided by 7 to obtain weeks)
    weeks_since_older_review = (reference_date - reviews['date'].min()).days / 7.
    # total number of funny/useful/cool votes received over all reviews
    nreactions = sum(
        votes['funny'] + votes['useful'] + votes['cool'] for votes in reviews['votes']
    )

    # convert dates to integers (proleptic Gregorian ordinals) so they can be regressed on
    dates = np.asarray([d.toordinal() for d in reviews['date']])
    # fit linear regression model to find coefficient (do stars increase or decrease?)
    regr.fit(dates.reshape(-1,1),reviews['stars'].values.reshape(-1,1))

    # Frequencies are measured over the time since the oldest review rather than over the
    # span between oldest and newest review: with the latter, restaurants with a small
    # number of reviews that came in the same period would seem to be reviewed very frequently
    df_unprocessed.loc[i,'reviews_per_week'] = nreviews / weeks_since_older_review
    df_unprocessed.loc[i,'oldest_review'] = weeks_since_older_review
    df_unprocessed.loc[i,'std_of_stars'] = np.std(reviews['stars'])
    df_unprocessed.loc[i,'median_of_stars'] = np.median(reviews['stars'])
    df_unprocessed.loc[i,'reactions_per_week'] = nreactions / weeks_since_older_review
    # slope of stars versus date, in stars per day
    df_unprocessed.loc[i,'stars_linear_coef'] = regr.coef_[0][0]

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300


Now we will make a dictionary of restaurant categories that will help us calculate relative quantities. The keys are the restaurant categories and the values are the indices of the restaurants that belong in each category.

In [8]:
# Create a dictionary mapping each category alias to the index labels of the
# restaurants that carry that category
category_dic = {}
for i, restaurant in df_unprocessed.iterrows():
    for category in restaurant['categories_new']:
        # setdefault creates the list the first time an alias is seen, so no
        # exception handling is needed and unrelated errors are not swallowed
        category_dic.setdefault(category['alias'], []).append(i)

In [9]:
# Map the dollar-sign price tiers onto the integers 1 (cheapest) to 4 (most expensive)
df_unprocessed = df_unprocessed.replace(
    {
        'price': {
            '$': 1,
            '$$': 2,
            '$$$': 3,
            '$$$$': 4,
        }
    }
)

Here we calculate the restaurant density within a 1-mile radius, along with other relative quantities. The z\* quantities express each restaurant's value relative to the same quantity for the surrounding restaurants (those within 1 mile, the restaurant itself included).

In [15]:
# Great-circle (haversine) distance helper
import math
def distance(ilat,jlat,ilong,jlong):
    """Return the great-circle distance in miles between two locations.

    Uses the haversine formula on a sphere of radius 6371 km.

    Parameters
    ----------
    ilat, jlat : float
        Latitudes of the first and second location, in degrees.
    ilong, jlong : float
        Longitudes of the first and second location, in degrees.

    Returns
    -------
    float
        Distance in miles. NaN if any coordinate is NaN.
    """
    R = 6371.e3 # earth radius in m
    phi1 = math.radians(ilat)
    phi2 = math.radians(jlat)
    deltaphi = math.radians(jlat-ilat)
    deltalambda = math.radians(jlong-ilong)
    a = math.sin(deltaphi/2)**2 + math.cos(phi1)*math.cos(phi2) * math.sin(deltalambda/2)**2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for antipodal points), which
    # would make sqrt(1-a) raise ValueError. Explicit comparisons are used instead of
    # min/max so that a NaN value is left untouched and still propagates to the result.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = R * c / 1609.34 # metres to miles
    return d

# Neighbourhood feature columns, filled in row by row below
df_unprocessed['restaurant_density'] = np.nan
df_unprocessed['restaurant_similar_density'] = np.nan
df_unprocessed['zprice_all'] = np.nan
df_unprocessed['zreview_count_all'] = np.nan
df_unprocessed['zreview_per_week_all'] = np.nan
df_unprocessed['zstar_all'] = np.nan

# A restaurant is flagged as a chain when its name_new occurs more than once in the table.
# Computed once for the whole frame instead of rescanning the frame for every row;
# rows with a missing name_new map to NaN and are therefore flagged False.
name_counts = df_unprocessed['name_new'].value_counts()
df_unprocessed['is_chain'] = df_unprocessed['name_new'].map(name_counts).gt(1)

# Radius, in miles, that defines the neighbourhood of a restaurant
radius_miles = 1

# Snapshots of the columns read in the inner loop (none of them is modified below),
# taken once so the inner loop does not have to build a row object per restaurant
labels = df_unprocessed.index.tolist()
latitudes = df_unprocessed['coordinates.latitude'].tolist()
longitudes = df_unprocessed['coordinates.longitude'].tolist()
prices = df_unprocessed['price'].tolist()
review_counts = df_unprocessed['review_count'].tolist()
reviews_per_week = df_unprocessed['reviews_per_week'].tolist()
stars = df_unprocessed['stars'].tolist()

for i, restaurant in df_unprocessed.iterrows():
    ilong = restaurant['coordinates.longitude']
    ilat = restaurant['coordinates.latitude']

    # Positions of all restaurants within the radius (the restaurant itself included,
    # so it takes part in the statistics below)
    nearby = [
        pos
        for pos, (jlat, jlong) in enumerate(zip(latitudes, longitudes))
        if distance(ilat, jlat, ilong, jlong) <= radius_miles
    ]
    nearby_labels = {labels[pos] for pos in nearby}

    # Count nearby restaurants that share a category with this one.
    # A restaurant sharing several categories is counted once per shared category;
    # that's ok as it adds a weight for very similar restaurants
    similar_count = sum(
        1
        for category in restaurant['categories_new']
        for member in category_dic[category['alias']]
        if member in nearby_labels
    )

    # Values of the "all" properties over every nearby restaurant
    price_all_list = [prices[pos] for pos in nearby]
    review_count_all_list = [review_counts[pos] for pos in nearby]
    review_per_week_all_list = [reviews_per_week[pos] for pos in nearby]
    star_all_list = [stars[pos] for pos in nearby]

    # price and stars are not divided by standard deviation because it is often equal to 0;
    # they are scaled by their maximum value (4 price levels, 5 stars) instead
    df_unprocessed.loc[i,'zprice_all'] = (restaurant['price']-np.nanmean(price_all_list))/4.
    # NOTE: when the neighbourhood standard deviation is 0 (e.g. the restaurant is alone
    # within the radius) these two divisions emit a RuntimeWarning and yield NaN
    df_unprocessed.loc[i,'zreview_count_all'] = (restaurant['review_count']-np.nanmean(review_count_all_list))/np.nanstd(review_count_all_list)
    df_unprocessed.loc[i,'zreview_per_week_all'] = (restaurant['reviews_per_week']-np.nanmean(review_per_week_all_list))/np.nanstd(review_per_week_all_list)
    df_unprocessed.loc[i,'zstar_all'] = (restaurant['stars']-np.nanmean(star_all_list))/5.

    df_unprocessed.loc[i,'restaurant_density'] = len(nearby)
    df_unprocessed.loc[i,'restaurant_similar_density'] = similar_count

    # progress indicator
    if i % 100 == 0:
        print(i)

0




100
200
300
400




500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300


Check if the new calculation for is_chain worked.

In [17]:
# Display each restaurant's name next to its chain flag for a visual sanity check
df_unprocessed.loc[:, ['name', 'is_chain']]

Unnamed: 0,name,is_chain
0,Hot Bagels & Deli,True
1,Jersey Mike's Subs,True
2,Sauce,True
3,Fuddruckers,True
4,McDonald's,True
5,Peter Piper Pizza,True
6,Panda Garden,False
7,Lenny's Burger Shop,True
8,Five Guys Burgers and Fries,True
9,Sonic Drive-In,True


In [18]:
# Save the dataframe, now including the added feature columns, to disk as a pickle
df_unprocessed.to_pickle('./yelp_dataset_processed/yelp_df_more_features_all_final.pkl')

In [21]:
# Number of distinct category aliases collected in category_dic
len(category_dic)

168