# ENSF 544 Data Science for Software Engineers
## Phase 1
#### By: Kyle Friedt, Erslan Salman, Kelvin Tran and Avneet Gill

In [4]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

# returns array of addresses and postal codes of restuarants 
def get_addresses(url):
    addresses = []
    postalcodes = []
    
    zomato_html = get_page_html(url)
    parser = BeautifulSoup(zomato_html, 'html.parser')
    address_tag = parser.find_all("div", class_ = "col-m-16 search-result-address grey-text nowrap ln22")
    
    for address in address_tag:
        postal = None
        temp  = address.get_text()
        temp = temp.strip("\n")
        temp = temp.strip()
        
        #looks for 6 digit postal code in Canadian format
        if re.search('[A-Za-z]\d[A-Za-z][ -]?\d[A-Za-z]\d$',temp) != None:
            postal = re.findall('[A-Za-z]\d[A-Za-z][ -]?\d[A-Za-z]\d$',temp)[0]
            temp = temp.replace(postal,"")
            postal = postal.replace(" ","")
            postal = postal.replace("-","")
            
        #looks for 3 digit postal code and removes it
        elif re.search('[A-Za-z]\d[A-Za-z]$',temp) != None:
            postal = re.findall('[A-Za-z]\d[A-Za-z]',temp)[0]
            temp = temp.replace(postal,"")
            
        addresses.append(temp)
        postalcodes.append(postal)
 
    return addresses, postalcodes
    
def get_titles(url):
    titles = []

    zomato_html = get_page_html(url)
    parser = BeautifulSoup(zomato_html, 'html.parser')
    
    title_tag = parser.find_all("a", class_ = "result-title" )
    
    for title in title_tag:
        temp  = title.get_text()
        temp = temp.strip("\n")
        temp = temp.strip()
        titles.append(temp)
    
    return titles


# gets the average rating and the number of ratings for each restaurant on the page
def get_rating(page_url):
    zomato_html = get_page_html(page_url)
    parser = BeautifulSoup(zomato_html, 'html.parser')
    
    rows = parser.find_all("div", class_="search-snippet-card")
    rating_list = []
    num_rating_list = []
    for row in rows:
        rating = row.find("span", class_="rating-value")
        if rating is not None:
            rating = float(rating.text)
        num_ratings = row.find("span", class_="review-count")
        if num_ratings is not None:
            num_ratings = int(re.search("([0-9]*)",num_ratings.text.strip().replace(',','').replace("(",'')).group(1))
        rating_list.append(rating)
        num_rating_list.append(num_ratings)
    
    return rating_list, num_rating_list

# gets an array of cuisine types for each restaurant on the page
def get_cuisines(page_url):
    zomato_html = get_page_html(page_url)
    parser = BeautifulSoup(zomato_html, 'html.parser')
    
    rows = parser.find_all("div", class_="search-snippet-card")
    cuisine_list = []
    
    for row in rows:
        cuisines = []
        cuisine_tag = row.find("span", text = "Cuisines: ")
        #print(cuisine_tag)
        #print('test')
        if cuisine_tag is not None:
            for cuisine in cuisine_tag.parent.find_all('a'):
                #print(cuisine.text)
                #print('test')
                cuisines.append(cuisine.text)
                
        cuisine_list.append(cuisines)
        #print(cuisine_list)
        #print('test')
    return cuisine_list

    
def get_next_page(parser):
    links_array = []
    
    next_page = parser.find("a", class_="paginator_item next item")
    next_page = "https://www.zomato.com" + str(next_page['href'])
    next_page = next_page.split("=")
    stripped = next_page[0]
    return stripped

def get_number_of_pages(parser):
    next_page = parser.find("div", class_="col-l-4 mtop pagination-number")
    next_page = next_page.find_all('b')
    number = next_page[1].get_text()
    return number 

def get_all_links(url,number):
    links = []
    links.append(url)
    url_ = url + "="
    
    for x in range(2,int(number)+1):
        new_link = url_+str(x)
        links.append(new_link)
        
    return links
    

def get_page_html(url):    
    headers = {"User-Agent":"Mozilla/5.0"}
    source=requests.get(url, headers=headers).text
    return source


def get_resturant_data(parser):
    url = get_next_page(parser)
    number = get_number_of_pages(parser)
    links = get_all_links(url, number)
    titles = []
    addresses = []
    postal_codes = []
    ratings = []
    num_ratings = []
    cuisines = []
    for link in links:
        title = get_titles(link)
        titles = titles + title
        
        address, postal = get_addresses(link)
        addresses.extend(address)
        postal_codes.extend(postal)
        
        rating,num_rating = get_rating(link)
        ratings.extend(rating)
        num_ratings.extend(num_rating)
        
        cuisine = get_cuisines(link)
        cuisines.extend(cuisine)
        
        
        
        print("loading...")
    
    df = pd.DataFrame(titles, columns = ['name'])
    df['address'] = addresses
    df['postal_code'] = postal_codes
    df['rating'] = ratings
    df['num_ratings'] = num_ratings
    df['cuisines'] = cuisines
    return df

url = "https://www.zomato.com/calgary/restaurants"
zomato_html = get_page_html(url)
parser = BeautifulSoup(zomato_html, 'html.parser')

zomato_df = get_resturant_data(parser)
zomato_df

loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...
loading...

Unnamed: 0,name,address,postal_code,rating,num_ratings,cuisines
0,OEB Breakfast Co.,"825 1 Ave NE, Bridgeland, Calgary",T2E3J6,4.9,1463.0,"[American, Diner]"
1,Una Pizza and Wine,"618 17 Avenue SW, Calgary",T2S0B4,4.6,1708.0,"[Italian, Pizza]"
2,NOtaBLE,"4611 Bowness Rd NW, Calgary",T3B0S4,4.4,1297.0,[Canadian]
3,Charcut Roast House,"899 Centre St S, Calgary",T2G1B8,4.4,1464.0,"[European, French, Tapas]"
4,Blue Star Diner,"809 1 Ave NE, Calgary",T2E0C1,4.6,967.0,"[Breakfast, Burger]"
...,...,...,...,...,...,...
2994,Masala Twistz,"175 Chestermere Station Way, Chestermere",T1X1V3,,,[Indian]
2995,Little Caesars,"100 Marina Drive, Chestermere",T1X1N2,,,[Pizza]
2996,Edo Japan,"175 Chestermere Station Way, Suite 504, Cheste...",T1X0A4,,,"[Fast Food, Japanese]"
2997,Balzac Diner,"10070 Highway 566, Balzac, Alberta",,,,[Canadian]


### Run this to generate CSV from Zomato DF

In [13]:
zomato_df.to_csv("zomato.csv",index = False)

### Run this if you already have zomato.csv on your local Machine and dont want to query again

In [14]:
zomato_csv = pd.read_csv("zomato.csv")
zomato_csv['cuisines'] = zomato_csv['cuisines'].apply(lambda x: x.replace('[','').replace(']','').replace("'",'').split(','))

zomato_df = zomato_csv
zomato_df

Unnamed: 0,name,address,postal_code,rating,num_ratings,cuisines
0,Tres Carnales Taqueria,"10119 100A Street, Edmonton",,4.5,1656.0,[Mexican]
1,Corso 32,"10345 Jasper Avenue, Edmonton",T5J1Y7,4.5,851.0,[Italian]
2,Pampa Brazilian Steakhouse,"9929 109 St NW, Edmonton",T5K 1H6,4.2,958.0,"[Brazilian, International, Steak]"
3,Sabor Restaurant,"10220-103 Street, Edmonton",T5J0Y8,4.6,560.0,"[Seafood, Spanish, Portuguese]"
4,Sofra,10345 106th Street NW,T5J 0J2,4.5,393.0,[Mediterranean]
...,...,...,...,...,...,...
2060,Bloom & Bliss Flower and Coffee Shop,"5115 50 St, Evansburg",T0E0T0,,,[Coffee and Tea]
2061,Lily's Steak & Pizza,"4803 Queen St, Bruderheim",T0B0S0,3.5,5.0,"[Pizza, Steak]"
2062,H&D Burger Barn,"4620 50 St, Millet",T0C1Z0,,,"[Burger, Fast Food, Sandwich]"
2063,Evansburg Snackbar,"4918 50th Ave, Evansburg",T0A3A0,,,"[American, Burger, Fish and Chips]"


In [1]:
# open the json file
def get_yelp_json():
    path = 'business.json'
    df = pd.read_json(path, lines=True)
    print(df.shape)
    return df    

In [2]:
# initial cleaning get businesses that are open, in Calgary, and have > 2 stars
# also get relevent columns
def initial_yelp_clean(df):
    df = df[df.is_open == 1]
    df = df[df.city == 'Calgary']
    df = df.loc[:, ('name', 'city', 'address', 'postal_code', 'stars', 
                    'review_count', 'categories')]# .copy()
    df = df[df.stars > 2]
    print(df.shape)
    return df

In [3]:
# get rid of businesses with empty categories
# get rid of businesses without postal code (using postal code for merge)
def drop_yelp_nan(df):
    df['categories'].replace('', np.nan, inplace=True)
    df['postal_code'].replace('', np.nan, inplace=True)
    df = df.dropna()
    print(df.shape)
    return df

In [4]:
# Get businesses that only have Restaurant or Food as a category
def get_restaurants(df):
    tags = ['Restaurants', 'Food']
    mask = df.categories.apply(lambda x: any(item for item in tags if item in x))
    df = df[mask]
    print(df.shape)
    return df

In [5]:
# Exclude rows that have category tags that we don't want
def drop_tags(df):
    tags = ['Gas Stations', 'Convenience Stores', 'Grocery', 'Gyms', 
            'Hunting & Fishing Supplies']
    mask = df.categories.apply(lambda x: not any(item for item in tags if item in x))
    df = df[mask]
    print(df.shape)
    return df

In [6]:
# Get rid of businesses that have the name Liqour in it
def drop_liquor(df):
    df = df[~df['name'].str.contains('Liquor')]
    print(df.shape)
    return df

In [7]:
def clean_postal(x):
    if len(x) < 4:
        x = np.nan
    else:
        x = re.sub(' ', '', x)
    return x

def clean_postal_code(df):
    df.postal_code = df.postal_code.apply(clean_postal)
    return df

In [13]:
def clean_name(x):
    x = re.sub('&', 'and', x)
    x = x.lower()
    x = re.sub(r'[^\w\s]', '', x)
    return x

def clean_name_column(df):
    df.name = df.name.apply(clean_name)
    return df

In [10]:
import pandas as pd
import numpy as np
import re
# funciton calls to get json data and generate a clean dataframe

# generate dataframe with necessary columns
yelp_df = get_yelp_json()
yelp_df = initial_yelp_clean(yelp_df)
yelp_df = drop_yelp_nan(yelp_df)

# clean the data to exclude businesses that are not restaurants
yelp_df = get_restaurants(yelp_df)
yelp_df = drop_tags(yelp_df)
yelp_df = drop_liquor(yelp_df)
yelp_df = clean_postal_code(yelp_df)
yelp_df = clean_name_column(yelp_df)
yelp_df.to_csv('yelp.csv')
yelp_df

(209393, 14)
(5720, 7)
(5673, 7)
(2654, 7)
(2479, 7)
(2452, 7)
(2452, 7)


NameError: name 'clean_name_column' is not defined

### Run this if you want to read yelp Dframe from local csv 

In [None]:
yelp_csv = pd.read_csv("yelp.csv")
yelp_csv['categories_arr'] = yelp_csv['categories'].apply(lambda x: x.replace("'",'').split(','))
yelp_csv

In [15]:
merged = zomato_df.merge(yelp_df, how = "inner", left_on = ["name","postal_code"],right_on = ["name","postal_code"])
merged

Unnamed: 0,name,address_x,postal_code,rating,num_ratings,cuisines,city,address_y,stars,review_count,categories
0,Charcut Roast House,"899 Centre St S, Calgary",T2G1B8,4.4,1464.0,"[European, French, Tapas]",Calgary,101-899 Centre Street S,4.0,328,"Restaurants, Steakhouses"
1,The Himalayan,3218 17th Avenue SW,T3E0B3,4.9,748.0,"[Asian, Indian]",Calgary,3218 17 Avenue SW,4.5,234,"Restaurants, Himalayan/Nepalese"
2,The Coup,"924 17 Avenue SW, Calgary",T2T0A2,4.6,1113.0,"[Vegetarian, Desserts]",Calgary,924 17th Avenue SW,4.0,235,"Cocktail Bars, Vegetarian, Restaurants, Bars, ..."
3,Pfanntastic Pannenkoek Haus,"2439 54 Ave SW, Calgary",T3E1M4,4.7,940.0,[Cafe],Calgary,"Lincoln Park Shopping Centre, 2439 54 Avenue SW",4.5,126,"Restaurants, Breakfast & Brunch, Scandinavian"
4,Model Milk,"308 17 Ave SW, Calgary",T2S0A3,4.2,832.0,"[Seafood, Tapas, Fusion]",Calgary,308 17th Avenue,4.0,234,"American (New), Restaurants, Food, Canadian (New)"
...,...,...,...,...,...,...,...,...,...,...,...
430,Lindt Chocolate Shop,"4th Street SW, Suite 1403, Calgary",T2R0Y2,,,[Desserts],Calgary,"1403-4th Street South West, Suite 1403",5.0,16,"Food, Chocolatiers & Shops, Specialty Food, Ca..."
431,Rosso Coffee Roasters,"2102 Centre Street NE, Calgary",T2E2T3,,,"[Cafe, Coffee and Tea]",Calgary,2102 Centre Street NE,4.0,10,"Coffee & Tea, Coffee Roasteries, Bakeries, Food"
432,The Chopped Leaf,"4101-35 Mackenzie Way SW, Airdrie",,3.7,57.0,[Healthy Food],Calgary,"4916 130th Avenue SE, Unit 226",3.0,4,"Soup, Salad, Restaurants, Sandwiches"
433,The Chopped Leaf,"4101-35 Mackenzie Way SW, Airdrie",,3.7,57.0,[Healthy Food],Calgary,"8650 112 Avenue NW, Suite 7107",4.5,25,"Sandwiches, Restaurants, Salad, Soup"


In [None]:
def weighted_avg_rating(df):
    df['total_ratings'] = df.num_ratings + df.review_count
    df['yelp_wt'] = df.review_count / df.total_ratings
    df['zomato_wt'] = df.num_ratings / df.total_ratings
    df['y_wt_rate'] = df.yelp_wt * df.stars
    df['z_wt_rate'] = df.zomato_wt * df.ratings
    df['avg_wt'] = df.z_wt_rate + df.y_wt_rate
    # TODO drop unused columns
    return df


w_df = weighted_avg_rating(yelp_df)
w_df