## 4.0. Load Packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import numpy as np

## 4.1. Load Business Category Data

In [2]:
# Load the Yelp category taxonomy: a list of records that each carry "alias", "title" and "parents".
# JSON is UTF-8 by specification, so pin the encoding instead of relying on the platform default.
with open("categories.json", encoding="utf-8") as json_file:
    categories = json.load(json_file)

# Lookup tables between a category's short alias (e.g. "beautysvc") and its
# display title (e.g. "Beauty & Spas"), in both directions.
category_dict = {category["alias"]: category["title"] for category in categories}
category_dict_reverse = {category["title"]: category["alias"] for category in categories}

# Top-level categories are the ones with an empty "parents" list; keep them as aliases and as titles.
categories_of_interest_raw = [category["alias"] for category in categories if category["parents"] == []]
categories_of_interest = [category_dict[cat] for cat in categories_of_interest_raw]

print(categories_of_interest)
print(categories_of_interest_raw)

['Active Life', 'Arts & Entertainment', 'Automotive', 'Beauty & Spas', 'Bicycles', 'Education', 'Event Planning & Services', 'Financial Services', 'Food', 'Health & Medical', 'Home Services', 'Hotels & Travel', 'Local Flavor', 'Local Services', 'Mass Media', 'Nightlife', 'Pets', 'Professional Services', 'Public Services & Government', 'Religious Organizations', 'Restaurants', 'Shopping']
['active', 'arts', 'auto', 'beautysvc', 'bicycles', 'education', 'eventservices', 'financialservices', 'food', 'health', 'homeservices', 'hotelstravel', 'localflavor', 'localservices', 'massmedia', 'nightlife', 'pets', 'professional', 'publicservicesgovt', 'religiousorgs', 'restaurants', 'shopping']


## 4.2. Business Data

### Load Dataset

In [3]:
# Read the cleaned business table; the file is decoded as latin-1.
businesses = pd.read_csv(
    "YelpChallengeWMetros_Clean.csv",
    encoding="latin-1",
)

### Process Categories

In [4]:
# Create one boolean indicator column per top-level category, named "is_<alias>"
# (such as is_restaurants or is_shopping). A business may belong to multiple categories.
businesses_categories = businesses.copy()
category_strings = businesses_categories["categories"]

for category_str in categories_of_interest:
    colname = "is_" + category_dict_reverse[category_str]
    # regex=False: the title is matched as literal text rather than compiled as a pattern.
    # na=False: businesses with a missing category string get False directly, instead of
    #           producing NaN that has to be filled afterwards.
    # NOTE(review): this is a case-sensitive substring test, so a title such as "Food" also
    # matches inside longer category names that contain it — confirm that is intended.
    businesses_categories[colname] = category_strings.str.contains(category_str, regex=False, na=False)
businesses_categories.head(3)

Unnamed: 0,business,name,categories,stars,review_count,address,city,state,longitude,latitude,...,is_localflavor,is_localservices,is_massmedia,is_nightlife,is_pets,is_professional,is_publicservicesgovt,is_religiousorgs,is_restaurants,is_shopping
0,N3_Gs3DnX4k9SgpwJxdEfw,Lane Wells Jewelry Repair,"Shopping, Jewelry Repair, Appraisal Services, ...",5,30,"7801 N Lamar Blvd, Ste A140",Austin,TX,-97.711458,30.346169,...,False,True,False,False,False,False,False,False,False,True
1,NVfOn7TdnHbaGH97CVB_Qg,McKinley Chiropractic,"Chiropractors, Health & Medical",5,5,"5625 Eiger Rd, Ste 160",Austin,TX,-97.857409,30.244902,...,False,False,False,False,False,False,False,False,False,False
2,Xw8tuI30T-xihpzwBV-zJg,El Pollo Rey,"Food Trucks, Restaurants, Specialty Food, Food...",5,11,1725 E Riverside Dr,Austin,TX,-97.730141,30.243493,...,False,False,False,False,False,False,False,False,True,False


## 4.3. Counts Data (all metro areas combined) and num_businesses

### counts_df

In [5]:
# Reads each metro's previously calculated <metro>_counts_final_v2.csv file, tags its rows with
# the metro name, and concatenates them into 1 file for the analysis below.
metros = ['Boston', 'Portland', 'Austin', 'Orlando', 'Atlanta', 'Vancouver', 'Columbus', 'Boulder']
suffix = "_v2.csv"

counts_frames = []
for metro in metros:
    counts_df_metro = pd.read_csv("output_urbcomp/" + metro + "_counts_final" + suffix)
    counts_df_metro["metroarea"] = metro
    counts_frames.append(counts_df_metro)

# Concatenate once instead of growing a DataFrame inside the loop (which re-copies all
# previously loaded rows on every iteration). Each file's own row index is kept as-is.
counts_df = pd.concat(counts_frames)

counts_df.to_csv("final_output_urbcomp/all_counts_final_v2.csv", index=False)

#### num_businesses

In [6]:
# Outputs a dataframe with 1 business category per row: the number of businesses in that
# category, followed by, for each selected count column, the number of those businesses whose
# value in that column is non-zero.

categories_of_interest_small = ['active', 'arts', 'beautysvc', 'food', 'hotelstravel', 'nightlife', 'restaurants', 'shopping']

# Restaurants take precedence: any other category flag is cleared on rows that are also
# flagged is_restaurants, so a restaurant is only counted under "restaurants".
counts_df_mod_noprofs = counts_df.copy()
not_restaurant = counts_df_mod_noprofs["is_restaurants"] == False
for flag_col in ["is_" + cat for cat in categories_of_interest_small if cat != "restaurants"]:
    counts_df_mod_noprofs[flag_col] = np.logical_and(counts_df_mod_noprofs[flag_col] == True, not_restaurant)

# Count columns are selected by position (columns 3..44 of counts_df).
# NOTE(review): presumably the per-word counts plus the family/romantic/friendship/professional
# group totals, as in the displayed output — confirm if the column layout of the input changes.
word_cols = list(counts_df.columns[3:45])

df_rows = []
for category in categories_of_interest_small:
    counts_category = counts_df_mod_noprofs.query("is_" + category)
    # Per column: how many businesses in this category have a non-zero entry.
    nonzero_per_col = np.count_nonzero(counts_category[word_cols], axis=0)
    df_rows.append([category, len(counts_category), *nonzero_per_col])

num_businesses_df = pd.DataFrame(df_rows, columns=["category", "num_businesses"] + word_cols)

num_businesses_df.to_csv("final_output_urbcomp/num_businesses.csv", index=False)
num_businesses_df.head(3)

Unnamed: 0,category,num_businesses,aunt,bff,boo,boss,boyfriend,brother,child,classmate,...,sister,son,spouse,teacher,uncle,wife,family,romantic,friendship,professional
0,active,8726,69,25,10,63,1178,372,1906,38,...,688,1972,37,94,47,1739,5299,1963,2803,454
1,arts,4228,57,32,18,49,899,286,809,22,...,546,839,29,30,28,1082,2741,1400,1865,252
2,beautysvc,16154,269,108,29,159,2330,437,1078,19,...,2649,1856,68,17,28,2388,10033,3668,6561,981


## 4.4. counts_df_norm and table

#### Initial Data Prep

In [7]:
# Reads each metro's previously calculated <metro>_counts_norm_final_v2.csv file, tags its rows
# with the metro name, and appends them all together.

metros = ['Boston', 'Portland', 'Austin', 'Orlando', 'Atlanta', 'Vancouver', 'Columbus', 'Boulder']
suffix = "_v2.csv"

counts_norm_frames = []
for metro in metros:
    counts_df_norm_metro = pd.read_csv("output_urbcomp/" + metro + "_counts_norm_final" + suffix)
    counts_df_norm_metro["metroarea"] = metro
    counts_norm_frames.append(counts_df_norm_metro)

# Concatenate once instead of growing a DataFrame inside the loop (which re-copies all
# previously loaded rows on every iteration). Each file's own row index is kept as-is.
counts_df_norm = pd.concat(counts_norm_frames)

counts_df_norm.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,is_localflavor,is_localservices,is_massmedia,is_nightlife,is_pets,is_professional,is_publicservicesgovt,is_religiousorgs,is_restaurants,is_shopping
0,buF9druCkbuXLX526sGELQ,84,250.0,0.0,0.0,0.0,0.0,23.809524,0.0,0.0,...,False,False,False,True,False,False,False,False,True,False
1,RA4V8pr014UyUbDvI-LW2A,60,66.666667,0.0,0.0,0.0,0.0,0.0,0.0,33.333333,...,False,False,False,False,False,False,False,False,False,True
2,xGXzsc-hzam-VArK6eTvtw,324,225.308642,0.0,0.0,0.0,0.0,24.691358,3.08642,3.08642,...,False,False,False,True,False,False,False,False,True,False


#### Table

In [8]:
# Metro names in alphabetical order, and the columns of counts_df_norm at positions 3..40
# (selected by position) that the table below averages.
metros = sorted(['Boston', 'Portland', 'Austin', 'Orlando', 'Atlanta', 'Vancouver', 'Columbus', 'Boulder'])
cols = counts_df_norm.columns[3:41].tolist()

In [9]:
# Builds the full table of mean normalized counts: one row per column in `cols` ("Word"),
# a "Total" column averaged over all metros, and one column per metro. Rows are ordered by
# "Total", descending.

df = pd.DataFrame(counts_df_norm[cols].mean().sort_values(ascending=False))

for metro in metros:
    # counts_df_norm is the concatenation of the per-metro files, each tagged with "metroarea",
    # so the metro's rows are selected in memory instead of re-reading its CSV from disk.
    # The mask is passed as a plain array so the selection is purely positional.
    is_metro = (counts_df_norm["metroarea"] == metro).to_numpy()
    # The assigned Series aligns on the word index, so no per-metro sorting is needed.
    df[metro] = counts_df_norm.loc[is_metro, cols].mean()
df = df.reset_index()
df = df.rename(columns={"index": "Word", 0: "Total"})
df.head(3)

Unnamed: 0,Word,Total,Atlanta,Austin,Boston,Boulder,Columbus,Orlando,Portland,Vancouver
0,friend,33.804432,38.170363,29.798229,36.803044,27.865674,31.683689,25.935353,29.254275,50.024824
1,husband,25.995269,23.441352,27.714613,23.072557,26.335683,33.492668,33.658416,26.555384,17.08661
2,wife,17.656021,14.339346,17.078982,16.194896,18.335786,25.496619,24.312638,17.524192,12.630865


In [10]:
# Write the table as LaTeX (floats with two decimals, no index column) to a text file.
df.to_latex("output_urbcomp/table.txt", index=False, float_format="%.2f")

  df.to_latex(buf="output_urbcomp/table.txt", float_format="%.2f", index=False)


#### counts_df_norm file

In [11]:
# NOTE: from here on categories_of_interest holds category aliases; it replaces the list of
# category titles bound to the same name earlier in the notebook.
categories_of_interest = ['active', 'arts', 'beautysvc', 'food', 'hotelstravel', 'nightlife', 'restaurants', 'shopping']

# Build the expression for the query below: it selects the rows where at least one of the
# chosen "is_<category>" columns is True.
flag_names = ["is_" + cat for cat in categories_of_interest]
q = " or ".join(flag_names)
print(q)

is_active or is_arts or is_beautysvc or is_food or is_hotelstravel or is_nightlife or is_restaurants or is_shopping


In [12]:
# Keep the rows matching the query built above, then make "restaurants" exclusive:
# a business with is_restaurants=True gets every other category flag (e.g. is_shopping)
# set to False, so a restaurant cannot also be counted under another category.

final_table = counts_df_norm.query(q).copy()

# is_restaurants itself is never rewritten in the loop, so this mask can be computed once.
not_restaurant = final_table["is_restaurants"] == False
for cat in categories_of_interest:
    if cat == "restaurants":
        continue
    flag_col = "is_" + cat
    final_table[flag_col] = np.logical_and(final_table[flag_col] == True, not_restaurant)

In [13]:
# Save the filtered table, without the row index.
final_table.to_csv(
    "final_output_urbcomp/all_counts_norm_final_v2.csv",
    index=False,
)

#### counts_df_norm_formaps file

In [16]:
# Columns of final_table that are used for mapping: identifiers and counts, the four
# relationship-group columns, the business attributes, and one "is_<category>" flag per
# category of interest. They are written to a separate dataset in the next cell.
cols_of_interest = [
    'business_id', 'num_reviews', 'num_relationship_words',
    'family', 'romantic', 'friendship', 'professional',
    'name', 'categories', 'stars', 'review_count',
    'address', 'city', 'state', 'longitude', 'latitude', 'metroarea',
    *("is_" + cat for cat in categories_of_interest),
]

In [17]:
# Select just the mapping columns and save them, without the row index.
counts_df_norm_fewcols = final_table.loc[:, cols_of_interest]
counts_df_norm_fewcols.to_csv(
    "final_output_urbcomp/all_counts_norm_fewcols_formaps.csv",
    index=False,
)