# Text Mining

## 3.0. Load Packages

In [1]:
# Import packages to use later
import pandas as pd
import nltk
import time
import json

## 3.1. Process Data

In [2]:
# Metro to process; the full list considered was:
# ['Boston', 'Portland', 'Austin', 'Orlando', 'Atlanta', 'Vancouver', 'Columbus', 'Boulder']
metro = 'Boulder'

# Load the relationship-word counts file for the selected metro
counts_path = "output_urbcomp/" + metro + "_counts_v2.csv"
relationship_df_raw = pd.read_csv(counts_path)

In [3]:
# Preview the first three rows of the raw counts table
relationship_df_raw.head(n=3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,partner,professor,relationship,siblings,sister,son,spouse,teacher,uncle,wife
0,8zehGz9jnxPqXtOc7KaJxA,931,187,2,0,0,1,16,1,0,...,5,0,0,0,2,9,0,0,0,37
1,i9BDFBYcl_PGqrLbQUdMvg,207,22,0,0,0,0,3,0,0,...,0,0,0,0,1,0,0,0,0,1
2,GZgOQOlFVLgNMHBBjspHRw,43,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Put relationships into bins ("romantic", "family", etc.) for aggregation.
# Each non-blank line of the bins file is parsed as "<bin>:<word>,<word>,...".
with open("relationships/Relationships_ATUS_custom_v2_binned_update.txt") as f:
    relationships_binned_raw = [line.strip().lower() for line in f]

relationships_binned = dict()
for line in relationships_binned_raw:
    if not line:
        # A blank line (e.g. a trailing empty line) has no ":" and would make
        # the two-name unpack below raise ValueError
        continue
    # Split on the first colon only, so the unpack always sees at most two parts
    rel_type, rel_list = line.split(":", 1)
    # Strip whitespace around each word so "a, b" yields the names "a" and "b";
    # drop empty tokens left by a trailing comma
    relationships_binned[rel_type.strip()] = [
        word.strip() for word in rel_list.split(",") if word.strip()
    ]
print(relationships_binned)

{'family': ['child', 'daughter', 'son', 'parent', 'mother', 'father', 'brother', 'sister', 'siblings', 'aunt', 'uncle', 'niece', 'nephew', 'cousin', 'grandchild', 'grandmother', 'grandfather', 'grandparents'], 'romantic': ['partner', 'relationship', 'date', 'boo', 'fiancee', 'girlfriend', 'boyfriend', 'spouse', 'husband', 'wife'], 'friendship': ['bff', 'friend', 'housemate', 'neighbor'], 'professional': ['classmate', 'professor', 'teacher', 'coworker', 'client', 'boss']}


In [5]:
# Aggregate the per-word counts into one total column per relationship bin
bin_totals = {
    bin_name: relationship_df_raw[bin_words].sum(axis=1)
    for bin_name, bin_words in relationships_binned.items()
}

# assign() returns a new frame, so relationship_df_raw itself is left unchanged
relationship_df = relationship_df_raw.assign(**bin_totals)

relationship_df.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,sister,son,spouse,teacher,uncle,wife,family,romantic,friendship,professional
0,8zehGz9jnxPqXtOc7KaJxA,931,187,2,0,0,1,16,1,0,...,2,9,0,0,0,37,39,115,30,3
1,i9BDFBYcl_PGqrLbQUdMvg,207,22,0,0,0,0,3,0,0,...,1,0,0,0,0,1,2,7,13,0
2,GZgOQOlFVLgNMHBBjspHRw,43,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# Compute normalized counts: occurrences per PER_N_REVIEWS reviews of each business
MIN_REVIEWS = 30       # businesses with fewer reviews than this are dropped
PER_N_REVIEWS = 1000   # scale factor applied after dividing by num_reviews

# The explicit .copy() makes the filtered frame independent of relationship_df,
# so the column assignment below cannot raise SettingWithCopyWarning
relationship_df_norm = relationship_df[relationship_df["num_reviews"] >= MIN_REVIEWS].copy()

# Pick the columns to rescale by name rather than by position. The identifier and
# num_reviews are never rescaled, and the "*_pct" ratio columns (added to
# relationship_df by a later cell) are skipped if this cell is re-run afterwards.
count_cols = [
    col for col in relationship_df_norm.columns
    if col not in ("business_id", "num_reviews") and not col.endswith("_pct")
]
relationship_df_norm[count_cols] = (
    relationship_df_norm[count_cols].div(relationship_df_norm["num_reviews"], axis=0)
    * PER_N_REVIEWS
)

relationship_df_norm.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,sister,son,spouse,teacher,uncle,wife,family,romantic,friendship,professional
0,8zehGz9jnxPqXtOc7KaJxA,931,200.859291,2.148228,0.0,0.0,1.074114,17.185822,1.074114,0.0,...,2.148228,9.667025,0.0,0.0,0.0,39.742213,41.89044,123.523093,32.223416,3.222342
1,i9BDFBYcl_PGqrLbQUdMvg,207,106.280193,0.0,0.0,0.0,0.0,14.492754,0.0,0.0,...,4.830918,0.0,0.0,0.0,0.0,4.830918,9.661836,33.816425,62.801932,0.0
2,GZgOQOlFVLgNMHBBjspHRw,43,23.255814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.255814,0.0


In [7]:
# For each bin, add a "<bin>_pct" column holding the bin's count divided by the
# business's total relationship-word count (a fraction, not multiplied by 100)
total_relationship_words = relationship_df["num_relationship_words"]
for bin_name in relationships_binned:
    relationship_df[bin_name + "_pct"] = relationship_df[bin_name] / total_relationship_words

relationship_df.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,uncle,wife,family,romantic,friendship,professional,family_pct,romantic_pct,friendship_pct,professional_pct
0,8zehGz9jnxPqXtOc7KaJxA,931,187,2,0,0,1,16,1,0,...,0,37,39,115,30,3,0.208556,0.614973,0.160428,0.016043
1,i9BDFBYcl_PGqrLbQUdMvg,207,22,0,0,0,0,3,0,0,...,0,1,2,7,13,0,0.090909,0.318182,0.590909,0.0
2,GZgOQOlFVLgNMHBBjspHRw,43,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0.0,0.0,1.0,0.0


### Businesses

In [8]:
# Load the category definitions (each entry is read for "alias", "title" and "parents")
with open("categories.json") as json_file:
    categories = json.load(json_file)

# Two-way lookup between a category's alias and the title string that appears in the dataset
category_dict = {}
category_dict_reverse = {}
for entry in categories:
    category_dict[entry["alias"]] = entry["title"]
    category_dict_reverse[entry["title"]] = entry["alias"]

# Categories that we are interested in: the top-level ones, i.e. those with no parents.
# (An earlier hand-picked subset was: 'active', 'arts', 'beautysvc', 'food', 'hotelstravel',
#  'localflavor', 'nightlife', 'restaurants', 'shopping'.)
categories_of_interest_raw = [entry["alias"] for entry in categories if entry["parents"] == []]
categories_of_interest = [category_dict[alias] for alias in categories_of_interest_raw]

print(categories_of_interest)
print(categories_of_interest_raw)

['Active Life', 'Arts & Entertainment', 'Automotive', 'Beauty & Spas', 'Bicycles', 'Education', 'Event Planning & Services', 'Financial Services', 'Food', 'Health & Medical', 'Home Services', 'Hotels & Travel', 'Local Flavor', 'Local Services', 'Mass Media', 'Nightlife', 'Pets', 'Professional Services', 'Public Services & Government', 'Religious Organizations', 'Restaurants', 'Shopping']
['active', 'arts', 'auto', 'beautysvc', 'bicycles', 'education', 'eventservices', 'financialservices', 'food', 'health', 'homeservices', 'hotelstravel', 'localflavor', 'localservices', 'massmedia', 'nightlife', 'pets', 'professional', 'publicservicesgovt', 'religiousorgs', 'restaurants', 'shopping']


In [9]:
# Create indicator columns for each business category (a business may belong to multiple categories)
businesses = pd.read_csv("YelpChallengeWMetros_Clean.csv", encoding='latin-1')
businesses = businesses.rename(columns={"business": "business_id"})

# Parse the comma-separated "categories" string into a set of exact titles per
# business. Missing values become a set holding only "", so every indicator is False.
category_sets = (
    businesses["categories"]
    .fillna("")
    .str.split(",")
    .apply(lambda titles: {title.strip() for title in titles})
)

for category_str in categories_of_interest:
    colname = "is_" + category_dict_reverse[category_str]
    # Exact membership test. A substring/regex match would flag a top-level title
    # such as "Food" for a business listed only under "Food Trucks" or "Specialty Food".
    businesses[colname] = category_sets.apply(
        lambda titles, wanted=category_str: wanted in titles
    )
businesses.head(3)

Unnamed: 0,business_id,name,categories,stars,review_count,address,city,state,longitude,latitude,...,is_localflavor,is_localservices,is_massmedia,is_nightlife,is_pets,is_professional,is_publicservicesgovt,is_religiousorgs,is_restaurants,is_shopping
0,N3_Gs3DnX4k9SgpwJxdEfw,Lane Wells Jewelry Repair,"Shopping, Jewelry Repair, Appraisal Services, ...",5,30,"7801 N Lamar Blvd, Ste A140",Austin,TX,-97.711458,30.346169,...,False,True,False,False,False,False,False,False,False,True
1,NVfOn7TdnHbaGH97CVB_Qg,McKinley Chiropractic,"Chiropractors, Health & Medical",5,5,"5625 Eiger Rd, Ste 160",Austin,TX,-97.857409,30.244902,...,False,False,False,False,False,False,False,False,False,False
2,Xw8tuI30T-xihpzwBV-zJg,El Pollo Rey,"Food Trucks, Restaurants, Specialty Food, Food...",5,11,1725 E Riverside Dr,Austin,TX,-97.730141,30.243493,...,False,False,False,False,False,False,False,False,True,False


## 3.2. Final Merged Counts Dataframes

In [10]:
# Attach the business attributes to the counts table; the inner join keeps only
# business_ids present in both frames
relationship_df_final = relationship_df.merge(businesses, how="inner", on="business_id")
relationship_df_final.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,is_localflavor,is_localservices,is_massmedia,is_nightlife,is_pets,is_professional,is_publicservicesgovt,is_religiousorgs,is_restaurants,is_shopping
0,8zehGz9jnxPqXtOc7KaJxA,931,187,2,0,0,1,16,1,0,...,False,False,False,False,False,False,False,False,True,False
1,i9BDFBYcl_PGqrLbQUdMvg,207,22,0,0,0,0,3,0,0,...,False,False,False,False,False,False,False,False,True,False
2,GZgOQOlFVLgNMHBBjspHRw,43,1,0,0,0,0,0,0,0,...,False,False,False,True,False,False,False,False,True,False


In [11]:
# Attach the business attributes to the normalized counts table; the inner join
# keeps only business_ids present in both frames
relationship_df_norm_final = relationship_df_norm.merge(businesses, how="inner", on="business_id")
relationship_df_norm_final.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,is_localflavor,is_localservices,is_massmedia,is_nightlife,is_pets,is_professional,is_publicservicesgovt,is_religiousorgs,is_restaurants,is_shopping
0,8zehGz9jnxPqXtOc7KaJxA,931,200.859291,2.148228,0.0,0.0,1.074114,17.185822,1.074114,0.0,...,False,False,False,False,False,False,False,False,True,False
1,i9BDFBYcl_PGqrLbQUdMvg,207,106.280193,0.0,0.0,0.0,0.0,14.492754,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,GZgOQOlFVLgNMHBBjspHRw,43,23.255814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,True,False,False,False,False,True,False


In [12]:
# Write final CSV files: one for the merged counts, one for the merged normalized counts.
# index=False leaves the DataFrame index out of the written files.
final_outputs = [
    (relationship_df_final, "_counts_final_v2.csv"),
    (relationship_df_norm_final, "_counts_norm_final_v2.csv"),
]
for frame, suffix in final_outputs:
    frame.to_csv("output_urbcomp/" + metro + suffix, index=False)