# Data Comprehensiveness

Problem: We need a comprehensive set of queries for the Google Maps scraper to extract an exhaustive list of activities from Berkeley

Get data scraper running locally:
https://github.com/gosom/google-maps-scraper

Cities | Categories
--- | ---
cities.csv | categories.csv

Desired categories need to be specific (i.e. no atms, religious institutions) and exhaustive (i.e. all restaurant subtypes).
Queries will be generated through cross-matching '[category] near [city], CA'.

In [315]:
# Imports
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
from itertools import product

InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows', 4000)

In [298]:
categories = pd.read_table('categories.csv', header=None)
categories = categories.rename(columns={0:'Category'})
categories.insert(1, "Primary", 0, True)
categories.insert(2, "Secondary", 0, True)
categories.insert(3, "Remove", 0, True)
categories.insert(4, "Unclassified", 1, True)
categories

Unnamed: 0,Category,Primary,Secondary,Remove,Unclassified
0,3d printing service,0,0,0,1
1,abarth dealer,0,0,0,1
2,abbey,0,0,0,1
3,aboriginal and torres strait islander organisa...,0,0,0,1
4,aboriginal art gallery,0,0,0,1
...,...,...,...,...,...
5497,youth organization,0,0,0,1
5498,youth social services organization,0,0,0,1
5499,yucatan restaurant,0,0,0,1
5500,zhejiang restaurant,0,0,0,1


In [299]:
samples_count = {'Restaurant': len(categories[categories["Category"].str.contains("restaurant")]),
                'Stop': len(categories[categories["Category"].str.contains("stop")]),
                'Company': len(categories[categories["Category"].str.contains("company")]),
                'Store': len(categories[categories["Category"].str.contains("store")]),
                'Shop':len(categories[categories["Category"].str.contains("store")]),
                'Station': len(categories[categories["Category"].str.contains("station")]),
                'Service': len(categories[categories["Category"].str.contains("service")]),
                'Agency': len(categories[categories["Category"].str.contains("agency")]),}
samples_count

{'Restaurant': 367,
 'Stop': 8,
 'Company': 99,
 'Store': 379,
 'Shop': 379,
 'Station': 45,
 'Service': 470,
 'Agency': 90}

In [307]:
# Basics: remove all obvious stupid options: parkING (issue), therapist
# Note: car rental agencies removed
# School a bit of a grey area (i.e. ski school) - removing for now
# SERVICE, CENTER

remove_terms = ["company", "parking", "car ", "agency", "church", "firm", "dealer", "auction", "manufactur", 
                "school", "bank", "atm", "facility", "hostel", "clinic", "supplier", "religious", "cleaning",
               "group", "ist", "academy", "tant", "department", "building", "information", "bureau", "office",
               "police", "station", "air", "service", "ag", "broker", "factory", "centre"]
secondary_terms = ["shop", "store", "boutique", "ceramics", "charcuterie", "butcher", "clothing", "cosmetics", "couture",
                  "delicatessen", "fashion", "flea market", "market", "nail salon", "patisserie", "textiles"]
primary_terms = ["restaurant", "club", "museum", "class", " bar", " park", "adventure", "studio", "rental",
                "tour", "theater", "hall", "arena", "studio", "venue", "bakery", " field", "badminton",
                "basketball", "baseball", "squash", "tennis", "cafe", "coffee shop", "cinema", "diner", "race"]
terms_group = {"Remove": remove_terms, "Secondary": secondary_terms, "Primary": primary_terms}

for col, content in terms_group.items():
    for term in content: # get each word
        categories.loc[categories["Category"].str.contains(term), col] = 1 # set to 1 in proper category
        categories.loc[categories["Category"].str.contains(term), "Unclassified"] = 0 # set to 0 in unclassified
        for other_col in terms_group.keys():
            if other_col != col:
                categories.loc[categories["Category"].str.contains(term), other_col] = 0 # 0 for rest

manual_terms_raw = """spa, spa and health club, wellness hotel, pub, rock climbing, sauna, gay sauna, ski, water ski, stadium, swimming pool,
                     aquarium, boxing ring, boxing gym, boxing club, botanical garden, campground, castle, culture, dancing, exhibit, facial spa, 
                     football pitch, fraternal organization, gambling house, garden, golf course, golf driving range, horse riding, ice skating rink, 
                     irish pub, mountain bike, nightlife, pier, planetarium, pub, sauna, ski, ski jumping hill, surf spot, town, wellness, winery,
                     wine cellar, zoo, bar, park, escape room center, amusement center, garden center, historic city center, indoor snowcenter, 
                     laser tag center, meditation center, recreation center, skydiving center, wilderness center, art center, aerial sports center, 
                     amphitheater, magician, massage, shooting range, rodeo, shooting range, ballroom, bbq area, beach volleyball court, beer garden, 
                     biking trail, bmx track, bocce ball court, casino, festival"""
manual_terms = manual_terms_raw.replace("\n                     ", "").split(", ")
for term in manual_terms: # get each word
    categories.loc[categories["Category"].str.contains(term), "Primary"] = 1 # set to 1 in proper category
    categories.loc[categories["Category"].str.contains(term), "Unclassified"] = 0 # set to 0 in unclassified
    for other_col in terms_group.keys():
        if other_col != "Primary":
            categories.loc[categories["Category"].str.contains(term), other_col] = 0 # 0 for rest


In [312]:
CHECK_WORD = "rodeo"
categories[categories["Category"].str.contains(CHECK_WORD)]
len(categories[categories["Category"].str.contains(CHECK_WORD)])

Unnamed: 0,Category,Primary,Secondary,Remove,Unclassified
4198,rodeo,1,0,0,0


1

In [310]:
len(categories.loc[categories["Primary"] == 1])
categories.loc[categories["Primary"] == 1]

1044

Unnamed: 0,Category,Primary,Secondary,Remove,Unclassified
1,abarth dealer,1,0,0,0
9,acaraje restaurant,1,0,0,0
34,adult entertainment club,1,0,0,0
37,adventure sports,1,0,0,0
38,adventure sports center,1,0,0,0
39,adventure sports centre,1,0,0,0
45,aerial sports center,1,0,0,0
46,aero dance class,1,0,0,0
48,aeroclub,1,0,0,0
52,aerospace company,1,0,0,0


In [317]:
cities = pd.read_table('cities.csv', header=None)
cities = cities.rename(columns={0:'City'})

combinations = product(categories['Category'], cities['City'])
df_combinations = pd.DataFrame([f'{activity} in {city}' for activity, city in combinations], columns=['Activity in City'])
df_combinations

Unnamed: 0,Activity in City
0,3d printing service in San Francisco
1,3d printing service in Palo Alto
2,3d printing service in San Jose
3,3d printing service in Mountain View
4,3d printing service in Sunnyvale
...,...
1034371,zoo in Bethel Island
1034372,zoo in Loma Mar
1034373,zoo in Stewarts Point
1034374,zoo in Discovery Bay


Side note: this might be stupid ah

In [318]:
df_combinations.to_csv('activity_in_city.csv', index=False) 