In [1]:
import sys
import csv

from pathlib import Path

import psycopg2
from psycopg2 import sql

import pandas as pd

sys.path.append('../../')
from utils import get_config

In [2]:
# Directory that holds the category term lists used by this notebook.
resource_p = Path("../resources")

# "Unhealthy" food businesses: partial-match terms, plus the noise terms
# that are used to prune false hits from those matches.
hdf_p = resource_p.joinpath("partial_unhealthy_food_biz.csv")
hdf_noise_p = resource_p.joinpath("noise_unhealthy_food_biz.csv")

# All food businesses: the same pair of lists for the broader definition.
all_food_p = resource_p.joinpath("partial_all_food_biz.csv")
noise_all_p = resource_p.joinpath("noise_all_food_biz.csv")

In [3]:
def read_categories_csv(p):
    """Return the first column of the CSV file at ``p`` as a list of strings.

    Blank lines and rows whose first cell is empty are skipped: the returned
    terms are later joined with ``|`` into a regular expression, and an empty
    alternative would match every category.

    Parameters
    ----------
    p : str or path-like
        Location of a CSV file without a header row.

    Returns
    -------
    list of str
        First-column values in file order.
    """
    # newline="" lets the csv module handle line endings itself, as its
    # documentation requires; utf-8 matches pandas' default when writing.
    with open(p, newline="", encoding="utf-8") as csvfile:
        return [row[0] for row in csv.reader(csvfile) if row and row[0]]

In [4]:
# Credentials are looked up through the project's get_config helper
# instead of being written into the notebook.
nourish_user = get_config("nourish_db", "username")
nourish_pswd = get_config("nourish_db", "passkey")

# Connection shared by the query cells further down.
db_settings = {
    "host": "awesome-hw.sdsc.edu",
    "database": "nourish",
    "user": nourish_user,
    "password": nourish_pswd,
}
conn = psycopg2.connect(**db_settings)

<configparser.ConfigParser object at 0x11d30ef10>
<configparser.ConfigParser object at 0x11d30eeb0>


In [5]:
# Count how many businesses carry each category label, most common first.
qry = """
WITH unnested_cats AS (SELECT unnest(categories) AS cats
FROM ca_business)
SELECT cats, count(cats) as category_count
FROM unnested_cats
GROUP BY cats
ORDER BY category_count DESC;
"""

# `with conn` ends the transaction when the block exits (rolling back on
# error) so a failed query does not leave the shared connection in an
# aborted state; the cursor context manager closes the cursor either way.
with conn, conn.cursor() as cur:
    cur.execute(qry)
    res = cur.fetchall()

# One row per category with its number of occurrences.
categories_df = pd.DataFrame(res, columns=["categories", "category_count"])
categories_df

Unnamed: 0,categories,category_count
0,Restaurant,38523
1,Fast food restaurant,15560
2,Mexican restaurant,13969
3,Auto repair shop,13163
4,Beauty salon,10758
...,...,...
4015,Regional council,1
4016,Residential area,1
4017,Sheepskin and wool products supplier,1
4018,Ladder supplier,1


In [6]:
# Load the partial-match terms for "unhealthy" food businesses and the noise
# terms used to discard false positives.
hdf_list = read_categories_csv(hdf_p)
hdf_noise_list = read_categories_csv(hdf_noise_p)

# The terms are OR-ed into one case-insensitive regular expression
# (https://stackoverflow.com/a/37864171). They are NOT escaped, so regex
# metacharacters in a term keep their regex meaning.
# Empty terms are left out: an empty alternative matches every category.
hdf_pattern = '|'.join(t for t in hdf_list if t)
hdf_noise_pattern = '|'.join(t for t in hdf_noise_list if t)

# "Bar" is additionally matched exactly.
m = categories_df["categories"] == "Bar"
if hdf_pattern:
    # na=False: a missing category name counts as "no match" instead of
    # putting NaN into the boolean mask.
    m = m | categories_df["categories"].str.contains(hdf_pattern, case=False, na=False)
filtered_hpf = categories_df.loc[m]

# Remove noisy hits (e.g. from matching on "bar"). Skipped when there are no
# noise terms, because an empty pattern would remove every row.
if hdf_noise_pattern:
    filtered_hpf = filtered_hpf[~filtered_hpf["categories"].str.contains(hdf_noise_pattern, case=False, na=False)]

filtered_hpf["categories"].to_csv(resource_p / "exact_unhealthy_categories.csv", index=False)

filtered_hpf.head()

Unnamed: 0,categories,category_count
1,Fast food restaurant,15560
16,Bar,8184
18,Pizza restaurant,7898
21,Hamburger restaurant,7281
22,Convenience store,7107


In [7]:
# Build the list of partial-match terms covering every food business.
all_food_list = read_categories_csv(all_food_p)
noise_list = read_categories_csv(noise_all_p)

# Combine with the unhealthy terms and drop duplicates. dict.fromkeys keeps
# first-seen order, so the list (and any file written from it) is identical
# from run to run, unlike list(set(...)).
all_food_list = list(dict.fromkeys(all_food_list + hdf_list))

# The terms are OR-ed into one case-insensitive regular expression. They are
# NOT escaped, so regex metacharacters in a term keep their regex meaning.
# Empty terms are left out: an empty alternative matches every category.
all_food_pattern = '|'.join(t for t in all_food_list if t)
noise_pattern = '|'.join(t for t in noise_list if t)

# "Bar" is additionally matched exactly.
m = categories_df["categories"] == "Bar"
if all_food_pattern:
    # na=False: a missing category name counts as "no match" instead of
    # putting NaN into the boolean mask.
    m = m | categories_df["categories"].str.contains(all_food_pattern, case=False, na=False)
filtered_food = categories_df.loc[m]

# Remove noisy hits (e.g. from matching on "bar"). Skipped when there are no
# noise terms, because an empty pattern would remove every row.
if noise_pattern:
    filtered_food = filtered_food[~filtered_food["categories"].str.contains(noise_pattern, case=False, na=False)]

filtered_food["categories"].to_csv(resource_p / "exact_food_categories.csv", index=False)

filtered_food

  m = (categories_df["categories"].str.contains('|'.join(all_food_list), case=False)) | (categories_df["categories"] == "Bar")


Unnamed: 0,categories,category_count
0,Restaurant,38523
1,Fast food restaurant,15560
2,Mexican restaurant,13969
5,Coffee shop,10487
7,Grocery store,10070
...,...,...
3952,Industrial supermarket,1
3962,Anago restaurant,1
3982,Chanko restaurant,1
3993,Alsace restaurant,1


In [8]:
# Write the term lists back to the CSV files they were loaded from, reusing
# the path variables defined at the top of the notebook so the read and write
# locations cannot drift apart. Note that all_food_list now also contains the
# unhealthy terms that were merged into it earlier.
def _write_terms_csv(terms, path):
    """Write ``terms`` to ``path`` as a one-column CSV with no header or index.

    Parameters
    ----------
    terms : list of str
        Values to write, one per line.
    path : str or path-like
        Destination file; overwritten if it exists.

    Returns
    -------
    pandas.DataFrame
        The frame that was written (single column named "categories").
    """
    terms_df = pd.DataFrame(terms).rename(columns={0: "categories"})
    terms_df.to_csv(path, index=False, header=False)
    return terms_df


_write_terms_csv(all_food_list, all_food_p)
_write_terms_csv(noise_list, noise_all_p)

# partial_df / noise_df end up bound to the unhealthy-term frames, exactly as
# they were after the original version of this cell.
partial_df = _write_terms_csv(hdf_list, hdf_p)
noise_df = _write_terms_csv(hdf_noise_list, hdf_noise_p)

In [9]:
# Build the WHERE clause: one "<category> = any(categories)" test per
# unhealthy category, OR-ed together.
unhealthy_categories = filtered_hpf["categories"].tolist()
if not unhealthy_categories:
    raise ValueError("filtered_hpf contains no categories; cannot build the WHERE clause")

# sql.Literal quotes each category name for the target connection, so names
# containing an apostrophe (or other special characters) cannot break the
# statement. Joining with " OR " avoids having to strip a trailing separator.
qry_where_list = [
    sql.SQL("{} = any(categories)").format(sql.Literal(c))
    for c in unhealthy_categories
]
qry_where = "WHERE (" + sql.SQL(" OR ").join(qry_where_list).as_string(conn) + ")"

# Count the rows of ca_business whose name equals the name of at least one
# business carrying one of the unhealthy categories.
qry = "WITH store_names AS (SELECT DISTINCT name AS dist_names " \
      "FROM ca_business " \
      f"{qry_where}) " \
      "SELECT COUNT(*) " \
      "as count " \
      "from ca_business inner join store_names on ca_business.name = store_names.dist_names;"
qry

"WITH store_names AS (SELECT DISTINCT name AS dist_names FROM ca_business WHERE ('Fast food restaurant' = any(categories) OR 'Bar' = any(categories) OR 'Pizza restaurant' = any(categories) OR 'Hamburger restaurant' = any(categories) OR 'Convenience store' = any(categories) OR 'Liquor store' = any(categories) OR 'Ice cream shop' = any(categories) OR 'Dessert shop' = any(categories) OR 'Pizza delivery' = any(categories) OR 'Bar & grill' = any(categories) OR 'Pizza Takeout' = any(categories) OR 'Chicken wings restaurant' = any(categories) OR 'Donut shop' = any(categories) OR 'Cocktail bar' = any(categories) OR 'Wine bar' = any(categories) OR 'Wine store' = any(categories) OR 'Sports bar' = any(categories) OR 'Brewery' = any(categories) OR 'Beer store' = any(categories) OR 'Candy store' = any(categories) OR 'Dessert restaurant' = any(categories) OR 'Chocolate shop' = any(categories) OR 'Hot dog restaurant' = any(categories) OR 'Brewpub' = any(categories) OR 'Pastry shop' = any(categories) 

In [10]:
# Run the query held in `qry` and fetch its rows.
# `with conn` ends the transaction when the block exits (rolling back on
# error) so a failed query does not leave the shared connection in an
# aborted state; the cursor context manager closes the cursor either way.
with conn, conn.cursor() as cur:
    cur.execute(qry)
    res = cur.fetchall()

res

[(70914,)]