In [1]:
import os
import numpy as np
import pandas as pd
import re
import praw
from datetime import datetime
import pickle
import datetime as dt
from psaw import PushshiftAPI
# Pushshift client (psaw). Not referenced by any cell visible here —
# presumably used by later cells; TODO confirm before removing.
api = PushshiftAPI()

In [2]:
# Reddit API credentials are read from the environment so they are never
# stored in the notebook itself; each value is None when its variable is unset.
PRAW_CLIENT_ID, PRAW_CLIENT_SECRET, PRAW_USER_AGENT = map(
    os.getenv,
    ('PRAW_CLIENT_ID', 'PRAW_CLIENT_SECRET', 'PRAW_USER_AGENT'),
)

# praw client built from the credentials above.
reddit = praw.Reddit(
    user_agent=PRAW_USER_AGENT,
    client_secret=PRAW_CLIENT_SECRET,
    client_id=PRAW_CLIENT_ID,
)

In [3]:
# Census FIPS geocode reference table, used to filter rows by geographic level.
# Source: https://www.census.gov/geographies/reference-files/2016/demo/popest/2016-fips.html
# (the published .xlsx was converted to .csv in Excel/Google Sheets beforehand).
df_geo = pd.read_csv(
    '../data/sources/all-geocodes-v2016.csv',
    skiprows=4,  # the first four lines of the file are skipped; the header follows them
)

# City population table: "Annual Estimates of the Resident Population for
# Incorporated Places of 50,000 or More, Ranked by July 1, 2018 Population:
# April 1, 2010 to July 1, 2018".
# Sources:
#   https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-cities-and-towns.html
#   https://factfinder.census.gov/bkmk/table/1.0/en/PEP/2018/PEPANNRSIP.US12A
df_cities_pop = pd.read_csv(
    '../data/sources/cities_pop.csv',
    skiprows=1,  # the first line of the file is skipped; the header follows it
)

In [4]:
# Normalise the headers of both tables: every space becomes an underscore,
# which is the naming the following cells rely on when selecting columns.
df_geo.columns = list(
    map(lambda header: header.replace(' ', '_'), df_geo.columns)
)
df_cities_pop.columns = list(
    map(lambda header: header.replace(' ', '_'), df_cities_pop.columns)
)

In [5]:
# Keep only the geo identifier, the display name and the two population figures,
# and make the identifier a string so it can be split with a regex.
df_cities_pop = (
    df_cities_pop
    .loc[:, [
        'Target_Geo_Id2',
        'Geography.2',
        'April_1,_2010_-_Census',
        'Population_Estimate_(as_of_July_1)_-_2018',
    ]]
    .astype({'Target_Geo_Id2': str})
)

# Target_Geo_Id2 looks like 2679000: the leading 1-2 digits are the state code
# and the trailing 5 digits are the city code. Both parts are stored as ints
# so they can later be matched against the geocode table.
fips_parts = df_cities_pop['Target_Geo_Id2'].str.extract(r'(\d{1,2})(\d{5})$', expand=True)
df_cities_pop = df_cities_pop.assign(
    state_fip=fips_parts[0].astype(int),
    city_fip=fips_parts[1].astype(int),
)

# Rows of the geocode table at summary level 162 whose area name contains " city"
# (162 is presumably the Census level for incorporated places — TODO confirm).
is_level_162 = df_geo['Summary_Level'] == 162
has_city_in_name = df_geo['Area_Name_(including_legal/statistical_area_description)'].str.contains(' city')
df_cities = df_geo[is_level_162 & has_city_in_name].rename(
    columns={'Area_Name_(including_legal/statistical_area_description)': 'city'}
)

In [6]:
# Strip the trailing " city" designation (case-insensitive),
# e.g. "Santa Cruz city" -> "Santa Cruz".
regex_pat = re.compile(r' city$', flags=re.IGNORECASE)
# regex=True must be explicit: since pandas 2.0 Series.str.replace defaults to
# regex=False, and a compiled pattern combined with regex=False raises ValueError.
df_cities['city_short'] = df_cities['city'].str.replace(regex_pat, '', regex=True)

In [7]:
# Attach the population figures to each city by matching state and place FIPS
# codes. This is an inner join (the merge default), so cities present in only
# one of the two tables are dropped.
df = df_cities.merge(
    df_cities_pop,
    left_on=['State_Code_(FIPS)', 'Place_Code_(FIPS)'],
    right_on=['state_fip', 'city_fip'],
)

In [8]:
# Look up a candidate subreddit for every city via Reddit's subreddit search:
# https://www.reddit.com/dev/api/#POST_api_search_subreddits
# Only the first search result is considered, and it is kept only when it has
# more than 1000 subscribers. One request is issued per row of `df`.

sub_city_dict = {}
for index, row in df.iterrows():
    city_short = row['city_short'].strip()
    state_city_id = row['Target_Geo_Id2'].strip()
    city_state = row['Geography.2']

    sub = reddit.post('api/search_subreddits', data={'query': city_short})

    matches = sub['subreddits']
    if len(matches) == 0:
        # The search returned nothing for this city name.
        continue

    top_match = matches[0]
    if not (top_match['subscriber_count'] > 1000):
        # First hit is too small to be kept.
        continue

    # Keyed by the state+city geo id; value order is relied on when the
    # dictionary is later turned into a DataFrame.
    sub_city_dict[state_city_id] = [
        city_short,
        city_state,
        top_match['name'].strip(),
        top_match['subscriber_count'],
    ]

In [9]:
# Turn the {geo id: [city, city/state, subreddit, subscribers]} mapping into a
# table: the dict keys become the index, which is then moved into a regular
# column and every positional column is given a descriptive name.
df_city_subs = (
    pd.DataFrame.from_dict(sub_city_dict, orient='index')
    .reset_index()
    .rename(
        columns={
            'index': 'state_city_id',
            0: 'city_short',
            1: 'city_state',
            2: 'city_sub',
            3: 'sub_cnt',
        }
    )
)

Note: the retrieved subreddits may not actually correspond to the intended city, so they need to be checked manually later.

In [10]:
# Persist the raw (not yet manually verified) city -> subreddit table.
# The context manager guarantees the file handle is flushed and closed,
# instead of leaving that to garbage collection.
with open('../data/df_city_subs_raw.pkl', 'wb') as pkl_file:
    pickle.dump(df_city_subs, pkl_file)