# Parsing and Processing Lookup Responses

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import gzip
import json
from datetime import datetime

import multiprocess
import numpy as np
from tqdm import tqdm
import pandas as pd

from config import (
    inc_city_att, 
    inc_city_cl, 
    inc_city_verizon, 
    inc_city_el,
    inc_city_hughes
)
from parsers import (
    cl_workflow, 
    att_workflow, 
    verizon_workflow, 
    el_workflow, 
    hughes_workflow,
    get_incorporated_places, 
    check_redlining, 
    get_holc_grade, 
    get_closest_fiber
)

In [3]:
# --- inputs ---
# Census features table produced by the previous notebook.
fn_acs = "../data/intermediary/census/aggregated_tables_plus_features.csv.gz"
# Glob patterns matching all data collected from each ISP's lookup tool.
pattern_hughes = "../data/intermediary/isp/hughes/*/*.geojson.gz"
pattern_xfinity = "../data/intermediary/isp/xfinity/*/*.geojson.gz"
pattern_viastat = "../data/intermediary/isp/viastat/*/*.geojson.gz"

# --- outputs ---
# One merged speed/price table per ISP.
fn_hughes = "../data/output/speed_price_hughes.csv.gz"
fn_xfinity = "../data/output/speed_price_xfinity.csv.gz"
fn_viastat = "../data/output/speed_price_viastat.csv.gz"

# --- params ---
n_jobs = 20          # size of the multiprocessing pools used below
recalculate = False  # set True to rebuild an output even if its file already exists

In [4]:
# Census columns that get merged onto the lookup responses.
acs_cols = [
    'geoid',
    'race_perc_non_white',
    'income_lmi',
    'ppl_per_sq_mile',
    'n_providers',
    'income_dollars_below_median',
    'internet_perc_broadband',
    'median_household_income',
]

# Census data crunched in the previous notebook. The identifier columns are
# read as strings rather than letting pandas infer a numeric dtype.
acs = pd.read_csv(
    fn_acs,
    dtype={'geoid': str, 'block_group': str},
)

## Total data collected

In [5]:
def count_addresses(fn):
    """
    How many addresses did we successfully collect in a file?

    Every non-blank line of the gzip-compressed file `fn` is parsed as one
    JSON record and counted.

    Parameters
    ----------
    fn : str
        Path to a gzip-compressed file holding one JSON record per line.

    Returns
    -------
    int
        Number of JSON records found in the file (0 for an empty file).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Kept as function-local imports, as in the original cell -- presumably so
    # the function is self-contained when shipped to a worker process.
    import gzip
    import json

    count = 0
    with gzip.open(fn, 'rb') as f:
        # Stream the file line by line rather than materialising every line
        # in memory with readlines().
        for line in f:
            if not line.strip():
                # A blank line (e.g. a trailing newline) is not a record.
                continue
            json.loads(line)  # validate the record; raises on malformed JSON
            count += 1
    return count

def count_successful_addresses(pattern, n_jobs=20):
    """
    Total number of addresses collected across every file matching `pattern`.

    Each matching file is counted with `count_addresses` in a pool of
    `n_jobs` spawned worker processes; a progress bar tracks the files.
    """
    matched = glob.glob(pattern)
    spawn_ctx = multiprocess.get_context("spawn")
    with spawn_ctx.Pool(n_jobs) as workers:
        per_file = workers.imap_unordered(count_addresses, matched)
        # Consume the results inside the `with` block, while the pool is alive.
        return sum(tqdm(per_file, total=len(matched)))

In [6]:
# Count the collected addresses per ISP (in this order), then overall.
hughes_count, xfinity_count, viastat_count = [
    count_successful_addresses(isp_pattern, n_jobs=n_jobs)
    for isp_pattern in (pattern_hughes, pattern_xfinity, pattern_viastat)
]
all_records = sum((hughes_count, xfinity_count, viastat_count))

print(f"""Hughes Net: {hughes_count}
Xfinity: {xfinity_count}
ViaStat: {viastat_count}
Total: {all_records}""")

100%|██████████| 14/14 [00:00<00:00, 25.95it/s]
100%|██████████| 4/4 [00:00<00:00, 26.47it/s]
0it [00:00, ?it/s]

Hughes Net: 14
Xfinity: 4
ViaStat: 0
Total: 18





## Functions we're going to be using

`check_redlining` assigns HOLC grades by checking whether an address's coordinates (converted to a Shapely `Point`) fall within the `Polygon`s of the redlining maps from Mapping Inequality. The actual check is done by `get_holc_grade`.

In [7]:
??get_holc_grade

[1;31mSignature:[0m [0mget_holc_grade[0m[1;33m([0m[0mrow[0m[1;33m:[0m [0mdict[0m[1;33m,[0m [0mpolygons[0m[1;33m:[0m [0mlist[0m[1;33m)[0m [1;33m->[0m [0mstr[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
[1;32mdef[0m [0mget_holc_grade[0m[1;33m([0m[0mrow[0m[1;33m:[0m [0mdict[0m[1;33m,[0m [1;33m
[0m                   [0mpolygons[0m[1;33m:[0m [0mlist[0m[1;33m)[0m [1;33m->[0m [0mstr[0m[1;33m:[0m[1;33m
[0m    [1;34m"""
    Converts any lat and lon in a dictionary into a shapely point,
    then iterate through a list of dictionaries containing 
    shapely polygons shapes for each HOLC-graded area.
    """[0m[1;33m
[0m    [0mpoint[0m [1;33m=[0m [0mPoint[0m[1;33m([0m[0mfloat[0m[1;33m([0m[0mrow[0m[1;33m[[0m[1;34m'lon'[0m[1;33m][0m[1;33m)[0m[1;33m,[0m [0mfloat[0m[1;33m([0m[0mrow[0m[1;33m[[0m[1;34m'lat'[0m[1;33m][0m[1;33m)[0m[1;33m)[0m[1;33m
[0m    [1;32mfor[0m [0mpolygon[0m [1;32min[

## Hughes Net

In [8]:
# Running list of `state` values seen in the parsed data; extended further down.
states: list = []

In [9]:
# debugging

# if not os.path.exists(fn_hughes) or recalculate:
#     # find the data we collected for each block group.
#     data_hughes = []
#     files = glob.glob(pattern_hughes)[0:1]
#     with multiprocess.Pool(n_jobs) as pool:
#         # create parallel jobs that parse each block group of data using `hughes_workflow`.
#         for record in tqdm(pool.imap_unordered(hughes_workflow, files), 
#                            total=len(files)):
#             res = record
    

In [11]:
# Build the Hughes Net speed/price table, or load the cached copy when it
# already exists and `recalculate` is False.
if not os.path.exists(fn_hughes) or recalculate:
    # find the data we collected for each block group.
    data_hughes = []
    files = glob.glob(pattern_hughes)
    with multiprocess.Pool(n_jobs) as pool:
        # create parallel jobs that parse each block group of data using `hughes_workflow`.
        for record in tqdm(pool.imap_unordered(hughes_workflow, files),
                           total=len(files)):
            data_hughes.extend(record)
    hughes = pd.DataFrame(data_hughes)
    del data_hughes  # the raw records are no longer needed once framed

    # Normalise block group IDs to 12-digit, zero-padded strings so they can
    # be matched against the string `geoid` column of the census table.
    hughes['block_group'] = hughes['block_group'].apply(lambda x: f"{int(x):012d}")

    # check HOLC-grades for each address, and the distance to download speeds at or above 200 Mbps
    hughes = check_redlining(hughes)
    # merge census data, and save the file
    hughes_acs = hughes.merge(acs[acs_cols], how='left',
                              left_on='block_group', right_on='geoid')
    # `geoid` duplicates `block_group` after the merge, so drop it.
    hughes_acs = hughes_acs[[c for c in hughes_acs.columns if c != 'geoid']]
    hughes_acs.to_csv(fn_hughes, index=False, compression='gzip')
else:
    # Read `block_group` back as a string. Without an explicit dtype pandas
    # infers an integer and strips the leading zeros, so the cached frame would
    # differ from the freshly computed one above.
    hughes_acs = pd.read_csv(fn_hughes, dtype={'block_group': str})

100%|██████████| 14/14 [00:09<00:00,  1.52it/s]
100%|██████████| 1/1 [00:00<00:00, 25.31it/s]


In [12]:
# start and end collection datetime
[datetime.fromtimestamp(hughes_acs.collection_datetime.min()), 
 datetime.fromtimestamp(hughes_acs.collection_datetime.max())]

[datetime.datetime(2023, 12, 4, 14, 2, 7, 492361),
 datetime.datetime(2023, 12, 4, 14, 4, 8, 780826)]

In [13]:
# number of distinct states present in the Hughes Net table
hughes_acs['state'].nunique()

1

In [14]:
# number of rows in the Hughes Net table
len(hughes_acs)

14

In [15]:
# Add the states seen in the Hughes Net table to the running `states` list.
# NOTE(review): `extend` appends, so re-running this cell adds the same values again.
states.extend(hughes_acs['state'].unique())

In [16]:
# Share of addresses in each redlining grade. `value_counts` drops missing values
# by default, so the proportions are among rows that have a grade.
hughes_acs.redlining_grade.value_counts(normalize=True)

redlining_grade
C    0.583333
D    0.333333
B    0.083333
Name: proportion, dtype: float64