### Scrape All ISPs

In [1]:
import pandas as pd
import gzip
import json
from pandas_geojson import to_geojson
from hughes_utils import get_hughes_offer_data
from xfinity_utils import get_xfinity_offer_data
from viastat_utils import get_viastat_offer_data
import os

In [2]:
# Notebook configuration: the city being scraped and its address file.
CITY = "Boston"
INPUT_CSV_PATH = "../data/open_address/processed/csv/city_of_boston-addresses-city.csv.gz"
# Set the flag of the person running this scrape to True (others False).
JULIA, YASH, ABBY = True, False, False


In [3]:
# Load the address records for the configured city and clean them in a single
# chain, so re-running this cell always rebuilds city_df from the file.
city_df = (
    # every column is read as a string
    pd.read_csv(INPUT_CSV_PATH, dtype='str')
    # drop rows that have no postcode
    .dropna(subset=['postcode'])
    # when a house-number range is given (e.g. "12-14") keep only the first
    # number; the .str accessor leaves a missing house number as NaN instead of
    # raising AttributeError the way x.split('-') inside apply() would
    .assign(number=lambda df: df['number'].str.split('-').str[0])
    # lat and lon were read as strings; cast them to float
    .astype({'lat': float, 'lon': float})
)
city_df.head(5)

Unnamed: 0,hash,number,street,unit,city,district,region,postcode,id,incorporated_place,state,lon,lat,block_group,geoid
0,d2aec566d471e747,16,Creighton Street,,Boston,Suffolk County,,2130,,Boston city,MA,-71.10852,42.32296,1,250251207001
1,e3a1718dfa7ad40a,78,Train Street,1,Boston,Suffolk County,,2122,,Boston city,MA,-71.05414,42.29319,1,250251006011
2,175f2c0ed8d4b617,8,Garrison Street,408,Boston,Suffolk County,,2116,,Boston city,MA,-71.07964,42.34615,1,250250106001
3,7a0fb8b823b6d0d3,250,Commonwealth Avenue,12,Boston,Suffolk County,,2116,,Boston city,MA,-71.08285,42.35019,1,250250107011
4,042e012ebab63338,52,Columbia Road,Apt 8,Boston,Suffolk County,,2121,,Boston city,MA,-71.08279,42.30427,1,250250901001


In [4]:
# Each person scrapes their own slice of the address table and writes to their
# own output directories. Maps scraper name -> (flag, first row, end row);
# an end row of None means "through the last row".
_assignments = {
    'julia': (JULIA, 0, 250),
    'yash': (YASH, 250, 500),
    'abby': (ABBY, 500, None),
}

# Fail with a clear message instead of a NameError further down (no flag set)
# or silently letting the last enabled flag win (several flags set).
_selected = [name for name, assignment in _assignments.items() if assignment[0]]
if len(_selected) != 1:
    raise ValueError(
        "Set exactly one of JULIA, YASH, ABBY to True; currently enabled: "
        + (", ".join(_selected) or "none")
    )
_scraper = _selected[0]
_first_row, _end_row = _assignments[_scraper][1:]

scrape_addr_df = city_df[_first_row:_end_row]
# INDEX is used by the scrape loop to name each output file; it starts at the
# first row of this person's slice.
INDEX = _first_row

# The trailing slash matters: the scrape loop appends the file name directly.
_output_root = "../data/intermediary/scrape_isp/" + _scraper + "/"
hughes_output_path = _output_root + "hughes/"
xfinity_output_path = _output_root + "xfinity/"
viastat_output_path = _output_root + "viastat/"

# Create the output directories (including missing parents) if they are absent.
for _output_dir in (hughes_output_path, xfinity_output_path, viastat_output_path):
    os.makedirs(_output_dir, exist_ok=True)

Utility Functions

In [5]:
def df_to_geojson(df):
    """Convert an offer dataframe to GeoJSON data using ``to_geojson``.

    The ``lat`` and ``lon`` columns are passed as the latitude/longitude
    columns, and the columns listed below are passed as the properties.
    Returns whatever ``to_geojson`` returns.
    """
    property_columns = [
        'address_full',
        'incorporated_place',
        'state',
        'collection_datetime',
        'provider',
        'speed_down',
        'speed_up',
        'speed_unit',
        'price',
        'technology',
        'package',
        'fastest_speed_down',
        'fastest_speed_price',
        'block_group',
        'geoid',
    ]
    return to_geojson(df=df, lat='lat', lon='lon', properties=property_columns)

In [6]:
def compress_geojson(gejson_data, output_file):
    """Serialize ``gejson_data`` as JSON and write it gzip-compressed to ``output_file``.

    The text is written as UTF-8 and non-ASCII characters are kept as-is
    (``ensure_ascii=False``). Returns None.
    """
    serialized = json.dumps(gejson_data, ensure_ascii=False)
    with gzip.open(output_file, 'wt', encoding='utf-8') as gz_handle:
        gz_handle.write(serialized)

In [7]:
# Per-provider counts of addresses scraped successfully vs. unsuccessfully.
scrape_summary = {'hughes':{'success':0, 'fail':0}, 'xfinity':{'success':0, 'fail':0}, 'viastat':{'success':0, 'fail':0}}


def _scrape_and_save(provider, fetch_offer, output_path, file_index, block_group, geoid, verbose):
    """Scrape one provider for one address and save the offer as a .geojson.gz file.

    Parameters
    ----------
    provider : key of ``scrape_summary`` to update ('hughes', 'xfinity', 'viastat').
    fetch_offer : zero-argument callable that returns the provider's offer data.
    output_path : directory prefix (ending in '/') the output file is written under.
    file_index : number used as the output file name.
    block_group, geoid : values added as columns to the offer before saving.
    verbose : when True, the offer dataframe is printed after a success.

    Returns True on success and False on failure. A failure is counted and
    printed together with the exception, so it does not stop the scrape.
    """
    try:
        offer_df = pd.DataFrame(data=fetch_offer(), index=[0])
        offer_df['block_group'] = block_group
        offer_df['geoid'] = geoid
        compress_geojson(df_to_geojson(offer_df), output_path + str(file_index) + '.geojson.gz')
    except Exception as err:
        # Exception (not a bare except) so that interrupting the kernel still
        # stops the loop instead of being counted as a scrape failure.
        scrape_summary[provider]['fail'] += 1
        print('\t' + provider + ' fail: ' + repr(err))
        return False
    scrape_summary[provider]['success'] += 1
    print('\t' + provider + ' success')
    if verbose:
        print(offer_df)
    return True


for _, row in scrape_addr_df.iterrows():
    unit = '' if pd.isna(row['unit']) else row['unit']
    street = row['street']
    number = row['number']
    # The last word of the street is its type and everything before it is the
    # name. A plain slice is used because filtering with `word not in streetType`
    # is a substring test that wrongly drops words such as "St" in
    # "St Botolph Street". A one-word street yields an empty streetName.
    street_words = street.split()
    streetType = street_words[-1]
    streetName = " ".join(street_words[:-1])
    city = row['city']
    zipcode = row['postcode']
    state = row['state']
    lat = row['lat']
    lon = row['lon']
    block_group = row['block_group']
    geoid = row['geoid']
    print(INDEX)
    # Print the scraped dataframes for every fifth file index only.
    verbose = INDEX % 5 == 0

    _scrape_and_save(
        'hughes',
        lambda: get_hughes_offer_data(house_number=number, street_name=streetName, street_type=streetType, city=city, state=state, zip_code=str(zipcode), lat=lat, long=lon),
        hughes_output_path, INDEX, block_group, geoid, verbose,
    )
    _scrape_and_save(
        'xfinity',
        lambda: get_xfinity_offer_data(house_number=number, street_name=streetName, street_type=streetType, city=city, state=state, zip_code=zipcode, lat=lat, long=lon),
        xfinity_output_path, INDEX, block_group, geoid, verbose,
    )
    _scrape_and_save(
        'viastat',
        lambda: get_viastat_offer_data(unit, number+' '+streetName+' '+streetType, city, state, zipcode, lat, lon),
        viastat_output_path, INDEX, block_group, geoid, verbose,
    )

    # INDEX names the output files, so it advances once per address. It is a
    # notebook-level variable: re-run the cell that sets it before re-running
    # this loop, otherwise numbering continues from where the last run stopped.
    INDEX += 1

print(scrape_summary)

1
	hughes success
	xfinity fail
	viastat success
2
	hughes success
	xfinity success
	viastat success
3
	hughes success
	xfinity fail
	viastat success
4
	hughes success
	xfinity fail
	viastat success
5
	hughes success
                           address_full incorporated_place state       lat  \
0  52, Columbia Road, Boston, MA, 02121             Boston    MA  42.30427   

        lon  collection_datetime provider speed_down  speed_up speed_unit  \
0 -71.08279         1.701822e+09   hughes         15         3       Mbps   

   price technology package fastest_speed_down fastest_speed_price  \
0  49.99  Satellite   15 GB                 50              149.99   

  block_group         geoid  
0           1  250250901001  
	xfinity fail
	viastat success
                                 address_full incorporated_place state  \
0  Apt 8, 52 Columbia Road, Boston, MA, 02121             Boston    MA   

        lat       lon  collection_datetime provider speed_down       speed_up  \
0  42.304

In [9]:
# Show the per-provider success/fail counts accumulated in scrape_summary.
print(scrape_summary)

{'hughes': {'success': 250, 'fail': 0}, 'xfinity': {'success': 89, 'fail': 161}, 'viastat': {'success': 95, 'fail': 155}}
