### Scrape All ISP's

In [84]:
import pandas as pd

from hughes_utils import get_hughes_offer_data
from xfinity_utils import get_xfinity_offer_data
from viastat_utils import get_viastat_offer_data

In [85]:
# read in all data address data from each state
ct_df = pd.read_csv("city_of_hartford-addresses-city.csv.gz")
#de_df = pd.read_csv("../data/open_address/processed/city_of_dover-addresses-city.csv.gz")
# TODO: load in data for rest of cities

!pip install geopandas

In [9]:
ct_df

Unnamed: 0,id,unit,number,street,city,district,region,postcode,hash,geometry,incorporated_place,state,lon,lat,temp
0,,,1,CIVIC CENTER PLZ,Hartford,,,,3d6b7182020b27b1,POINT (-72.6771775 41.7684543),Hartford,CT,-72.677177,41.768454,"['', '']"
1,,,1,CIVIC CENTER PLZ,Hartford,,,,7d7e3e812f9143de,POINT (-72.6772222 41.7683543),Hartford,CT,-72.677222,41.768354,"['', '']"
2,,,80,STATE HOUSE SQ,Hartford,,,,8e0c8d2fd7daf0b4,POINT (-72.6720599 41.7663704),Hartford,CT,-72.672060,41.766370,"['', '']"
3,,,1,CIVIC CENTER PLZ,Hartford,,,,4a96226ee988b5ff,POINT (-72.6769559 41.7676351),Hartford,CT,-72.676956,41.767635,"['', '']"
4,,,30,STATE HOUSE SQ,Hartford,,,,07ff199dc68a5d04,POINT (-72.6725869 41.7673181),Hartford,CT,-72.672587,41.767318,"['', '']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,,,16,DOUGLAS ST,Hartford,,,,339bc7e6bc770185,POINT (-72.6783965 41.7384981),Hartford,CT,-72.678397,41.738498,"['', '']"
196,,,129,BARKER ST,Hartford,,,,23fd84282b5ca0f7,POINT (-72.6785946 41.7424828),Hartford,CT,-72.678595,41.742483,"['', '']"
197,,,19,DOUGLAS ST,Hartford,,,,ab47b0582261df87,POINT (-72.6784929 41.7382141),Hartford,CT,-72.678493,41.738214,"['', '']"
198,,,112,GILMAN ST,Hartford,,,,d1896f6d6cf95470,POINT (-72.6823262 41.7366613),Hartford,CT,-72.682326,41.736661,"['', '']"


In [None]:
def scrape_all(df):
    """
    Takes in a df containing addresses to output the offers from different ISPs.
    
    Parameters:
    df : Dataframe - The df containing the addresses
    
    Returns:
    results : list - The list containing the offers for each address
    """
    results = []
    
    for index, row in df.iterrows():
        street = row['street']
        number = row['number']
        streetType = street.split()[-1]
        streetName = " ".join(word for word in street.split()[:-1] if word not in streetType)
        city = row['city']
        zipcode = row['postcode']
        state = row['state']
        lat = row['lat']
        lon = row['lon']
        
        offers = scrape_isp(number, streetName, streetType, city, zipcode, state, lat, lon)
        results.append(offers)
    
    return results


In [89]:
def scrape_isp(number, streetName, streetType, city, zipcode, state, lat, lon):
    """
    Scrapes the offers of each ISP.
    
    Parameters:
    number : str - The building number
    streetName : str - The name of the street
    streetType : str - The type of street (eg. Ave, Sq, St)
    city : str - The name of the city
    state : str - The abbreviation of the state. (eg. MA, CA)
    lat : float - the latitude 
    lon : float - the longitude 
    
    Returns:
    offers : dictionary - The dictionary containing the offers for the address
    """
    offers = {}
    #offers['Hughes'] = get_hughes_offer_data(house_number= number, street_name = streetName, street_type=streetType, city=city, state=state, zip_code= zipcode, lat = lat, long = lon)
    offers['Xfinity'] = get_xfinity_offer_data(house_number= number, street_name = streetName, street_type=streetType, city=city, state=state, zip_code= zipcode, lat = lat, long = lon)
    offers['Viasat'] = get_viastat_offer_data("0", f'{number} {streetName} {streetType}', city, state, zipcode, lat, lon)
    return offers

In [92]:
scrape_isp('1274', 'Beacon', 'Street', 'Waban', 'MA', '02468', 42.330750, -71.215408)

{'Xfinity': {'address_full': '1274, Beacon Street, Waban, 02468, MA',
  'incorporated_place': 'Waban',
  'state': '02468',
  'lat': 42.33075,
  'lon': -71.215408,
  'collection_datetime': 1701641332.217844,
  'provider': 'xfinity',
  'speed_down': '400',
  'speed_up': '10',
  'speed_unit': 'Mbps',
  'price': '102.00',
  'technology': 'Cable',
  'package': 'Fast',
  'fastest_speed_down': 2000,
  'fastest_speed_price': '130.00'},
 'Viasat': {'address_full': '0, 1274 Beacon Street, Waban, 02468, MA',
  'incorporated_place': 'Waban',
  'state': '02468',
  'lat': 42.33075,
  'lon': -71.215408,
  'collection_datetime': 1701641337.4985468,
  'provider': 'Viasat',
  'speed_down': '25',
  'speed_up': 'Not specified',
  'speed_unit': 'Mbps',
  'price': 69.99,
  'technology': 'Satellite',
  'package': 'Choice 25 Mbps/60 GB',
  'fastest_speed_down': '50',
  'fastest_speed_price': 299.99}}

In [91]:
import requests
import time

def get_viastat_offer_data(unit: str, street: str, city: str, state: str, zip_code: str, lat: float, lon: float):
    """
    Takes in the address and returns the data of the offer provided by Verisat.
    
    Parameters:
    unit : str - The unit of your apartment / building. (eg. 530)
    street : str - The address line containing the name of your street and building number. (eg. 220 Huntington Ave)
    city : str - The city that the street belongs to. (eg. Boston)
    state : str - The state abbreviation where the street and address is in. (eg. MA, CA, IA)
    zip_code : str - The zipcode of the address. (eg. 02120)
    
    Returns:
    dataDict : dictionary - The dictionary that contains all of the data.
    """

    headers = {
        'authority': 'buy.viasat.com',
        'accept': '*/*',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJmbG93SWQiOiI1OGVmNDBkMi1lYWI1LTQ4ZTktYjdlOS05NzJlMjQwNWU2NjAiLCJsYW5ndWFnZSI6ImVuIiwiY291bnRyeSI6IlVTIiwiZW52IjoicHJvZCIsImN1c3RvbWVyVHlwZSI6InJlc2lkZW50aWFsIiwiaWF0IjoxNzAxNjQxMjY3LCJleHAiOjE3MDE2NDQ4Njd9.cVPse8M756oCpyTczoq4WYxKbokYo4NIiGwpYkkOFPM',
        'content-type': 'application/json',
        'origin': 'https://buy.viasat.com',
        'referer': 'https://buy.viasat.com/en-US/r/pln',
        'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    }

    json_data = {
        'operationName': 'getAvailableProducts',
        'variables': {
            'input': {
                'location': {
                    'address': {
                        'addressLines': [
                            street,
                            unit,
                        ],
                        'municipality': city,
                        'region': state,
                        'postalCode': zip_code,
                        'countryCode': 'US',
                    },
                    'coordinates': {
                        'longitude': lon,
                        'latitude': lat,
                    },
                },
                'salesAgreementId': '3f9e0dd5-612a-49f6-9e22-15bebfbeb175',
                'productSegment': 'RESIDENTIAL',
            },
        },
        'query': 'query getAvailableProducts($input: GetAvailableProductsInputR0) {\n  getAvailableProducts(input: $input) {\n    id\n    name\n    characteristics {\n      dataCap\n      dataCapUnits\n      uploadSpeed\n      uploadUnits\n      downloadSpeed\n      downloadUnits\n      displayOrder\n      freeZone\n      resolution\n      productFamily\n      dataAllowanceText\n      textGeneral\n      textGeneral01\n      inflectionPointText\n      bannerColor\n      routerText\n      shortName\n      benefits\n      attribute1\n      attribute2\n      titleAddOns\n      serviceType\n      tag\n      imageOneUrl\n      isRegulated\n      contractTerm\n      contractTermUnit\n      feeText\n      downloadRange\n      uploadSpeedText\n      typicalDownloadSpeed\n      __typename\n    }\n    offerId\n    price\n    extensionTypes\n    promo {\n      price\n      duration\n      __typename\n    }\n    bestFor\n    isCafII\n    totalDiscount {\n      price\n      duration\n      __typename\n    }\n    digitalServices {\n      iconUrl\n      __typename\n    }\n    __typename\n  }\n}\n',
    }

    response = requests.post('https://buy.viasat.com/graphql', cookies={}, headers=headers, json=json_data)

    data = response.json()

    dataDict = {}
    dataDict['address_full'] = f'{unit}, {street}, {city}, {state}, {zip_code}'
    dataDict['incorporated_place'] = city
    dataDict['state'] = state
    dataDict['lat'] = lat
    dataDict['lon'] = lon
    dataDict['collection_datetime'] = time.time()
    dataDict['provider'] = 'Viasat'
    
    plan_name = data['data']['getAvailableProducts'][0]['name'].split()
    slow_down_speed = plan_name[1]
    
    fast_down_speed = data['data']['getAvailableProducts'][4]['name'].split()[1]
    
    dataDict['speed_down'] = slow_down_speed
    dataDict['speed_up'] = 'Not specified'
    dataDict['speed_unit'] = 'Mbps'
    dataDict['price'] = data['data']['getAvailableProducts'][0]['price']
    dataDict['technology'] = 'Satellite'
    dataDict['package'] = data['data']['getAvailableProducts'][0]['name']
    dataDict['fastest_speed_down'] = fast_down_speed
    dataDict['fastest_speed_price'] = data['data']['getAvailableProducts'][4]['price']

    return dataDict


In [79]:
import requests
import regex as re
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

def get_fastest_speed_down_price(plans):
    names = []
    prices = []
    down_speeds = []

    for plan  in plans:

        name = plan.find(class_='plan-and-pricing-item__plan-data').text.strip()
        price = plan.find( class_='plan-and-pricing-item__monthly_price').text
        price = re.search('[0-9.]+', price).group()

        down_speed = plan.find('strong').text.split()[0]
        
        try:
            down_speed = eval(down_speed)
        except:
            down_speed = np.NaN
        
        prices.append(price)
        names.append(name)
        down_speeds.append(down_speed)
    
    ind = np.argmax([x for x in down_speeds if not pd.isna(x)])
    fastest_speed_down = down_speeds[ind]
    fastest_speed_price = prices[ind]
    
    return fastest_speed_down, fastest_speed_price

def get_hughes_offer_dataa(house_number: str, street_name: str, street_type:str, city: str, state: str, zip_code: str, lat: float, long: float):
    """
    Gets the response from https://www.hughesnet.com for searching for an offer at the given address
    in the United States.
    Parameters:
        house_number - str house number for the address
        street_name - str street name (i.e. Huntington)
        street_type - str type of street (i.e. Avenue, Boulevard, etc.)
        city - str name of city
        state - str state abberviation (i.e. MA)
        zip_code - str zip code
        lat - float latitude of the address
        long - float longitude for the address
    Return:
    json offer found for the given address
    """
    
    # replace the space with a "+" in the city name
    if ' ' in city:
        city = city.replace(' ', '+')
    # https://www.hughesnet.com
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': 'https://www.hughesnet.com',
        'Connection': 'keep-alive',
        'Referer': 'https://www.hughesnet.com/get-started',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }
    params = {
        'ajax_form': '1',
        '_wrapper_format': 'drupal_ajax',
    }
    data = 'autocomplete={0}+{1}+{2}^%^2C+{3}^%^2C+{4}^%^2C+USA&street={0}&route={1}+{2}&city={3}&state={4}&zip={5}&country=United+States&lat={6}&lng={7}&administrative_area_level_3=&plan_type=Residential&epq_backup_street_address=&epq_backup_city=&epq_backup_state=&epq_backup_zip_code=&form_build_id=form-ZeKWKj-Yp0BAJ2jKqgybxmXAikOkANpiLHqPRGxS9Ug&form_id=epq_lookup_form&email=&_triggering_element_name=epq_lookup&_triggering_element_value=Find+Plans&_drupal_ajax=1&ajax_page_state^%^5Btheme^%^5D=hughes_na_theme&ajax_page_state^%^5Btheme_token^%^5D=&ajax_page_state^%^5Blibraries^%^5D=blazy^%^2Fload^%^2Cclassy^%^2Fbase^%^2Cclassy^%^2Fmessages^%^2Cclassy^%^2Fnode^%^2Ccore^%^2Finternal.jquery.form^%^2Ccore^%^2Fnormalize^%^2Cgeolocation_google_maps^%^2Fgoogle^%^2Chughes_form_epq^%^2Fepq-form-tfn^%^2Chughes_form_epq^%^2Fepq-lookup-library^%^2Chughes_na_theme^%^2Fglobal-css^%^2Chughes_na_theme^%^2Fglobal-js^%^2Chughes_utility^%^2Fhughes_utility.language^%^2Chughes_utility^%^2Fhughes_utility.tracking^%^2Chughesnet_na_base_theme^%^2Fcard-display-epq^%^2Chughesnet_na_base_theme^%^2Fcountry-selector^%^2Chughesnet_na_base_theme^%^2Fcountry_selector^%^2Chughesnet_na_base_theme^%^2Fcritical-general-css^%^2Chughesnet_na_base_theme^%^2Fepq-set-height^%^2Chughesnet_na_base_theme^%^2Fepq_lookup_block^%^2Chughesnet_na_base_theme^%^2Fexpandable-listing^%^2Chughesnet_na_base_theme^%^2Fform-label-animation^%^2Chughesnet_na_base_theme^%^2Fform-validation-message^%^2Chughesnet_na_base_theme^%^2Fglobal-offer-banner^%^2Chughesnet_na_base_theme^%^2Fhomepage-campaign-landing-common^%^2Chughesnet_na_base_theme^%^2Finit-tool-tip^%^2Chughesnet_na_base_theme^%^2Fload-current-date^%^2Chughesnet_na_base_theme^%^2Fmain-menu-pr-mx^%^2Chughesnet_na_base_theme^%^2Fmatch-height^%^2Chughesnet_na_base_theme^%^2Fmenu-footer-padding^%^2Chughesnet_na_base_theme^%^2Fmove-add-to-any^%^2Chughesnet_na_base_theme^%^2Fnodes^%^2Chughesnet_na_base_theme^%^2Fonecol_layout^%^2Chughesnet_na_base_theme^%^2Fparagraphs^%^2Chughesnet_na_base_theme^%^2Fpartial-css^%^2Chughesnet_na_base_theme^%^2Fplan-and-pricing-mobile-carousel^%^2Chughesnet_na_base_theme^%^2Fredirect-pr^%^2Chughesnet_na_base_theme^%^2Fshow-popup-content-link^%^2Chughesnet_na_base_theme^%^2Fsticky_cta_bar^%^2Chughesnet_na_base_theme^%^2Ftext^%^2Chughesnet_na_base_theme^%^2Ftoggle-whatsapp-hours^%^2Chughesnet_na_base_theme^%^2Ftwo-column-content^%^2Chughesnet_na_base_theme^%^2Futility_navigation^%^2Cparagraphs^%^2Fdrupal.paragraphs.unpublished^%^2Csmart_content_datalayer^%^2Fdatalayer_push^%^2Csystem^%^2Fbase'.format(house_number, street_name, street_type, city, state, zip_code, lat, long)
    response = requests.post('https://www.hughesnet.com/get-started', params=params, cookies={}, headers=headers, data=data)
    offer = response.json()
    
    upload_speed_dict = {'15 GB':3, '50 GB':3, 'Fusion 100GB':3, 'Fusion 200GB':5}
    fastest_down_speed ,fastest_speed_price = get_fastest_speed_down_price(BeautifulSoup(offer[12]['data']).find_all(class_='plan-and-pricing-item'))
    cheapest_plan = BeautifulSoup(offer[12]['data']).find(class_='plan-and-pricing-item')
    package = cheapest_plan.find(class_='plan-and-pricing-item__plan-data').text.strip()
    price = cheapest_plan.find( class_='plan-and-pricing-item__monthly_price').text
    price = re.search('[0-9.]+', price).group()
    download_details = cheapest_plan.find('strong').text.split()
    speed_down = download_details[0]
    speed_unit = download_details[1]
    speed_up = upload_speed_dict[package]

    data_dict ={}
    data_dict['address_full'] = '{0}, {1} {2}, {3}, {4}, {5}'.format(house_number, street_name, street_type, city, state, zip_code)
    data_dict['incorporated_place'] = city
    data_dict['state'] = state
    data_dict['lat'] = lat
    data_dict['lon'] = long
    data_dict['collection_datetime'] = time.time()
    data_dict['provider'] = 'hughes'
    data_dict['speed_down'] = speed_down
    data_dict['speed_up'] = speed_up
    data_dict['speed_unit'] = speed_unit
    data_dict['price'] = price
    data_dict['technology'] = 'Satellite'
    data_dict['package'] = package
    data_dict['fastest_speed_down'] = fastest_down_speed
    data_dict['fastest_speed_price'] = fastest_speed_price

    return data_dict
