# Web scraping: headers, the networks tab and parsing an API URL
## Helpful links and resources
- [urllib](https://docs.python.org/3/library/urllib.parse.html#) is a Python library that will pick apart URLs
- [Sessions object - request library](https://docs.python-requests.org/en/master/user/advanced/#session-objects)

In [3]:
#import libraries
import pandas as pd
import requests
import json
import urllib
from urllib.parse import urlparse

## The networks tab
### Static data files
[Covid cases in the US - New York Times](https://www.nytimes.com/interactive/2021/us/covid-cases.html)

In [4]:
# get static data file
covid_cases_r = requests.get('https://static01.nyt.com/newsgraphics/2021/coronavirus-tracking/data/pages/usa/data.json')

In [5]:
covid_cases = covid_cases_r.json()

In [6]:
# covid_cases
covid_cases.keys()


dict_keys(['updated', 'updated_datetime', 'location', 'counties', 'states', 'clusters', 'page_notes', 'headline_override'])

In [7]:
len(covid_cases['states'])

56

In [8]:
covid_cases['states'][0]

{'country': 'United States',
 'display_name': 'Alabama',
 'nyt_abbr': 'Ala.',
 'geoid': 'USA-01',
 'href': 'https://www.nytimes.com/interactive/2021/us/alabama-covid-cases.html',
 'latest': {'total': {'cases': 551298, 'deaths': 11358},
  'average': {'cases': 121,
   'deaths': 2.857142857142857,
   'hospitalized': 274.6666666666667,
   'tests': 4143.571428571428},
  'vaccination': {'date': '2021-07-02',
   'people_vaccinated_pct_of_pop': 40.03422,
   'people_vaccinated_pct_of_pop_display': '40%',
   'people_fully_vaccinated_pct_of_pop': 32.67309,
   'people_fully_vaccinated_pct_of_pop_display': '33%',
   'people_vaccinated_12plus_pct_of_pop': 46.9,
   'people_vaccinated_12plus_pct_of_pop_display': '47%',
   'people_vaccinated_18plus_pct_of_pop': 50.2,
   'people_vaccinated_18plus_pct_of_pop_display': '50%',
   'people_vaccinated_65plus_pct_of_pop': 78.9,
   'people_vaccinated_65plus_pct_of_pop_display': '79%',
   'people_fully_vaccinated_12plus_pct_of_pop': 38.3,
   'people_fully_vaccin

In [9]:
covid_cases_df = pd.DataFrame(covid_cases['states'])

In [10]:
covid_cases_df.head(5)

Unnamed: 0,country,display_name,nyt_abbr,geoid,href,latest,long_name,percent_change_14day,population,hospital_area_population,region,region_type,slug,state,subregion,date,population_adjustment
0,United States,Alabama,Ala.,USA-01,https://www.nytimes.com/interactive/2021/us/al...,"{'total': {'cases': 551298, 'deaths': 11358}, ...",Alabama,"{'raw': {'cases': -25.701754385964914, 'deaths...",4903185,4903185,Alabama,state,us/alabama-covid-cases,"{'us_state_fips': '01', 'name': 'Alabama', 'sh...",,2021-07-05,
1,United States,Alaska,Alaska,USA-02,https://www.nytimes.com/interactive/2021/us/al...,"{'total': {'cases': 70774, 'deaths': 360}, 'av...",Alaska,"{'raw': {'cases': -19.58333333333333, 'deaths'...",731545,731545,Alaska,state,us/alaska-covid-cases,"{'us_state_fips': '02', 'name': 'Alaska', 'sho...",,2021-07-05,
2,United States,Arizona,Ariz.,USA-04,https://www.nytimes.com/interactive/2021/us/ar...,"{'total': {'cases': 897010, 'deaths': 17979}, ...",Arizona,"{'raw': {'cases': 16.357504215851606, 'deaths'...",7278717,7278717,Arizona,state,us/arizona-covid-cases,"{'us_state_fips': '04', 'name': 'Arizona', 'sh...",,2021-07-05,
3,United States,Arkansas,Ark.,USA-05,https://www.nytimes.com/interactive/2021/us/ar...,"{'total': {'cases': 351825, 'deaths': 5920}, '...",Arkansas,"{'raw': {'cases': 121.43734643734643, 'deaths'...",3017804,3017804,Arkansas,state,us/arkansas-covid-cases,"{'us_state_fips': '05', 'name': 'Arkansas', 's...",,2021-07-05,
4,United States,California,Calif.,USA-06,https://www.nytimes.com/interactive/2021/us/ca...,"{'total': {'cases': 3822073, 'deaths': 63256},...",California,"{'raw': {'cases': 7.66069331699597, 'deaths': ...",39512223,39512223,California,state,us/california-covid-cases,"{'us_state_fips': '06', 'name': 'California', ...",,2021-07-05,


In [11]:
#Washington Post USPS data
#https://www.washingtonpost.com/business/interactive/2021/dejoy-usps-delays-by-zip-code-map/?utm_source=twitter&utm_campaign=wp_graphics&utm_medium=social

df = pd.read_csv('https://www.washingtonpost.com/business/interactive/2021/dejoy-usps-delays-by-zip-code-map/usps.csv')

In [14]:
df.head(5)

Unnamed: 0,zip3,avg_days_cur,avg_days_new,avg_days_diff
0,5,2.665517,3.103653,-0.438135
1,10,2.420418,2.778212,-0.357794
2,11,2.411635,2.745268,-0.333633
3,12,2.439943,2.809153,-0.36921
4,13,2.422443,2.835077,-0.412634


### "Secret" APIs
Shopping websites are good candidates for secret APIs, such as [Target](www.target.com)

#### Target's Search API

In [15]:
target_search_url = 'https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v1?key=ff457966e64d5e877fdbad070f276d18ecec4a01&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&keyword=paper+plates&offset=0&page=%2Fs%2Fpaper+plates&platform=desktop&pricing_store_id=2213&scheduled_delivery_store_id=2213&store_ids=2213%2C3333%2C1249%2C1255%2C1289&useragent=Mozilla%2F5.0+%28X11%3B+CrOS+x86_64+13904.66.0%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36&visitor_id=017A794441090201900B029CA7100EB3'

In [None]:
#url comes from Network > XHR > Headers > Request

In [184]:
# search for an item with the networks tab open to ID which APIs you can use

In [16]:
# parse the URL so it's easier to read
parsed_url = urlparse(target_search_url)

In [17]:
# check the parsed URL, this is a tuple
parsed_url

ParseResult(scheme='https', netloc='redsky.target.com', path='/redsky_aggregations/v1/web/plp_search_v1', params='', query='key=ff457966e64d5e877fdbad070f276d18ecec4a01&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&keyword=paper+plates&offset=0&page=%2Fs%2Fpaper+plates&platform=desktop&pricing_store_id=2213&scheduled_delivery_store_id=2213&store_ids=2213%2C3333%2C1249%2C1255%2C1289&useragent=Mozilla%2F5.0+%28X11%3B+CrOS+x86_64+13904.66.0%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36&visitor_id=017A794441090201900B029CA7100EB3', fragment='')

In [18]:
# format the endpoint and parameters, structure the endpoint first
parsed_url[0]

'https'

In [19]:
parsed_url[1]

'redsky.target.com'

In [20]:
target_search_endpoint = parsed_url[0] + "://" + parsed_url[1] + parsed_url[2]

In [21]:
target_search_endpoint

'https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v1'

In [22]:
# change something in the parameters (like keyword), key value pairs are separated by an &, use split method to get an arrary
#this creates a list
parsed_url[4].split('&')

['key=ff457966e64d5e877fdbad070f276d18ecec4a01',
 'channel=WEB',
 'count=24',
 'default_purchasability_filter=true',
 'include_sponsored=true',
 'keyword=paper+plates',
 'offset=0',
 'page=%2Fs%2Fpaper+plates',
 'platform=desktop',
 'pricing_store_id=2213',
 'scheduled_delivery_store_id=2213',
 'store_ids=2213%2C3333%2C1249%2C1255%2C1289',
 'useragent=Mozilla%2F5.0+%28X11%3B+CrOS+x86_64+13904.66.0%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36',
 'visitor_id=017A794441090201900B029CA7100EB3']

In [189]:
# get request with endpoint and params

In [23]:
target_search_parameters = {}
for key_value in parsed_url[4].split('&'):
    print(key_value.split('='))

['key', 'ff457966e64d5e877fdbad070f276d18ecec4a01']
['channel', 'WEB']
['count', '24']
['default_purchasability_filter', 'true']
['include_sponsored', 'true']
['keyword', 'paper+plates']
['offset', '0']
['page', '%2Fs%2Fpaper+plates']
['platform', 'desktop']
['pricing_store_id', '2213']
['scheduled_delivery_store_id', '2213']
['store_ids', '2213%2C3333%2C1249%2C1255%2C1289']
['useragent', 'Mozilla%2F5.0+%28X11%3B+CrOS+x86_64+13904.66.0%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36']
['visitor_id', '017A794441090201900B029CA7100EB3']


In [24]:
#loop through and assign it the second value in the list
target_search_parameters = {}
for key_value in parsed_url[4].split('&'):
    key_value_list = key_value.split('=')
    target_search_parameters[key_value_list[0]]=[key_value_list[1]]

In [25]:
target_search_parameters

{'key': ['ff457966e64d5e877fdbad070f276d18ecec4a01'],
 'channel': ['WEB'],
 'count': ['24'],
 'default_purchasability_filter': ['true'],
 'include_sponsored': ['true'],
 'keyword': ['paper+plates'],
 'offset': ['0'],
 'page': ['%2Fs%2Fpaper+plates'],
 'platform': ['desktop'],
 'pricing_store_id': ['2213'],
 'scheduled_delivery_store_id': ['2213'],
 'store_ids': ['2213%2C3333%2C1249%2C1255%2C1289'],
 'useragent': ['Mozilla%2F5.0+%28X11%3B+CrOS+x86_64+13904.66.0%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36'],
 'visitor_id': ['017A794441090201900B029CA7100EB3']}

In [26]:
#find out if this is a get or post request, near request you see "GET"
target_search_r = requests.get(target_search_endpoint, params=target_search_parameters)


# drill down the json file

In [27]:
# drill down some more, 200 response is good
target_search_r

<Response [200]>

In [28]:
#it's json
target_search_r.text

'{"data":{"search":{"search_suggestions":["plastic utensils","valentines paper plates","colored paper plates","paper ice cream bowls","light pink plastic plates","white plastic plates","birthday paper plates","bridal shower paper plates","baby shower paper plates","rainbow party plates","red plastic plates","light pink paper plates","brown paper plates","paper plates napkins","gold plastic plates","pastel party plates","pastel plastic plates","paper plates for wedding","wedding plastic plates","elegant paper dinner napkins"],"search_recommendations":{"related_categories":[],"related_queries":[]},"search_response":{"facet_list":[{"name":"d_categorytaxonomy","type":"url","display_name":"Category","expand":true,"details":[{"display_name":"Party Supplies","url":"keyword=paper%2Bplates&sort_by=relevance&count=24&offset=0&category=5xt3c","value":"5xt3c"},{"display_name":"Household Essentials","url":"keyword=paper%2Bplates&sort_by=relevance&count=24&offset=0&category=5xsz1","value":"5xsz1"},{

In [29]:
#It's a dictionary, get keys, only one, so drill down wiht data
target_search_r.json()['data']['search'].keys()

dict_keys(['search_suggestions', 'search_recommendations', 'search_response', 'products'])

In [30]:
#we have a list
target_search_r.json()['data']['search']['products']

[{'__typename': 'ProductSummary',
  'tcin': '75666853',
  'original_tcin': '75666853',
  'item': {'relationship_type': 'Stand Alone',
   'relationship_type_code': 'SA',
   'merchandise_classification': {'class_id': 5, 'department_id': 253},
   'eligibility_rules': {'add_on': {'is_active': True},
    'scheduled_delivery': {'is_active': True}},
   'enrichment': {'buy_url': 'https://www.target.com/p/line-plaid-paper-plate-8-5-34-90ct-up-38-up-8482/-/A-75666853',
    'images': {'primary_image_url': 'https://target.scene7.com/is/image/Target/GUEST_39b91919-bb96-44a4-a419-2257cfd40fc5',
     'alternate_image_urls': ['https://target.scene7.com/is/image/Target/GUEST_8ffaba71-1687-4107-9e3d-c49036c358ed']}},
   'dpci': '253-05-0356',
   'cart_add_on_threshold': 35.0,
   'product_description': {'title': 'Line Plaid Paper Plate 8.5&#34; - 90ct - up &#38; up&#8482;',
    'bullet_descriptions': ['<B>Features:</B> Round (shape)',
     '<B>Dimensions (Overall):</B> 8.55 Inches (L), 8.55 Inches (W)',


In [31]:
target_search_r.json()['data']['search']['products'][0]
#the tcin is interesting, could track this product over time, is one reg price different by region?

{'__typename': 'ProductSummary',
 'tcin': '75666853',
 'original_tcin': '75666853',
 'item': {'relationship_type': 'Stand Alone',
  'relationship_type_code': 'SA',
  'merchandise_classification': {'class_id': 5, 'department_id': 253},
  'eligibility_rules': {'add_on': {'is_active': True},
   'scheduled_delivery': {'is_active': True}},
  'enrichment': {'buy_url': 'https://www.target.com/p/line-plaid-paper-plate-8-5-34-90ct-up-38-up-8482/-/A-75666853',
   'images': {'primary_image_url': 'https://target.scene7.com/is/image/Target/GUEST_39b91919-bb96-44a4-a419-2257cfd40fc5',
    'alternate_image_urls': ['https://target.scene7.com/is/image/Target/GUEST_8ffaba71-1687-4107-9e3d-c49036c358ed']}},
  'dpci': '253-05-0356',
  'cart_add_on_threshold': 35.0,
  'product_description': {'title': 'Line Plaid Paper Plate 8.5&#34; - 90ct - up &#38; up&#8482;',
   'bullet_descriptions': ['<B>Features:</B> Round (shape)',
    '<B>Dimensions (Overall):</B> 8.55 Inches (L), 8.55 Inches (W)',
    '<B>Package 

In [32]:
#reassign the keyword
target_search_parameters['keyword']='paper+cups'
target_search_parameters

{'key': ['ff457966e64d5e877fdbad070f276d18ecec4a01'],
 'channel': ['WEB'],
 'count': ['24'],
 'default_purchasability_filter': ['true'],
 'include_sponsored': ['true'],
 'keyword': 'paper+cups',
 'offset': ['0'],
 'page': ['%2Fs%2Fpaper+plates'],
 'platform': ['desktop'],
 'pricing_store_id': ['2213'],
 'scheduled_delivery_store_id': ['2213'],
 'store_ids': ['2213%2C3333%2C1249%2C1255%2C1289'],
 'useragent': ['Mozilla%2F5.0+%28X11%3B+CrOS+x86_64+13904.66.0%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F91.0.4472.114+Safari%2F537.36'],
 'visitor_id': ['017A794441090201900B029CA7100EB3']}

#### Target's aggregation API

In [33]:
# parse the URL so it's easier to read
#copy and paste request from above
target_search_r = requests.get(target_search_endpoint, params=target_search_parameters)

In [2]:
# check the parsed URL

In [197]:
# format the endpoint and parameters

In [198]:
# change something in the parameters (like tcins)

In [199]:
# get request with endpoint and params

In [34]:
# drill down the json file
target_search_r.json()['data']['search']['products'][0]

{'__typename': 'ProductSummary',
 'tcin': '12970172',
 'original_tcin': '12970172',
 'item': {'relationship_type': 'Stand Alone',
  'relationship_type_code': 'SA',
  'merchandise_classification': {'class_id': 5, 'department_id': 253},
  'eligibility_rules': {'add_on': {'is_active': True},
   'scheduled_delivery': {'is_active': True}},
  'enrichment': {'buy_url': 'https://www.target.com/p/dixie-everyday-assorted-designs-cold-cups-54ct-9oz/-/A-12970172',
   'images': {'primary_image_url': 'https://target.scene7.com/is/image/Target/GUEST_9b4d6235-6e2e-4c7c-a541-6babba8eec63',
    'alternate_image_urls': ['https://target.scene7.com/is/image/Target/GUEST_d2d66bf2-7f60-444c-a286-16f29b008b5f',
     'https://target.scene7.com/is/image/Target/GUEST_4f0f4c6a-4310-4db5-b9ff-f5ef64671679',
     'https://target.scene7.com/is/image/Target/GUEST_5e6b372c-c099-4e8b-93b0-8611f6580497']},
   'videos': [{'is_list_page_eligible': False,
     'video_files': [{'mime_type': 'video/mp4',
       'video_url': 

In [35]:
# look for an api that pulls item information, hit clear button in dev tools, look at categories/cleaning supplies
#aggregation api with lots of tcins
target_aggregate_url = 'https://redsky.target.com/redsky_aggregations/v1/web/plp_fulfillment_v1?key=ff457966e64d5e877fdbad070f276d18ecec4a01&tcins=81107269%2C81068829%2C14135567%2C81068792%2C82079503%2C81829962%2C81068790%2C81506339%2C80935950%2C81107259%2C81068797%2C11069188%2C81506334%2C81107271%2C81068773%2C81180792%2C81107267%2C81068789%2C81068796%2C81506336%2C81107268%2C81068821%2C81564691%2C81953908%2C81068815%2C81068825%2C81068787%2C81564688&store_id=2850&zip=11201&state=NY&latitude=40.690&longitude=-74.000&scheduled_delivery_store_id=2850'


In [36]:
parsed_url = urlparse(target_aggregate_url)
parsed_url

ParseResult(scheme='https', netloc='redsky.target.com', path='/redsky_aggregations/v1/web/plp_fulfillment_v1', params='', query='key=ff457966e64d5e877fdbad070f276d18ecec4a01&tcins=81107269%2C81068829%2C14135567%2C81068792%2C82079503%2C81829962%2C81068790%2C81506339%2C80935950%2C81107259%2C81068797%2C11069188%2C81506334%2C81107271%2C81068773%2C81180792%2C81107267%2C81068789%2C81068796%2C81506336%2C81107268%2C81068821%2C81564691%2C81953908%2C81068815%2C81068825%2C81068787%2C81564688&store_id=2850&zip=11201&state=NY&latitude=40.690&longitude=-74.000&scheduled_delivery_store_id=2850', fragment='')

In [37]:
#create a function that pulls out key value pairs, build off before and make variables replaceable, pass it a parsed url
def parse_target_url(url):
    params = {}
    for key_value in url[4].split('&'):
        key_value_list = key_value.split('=')
        params[key_value_list[0]]= key_value_list[1]
    return params

In [38]:
target_aggregate_params = parse_target_url(parsed_url)

In [39]:
target_aggregate_params

{'key': 'ff457966e64d5e877fdbad070f276d18ecec4a01',
 'tcins': '81107269%2C81068829%2C14135567%2C81068792%2C82079503%2C81829962%2C81068790%2C81506339%2C80935950%2C81107259%2C81068797%2C11069188%2C81506334%2C81107271%2C81068773%2C81180792%2C81107267%2C81068789%2C81068796%2C81506336%2C81107268%2C81068821%2C81564691%2C81953908%2C81068815%2C81068825%2C81068787%2C81564688',
 'store_id': '2850',
 'zip': '11201',
 'state': 'NY',
 'latitude': '40.690',
 'longitude': '-74.000',
 'scheduled_delivery_store_id': '2850'}

In [40]:
target_aggregate_endpoint = parsed_url[0] + "://" + parsed_url[1] + parsed_url[2]

In [41]:
target_aggregate_endpoint

'https://redsky.target.com/redsky_aggregations/v1/web/plp_fulfillment_v1'

In [42]:
target_list = urlparse('https://redsky.target.com/redsky_aggregations/v1/web/plp_fulfillment_v1?key=ff457966e64d5e877fdbad070f276d18ecec4a01&tcins=81107269%2C81068829%2C14135567%2C81068792%2C82079503%2C81829962%2C81068790%2C81506339%2C80935950%2C81107259%2C81068797%2C11069188%2C81506334%2C81107271%2C81068773%2C81180792%2C81107267%2C81068789%2C81068796%2C81506336%2C81107268%2C81068821%2C81564691%2C81953908%2C81068815%2C81068825%2C81068787%2C81564688&store_id=2850&zip=11201&state=NY&latitude=40.690&longitude=-74.000&scheduled_delivery_store_id=2850')

In [43]:
target_aggregate_params['tcins'] = '12970172'

In [87]:
target_aggregate_params

{'key': 'ff457966e64d5e877fdbad070f276d18ecec4a01',
 'tcins': '12970172',
 'store_id': '2850',
 'zip': '11201',
 'state': 'NY',
 'latitude': '40.690',
 'longitude': '-74.000',
 'scheduled_delivery_store_id': '2850'}

In [44]:
target_aggregate_r = requests.get(target_aggregate_endpoint, params=target_aggregate_params)

In [45]:
#target_aggregate_r.json()
#lots of apis, trial and error get what you want, this isn't exactly it

In [46]:
#parsed_url = urlparse(target_list)

## Using sessions to login
### Accessing password-protected pages
[Sessions object - request library](https://docs.python-requests.org/en/master/user/advanced/#session-objects)

In [170]:
#don't know what to do here
# open up a session so that your login credentials are saved

In [171]:
# load in config file with passwords

In [172]:
# check the website for the login parameters

In [173]:
# post the payload to the site to login with the correct log in endpoint

In [174]:
# check credentials to see if successful

In [175]:
# look at an example page to get you started with a query

In [177]:
# create a new post object from the example

In [None]:
# post request for the data

In [179]:
# check to see what is returned