In [41]:
# New module
import requests

# Known modules
import json
import os
import pandas as pd


## Review: JSON

In [42]:
# load json file from "~/Downloads/scores.json"
# expanduser replaces "~" with the current user's home directory, so the
# path is not tied to one particular machine
filename = os.path.expanduser("~/Downloads/scores.json")
# JSON text is UTF-8, so read it as UTF-8 regardless of the platform default
with open(filename, 'r', encoding='utf-8') as f:
    scores = json.load(f)
scores

{'alice': 100, 'bob': 200, 'cindy': 300}

What is the type of `scores`?

In [45]:
# check which Python type json.load produced for this file
type(scores)

dict

What are the keys in `scores`?

In [46]:
# .keys() gives a view of every key stored in the dictionary
scores.keys()

dict_keys(['alice', 'bob', 'cindy'])

What is the value of `scores["bob"]`?

In [47]:
# look up the value stored under the key "bob"
scores["bob"]

200

Let's look at a more interesting JSON file.

Kiva is a microlending platform that gives out small loans to people around the world.

We'll load in a JSON file containing details about people who got loans through Kiva.

In [48]:
# expanduser replaces "~" with the current user's home directory, so the
# path is not tied to one particular machine
filename = os.path.expanduser("~/Downloads/kiva.json")
# JSON text is UTF-8, so read it as UTF-8 regardless of the platform default
with open(filename, 'r', encoding='utf-8') as f:
    kiva = json.load(f)

The file is large, so we won't print it all.

In [49]:
# show the type of the top-level object parsed from kiva.json
type(kiva)

dict

In [50]:
# show the keys available at the top level
kiva.keys()

dict_keys(['data'])

In [51]:
# explore data: check the type of the value stored under the 'data' key
type(kiva["data"])

dict

In [52]:
# list the keys one level down, inside kiva["data"]
kiva["data"].keys()

dict_keys(['lend'])

We'll continue this process until we eventually reach the data (usually, when the value stops being a dictionary).

In [53]:
# follow the chain of keys data -> lend -> loans -> values and check the type found there
type(kiva['data']['lend']['loans']['values'])

list

In [54]:
# index [0] picks the first element of that list; check its type
type(kiva['data']['lend']['loans']['values'][0])

dict

In [55]:
# list the keys of the first element to see which fields it carries
kiva['data']['lend']['loans']['values'][0].keys()

dict_keys(['name', 'description', 'loanAmount', 'geocode'])

Print the name of each loan recipient

In [56]:
# walk the list of loan records and print the 'name' field of each one
loan_records = kiva['data']['lend']['loans']['values']
for loan in loan_records:
    print(loan['name'])

Polikseni
Safarmo
Elizabeth
Ester
Cherifa


See what Elizabeth's loan amount was, and what country she lives in

In [57]:
# first, see structure of geocode, using the first loan record as a sample
first_geocode = kiva['data']['lend']['loans']['values'][0]['geocode']
print(type(first_geocode))
print(first_geocode.keys())

# then go one level deeper, into the 'country' entry of that geocode
first_country = first_geocode['country']
print(type(first_country))
print(first_country.keys())


<class 'dict'>
dict_keys(['city', 'country'])
<class 'dict'>
dict_keys(['name', 'region', 'fundsLentInCountry'])


In [58]:
# scan kiva['data']['lend']['loans']['values'] for the record named Elizabeth
for loan in kiva['data']['lend']['loans']['values']:
    if loan["name"] != "Elizabeth":
        continue  # not the person we are looking for
    print(loan["loanAmount"])
    print(loan["geocode"]["country"]["name"])

800.00
Kenya


## Warm up: Scraping text
The easiest case is when a webpage contains only text, such as [https://raw.githubusercontent.com/annapmeyer/annapmeyer.github.io/refs/heads/anna/files/teaching/demo/hello.txt](https://raw.githubusercontent.com/annapmeyer/annapmeyer.github.io/refs/heads/anna/files/teaching/demo/hello.txt)

We'll use the Python library `requests`.

In [4]:
url = "https://raw.githubusercontent.com/annapmeyer/annapmeyer.github.io/refs/heads/anna/files/teaching/demo/hello.txt"
# timeout (in seconds) keeps the cell from hanging forever if the server stalls
r = requests.get(url, timeout=10) # r is the response
print(r.status_code)
print(r.text)

200
Hello students! This is a text file.


In [5]:
# same request, but the URL ends in a typo, so the file will not be found
typo_url = "https://raw.githubusercontent.com/annapmeyer/annapmeyer.github.io/refs/heads/anna/files/teaching/demo/hello.txttttt"
# timeout (in seconds) keeps the cell from hanging forever if the server stalls
r = requests.get(typo_url, timeout=10)
print(r.status_code)
print(r.text)

404
404: Not Found


In [60]:
# We can check for a status_code error by using an assert
typo_url = "https://raw.githubusercontent.com/annapmeyer/annapmeyer.github.io/refs/heads/anna/files/teaching/demo/hello.txttttt"
# timeout (in seconds) keeps the cell from hanging forever if the server stalls
r = requests.get(typo_url, timeout=10)
# the URL above is wrong, so this assert is expected to fail and stop the cell here
assert r.status_code == 200
print(r.status_code)
print(r.text)

AssertionError: 

In [61]:
# Let's try to catch that error

try:
    # timeout (in seconds) keeps the cell from hanging forever if the server stalls
    r = requests.get(typo_url, timeout=10)
    # raises requests.HTTPError when the status code is a 4xx or 5xx error
    # (similar in spirit to asserting r.status_code == 200)
    r.raise_for_status()
    print(r.text)
except requests.HTTPError as e:
    # only HTTP error statuses are handled here; e carries the status and URL
    print("oops!!", e)
    

oops!! 404 Client Error: Not Found for url: https://raw.githubusercontent.com/annapmeyer/annapmeyer.github.io/refs/heads/anna/files/teaching/demo/hello.txttttt


## Scraping JSON

- `json.load` (FILE_OBJECT)
- `json.loads` (STRING)

In [9]:
# GETting a JSON file, the long way
url = "https://raw.githubusercontent.com/annapmeyer/annapmeyer.github.io/refs/heads/anna/files/teaching/demo/scores.json"
# timeout (in seconds) keeps the cell from hanging forever if the server stalls
r = requests.get(url, timeout=10)
r.raise_for_status()  # stop with an HTTPError instead of parsing an error page
urltext = r.text
print(urltext)
# json.loads parses a STRING (json.load would need a file object)
d = json.loads(urltext)
print(type(d), d)

{
    "alice": 100,
    "bob": 200,
    "cindy": 300
}
<class 'dict'> {'alice': 100, 'bob': 200, 'cindy': 300}


In [10]:
# GETting a JSON file, the shortcut way
url = "https://cs220.cs.wisc.edu/scores.json"
# timeout (in seconds) keeps the cell from hanging forever if the server stalls
r = requests.get(url, timeout=10)
r.raise_for_status()  # stop with an HTTPError instead of parsing an error page
# Shortcut to bypass using json.loads(): the response parses its own body
d2 = r.json()
print(type(d2), d2)

<class 'dict'> {'alice': 100, 'bob': 200, 'cindy': 300}


### Explore real-world JSON

How to explore an unknown JSON?
- If you run into a `dict`, try `.keys()` method to look at the keys of the dictionary, then use lookup process to explore further
- If you run into a `list`, iterate over the list and print each item

#### Weather for Northampton, MA
- URL: https://api.weather.gov/gridpoints/BOX/18,79/forecast

In [11]:
# GET the forecast
url = "https://api.weather.gov/gridpoints/BOX/18,79/forecast"
# timeout (in seconds) keeps the cell from hanging forever if the API stalls
r = requests.get(url, timeout=10)
r.raise_for_status()  # stop with an HTTPError on a 4xx/5xx response
weather_data = r.json()

# Explore the type of the data structure
print(type(weather_data))

# display the data
weather_data


<class 'dict'>


{'@context': ['https://geojson.org/geojson-ld/geojson-context.jsonld',
  {'@version': '1.1',
   'wx': 'https://api.weather.gov/ontology#',
   'geo': 'http://www.opengis.net/ont/geosparql#',
   'unit': 'http://codes.wmo.int/common/unit/',
   '@vocab': 'https://api.weather.gov/ontology#'}],
 'type': 'Feature',
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-72.6741446, 42.3344671],
    [-72.67896590000001, 42.313014599999995],
    [-72.64994080000001, 42.309446099999995],
    [-72.64511340000001, 42.330898299999994],
    [-72.6741446, 42.3344671]]]},
 'properties': {'units': 'us',
  'forecastGenerator': 'BaselineForecastGenerator',
  'generatedAt': '2024-11-05T22:40:56+00:00',
  'updateTime': '2024-11-05T21:03:45+00:00',
  'validTimes': '2024-11-05T15:00:00+00:00/P8DT6H',
  'elevation': {'unitCode': 'wmoUnit:m', 'value': 60.0456},
  'periods': [{'number': 1,
    'name': 'This Afternoon',
    'startTime': '2024-11-05T17:00:00-05:00',
    'endTime': '2024-11-05T18:00:00-05:00',
    '

In [12]:
# TODO: display the keys of the weather_data dict
print(weather_data.keys())

# TODO: lookup the value corresponding to the 'properties'
forecast_properties = weather_data['properties']
print(forecast_properties)

# TODO: you know what to do next ... explore type again
print(type(forecast_properties))

dict_keys(['@context', 'type', 'geometry', 'properties'])
{'units': 'us', 'forecastGenerator': 'BaselineForecastGenerator', 'generatedAt': '2024-11-05T22:40:56+00:00', 'updateTime': '2024-11-05T21:03:45+00:00', 'validTimes': '2024-11-05T15:00:00+00:00/P8DT6H', 'elevation': {'unitCode': 'wmoUnit:m', 'value': 60.0456}, 'periods': [{'number': 1, 'name': 'This Afternoon', 'startTime': '2024-11-05T17:00:00-05:00', 'endTime': '2024-11-05T18:00:00-05:00', 'isDaytime': True, 'temperature': 72, 'temperatureUnit': 'F', 'temperatureTrend': '', 'probabilityOfPrecipitation': {'unitCode': 'wmoUnit:percent', 'value': None}, 'windSpeed': '7 mph', 'windDirection': 'S', 'icon': 'https://api.weather.gov/icons/land/day/few?size=medium', 'shortForecast': 'Sunny', 'detailedForecast': 'Sunny, with a high near 72. South wind around 7 mph.'}, {'number': 2, 'name': 'Tonight', 'startTime': '2024-11-05T18:00:00-05:00', 'endTime': '2024-11-06T06:00:00-05:00', 'isDaytime': False, 'temperature': 57, 'temperatureUnit

In [13]:
# TODO: display the keys of the properties dict
forecast_properties = weather_data['properties']
print(forecast_properties.keys())

# TODO: lookup the value corresponding to the 'periods'
forecast_periods = forecast_properties['periods']
print(forecast_periods)

# TODO: you know what to do next ... explore type again
print(type(forecast_periods))

dict_keys(['units', 'forecastGenerator', 'generatedAt', 'updateTime', 'validTimes', 'elevation', 'periods'])
[{'number': 1, 'name': 'This Afternoon', 'startTime': '2024-11-05T17:00:00-05:00', 'endTime': '2024-11-05T18:00:00-05:00', 'isDaytime': True, 'temperature': 72, 'temperatureUnit': 'F', 'temperatureTrend': '', 'probabilityOfPrecipitation': {'unitCode': 'wmoUnit:percent', 'value': None}, 'windSpeed': '7 mph', 'windDirection': 'S', 'icon': 'https://api.weather.gov/icons/land/day/few?size=medium', 'shortForecast': 'Sunny', 'detailedForecast': 'Sunny, with a high near 72. South wind around 7 mph.'}, {'number': 2, 'name': 'Tonight', 'startTime': '2024-11-05T18:00:00-05:00', 'endTime': '2024-11-06T06:00:00-05:00', 'isDaytime': False, 'temperature': 57, 'temperatureUnit': 'F', 'temperatureTrend': '', 'probabilityOfPrecipitation': {'unitCode': 'wmoUnit:percent', 'value': None}, 'windSpeed': '9 mph', 'windDirection': 'S', 'icon': 'https://api.weather.gov/icons/land/night/few?size=medium',

In [None]:
# TODO: extract periods list into a variable
periods_list = weather_data['properties']['periods']

# TODO: create a DataFrame using periods_list
period_df = pd.DataFrame(periods_list)

# TODO: What does each inner data structure represent in your DataFrame?
#       Keep in mind that outer data structure is a list.
#       (Each inner dict should become one row, with its keys as the column
#       names -- confirm against the table displayed below.)

# show only the first few rows rather than the whole table
period_df.head()

What are the maximum and minimum observed temperatures? Include the `temperatureUnit` in your display.

In [None]:
# work from the temperature column once instead of re-selecting it each time
temperatures = period_df['temperature']

# row labels of the coldest and warmest periods
idx_min = temperatures.idxmin()
idx_max = temperatures.idxmax()

# the extreme values themselves
min_temp = temperatures.min()
max_temp = temperatures.max()

# read the unit from the same rows the extremes came from
min_unit = period_df.loc[idx_min, 'temperatureUnit']
max_unit = period_df.loc[idx_max, 'temperatureUnit']

print("Minimum observed temperature is: {} degree {}".format(min_temp, min_unit))
print("Maximum observed temperature is: {} degree {}".format(max_temp, max_unit))

Which days' `detailedForecast` contain `thunderstorms`?

In [None]:
# Keep the periods whose detailedForecast mentions thunderstorms.
#   regex=False -> treat the search text as a plain substring, not a pattern
#   case=False  -> also match "Thunderstorms" at the start of a sentence
#   na=False    -> a missing forecast counts as "no match" instead of
#                  putting NaN into the boolean mask
storm_days_df = period_df[
    period_df["detailedForecast"].str.contains(
        "thunderstorms", case=False, na=False, regex=False
    )
]
storm_days_df

Which day's `detailedForecast` has the longest description?

In [None]:
# idxmax returns an index *label*, so fetch the row with .loc (label-based)
# rather than .iloc (position-based); the two only agree while the frame
# still has its default 0..n-1 index
idx_max_desc = period_df['detailedForecast'].str.len().idxmax()
period_df.loc[idx_max_desc, 'name']

Save the data as a CSV file

In [None]:
# write the table to weather.csv; index=False leaves the row index out of the file
period_df.to_csv("weather.csv", index=False)