In [24]:
import os
import json
import requests

import pandas as pd
from dotenv import load_dotenv

load_dotenv();

# Helpers

In [2]:
HCM_DISTRICTS = [
    "Thu Duc",
    "Quan 1",
    "Quan 3",
    "Quan 4",
    "Quan 5",
    "Quan 6",
    "Quan 7",
    "Quan 8",
    "Quan 10",
    "Quan 11",
    "Quan 12",
    "Quan Binh Tan",
    "Quan Binh Thanh",
    "Quan Go Vap",
    "Quan Phu Nhuan",
    "Quan Tan Binh",
    "Quan Tan Phu",
    "Huyen Binh Chanh",
    "Huyen Can Gio",
    "Huyen Cu Chi",
    "Huyen Hoc Mon",
    "Huyen Nha Be",
]

In [6]:
def get_weather_data(location, date1, date2=None):
    """
    Collect weather data from API:
        https://www.visualcrossing.com/resources/documentation/weather-api/timeline-weather-api/
    """
    
    BASE_URL = r"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline"
    PARAMS = {
        "unitGroup": "metric",
        "key": os.getenv("API_KEY"),
        "contentType": "json"
    }

    url = f"{BASE_URL}/{location}/{date1}"
    if date2:
        url = f"{url}/{date2}"
    res = requests.get(url, params=PARAMS)
    
    if res.status_code != 200:
        raise Exception(res.text)
    
    return res.json()

In [22]:
def create_dataframe(raw_data, fields):
    """
    Create pandas.DataFrame object from the raw data
    args:
        + raw_data (dict): JSON data get from the API
        + fields (list[str]): List of fields to keep, should be selected from:
            [   'datetime', 'datetimeEpoch', 'temp', 'feelslike', 'humidity', 'dew', 'precip',
                'precipprob', 'snow', 'snowdepth', 'preciptype', 'windgust', 'windspeed',
                'winddir', 'pressure', 'visibility', 'cloudcover', 'solarradiation', 'solarenergy',
                'uvindex', 'severerisk', 'conditions', 'icon', 'stations', 'source']
    """
    data = {col: [] for col in ["datetime", "location"] + fields}
    for day_data in raw_data["days"]:
        for hour_data in day_data["hours"]:
            data["datetime"].append(day_data["datetime"] + " " + hour_data["datetime"])
            data["location"].append(raw_data["resolvedAddress"])
            for field in fields:
                data[field].append(hour_data[field])
    
    df = pd.DataFrame(data)
    return df.head()

Unnamed: 0,datetime,location,precip,conditions,icon
0,2023-09-07 00:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
1,2023-09-07 01:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
2,2023-09-07 02:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
3,2023-09-07 03:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
4,2023-09-07 04:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night


# Run

In [43]:
date1 = "2023-09-08"
date2 = "2023-09-09"
fields = ["precip", "conditions", "icon"]
df = None

for district in HCM_DISTRICTS:
    raw_data = get_weather_data(f"{district}, Ho Chi Minh, Viet Nam", date1, date2)
    json.dump(raw_data, open(f"./data/raw/{district}.json", "w"))
    new_df = create_dataframe(raw_data, fields)
    if df is None:
        df = new_df
    df = pd.concat([df, new_df])

df.to_csv("./data/preprocessed/all_districts.csv", index=False)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115 entries, 0 to 4
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    115 non-null    object 
 1   location    115 non-null    object 
 2   precip      115 non-null    float64
 3   conditions  115 non-null    object 
 4   icon        115 non-null    object 
dtypes: float64(1), object(4)
memory usage: 5.4+ KB


In [45]:
df.head()

Unnamed: 0,datetime,location,precip,conditions,icon
0,2023-09-08 00:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
1,2023-09-08 01:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
2,2023-09-08 02:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
3,2023-09-08 03:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
4,2023-09-08 04:00:00,"Thủ Đức, Hồ Chí Minh, Việt Nam",0.0,Partially cloudy,partly-cloudy-night
