In [1]:
import requests
from requests import get 
from bs4 import BeautifulSoup
import json
import pandas as pd 

# Initialize the constants used for crawling

In [None]:
# URL-encoded location strings substituted into the API request path.
location = [
    'Son%20La%20Viet%20Nam',
    'Lang%20Son',
    'Hanoi',
    'Nghe%20An',
    'Da%20Nang',
    'Lam%20Dong',
]
# Short aliases used when naming output files; paired by index with `location`
# (only the first two locations have an alias).
location_name = ['SonLa', 'LangSon']

# Request URL template; the placeholders are filled in by callAPI().
base_api = (
    'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/'
    'timeline/{location}/{date1}/{date2}?key={api_key}'
)
# Extra query-string parameters sent with every request.
parameter = dict(include='days', unitGroup='metric')

# First year of each crawl period, indexed by the crawl-run number.
years = [2019, 2021, 2023]
# Date suffixes appended to a year: [1 January, 31 December].
day_base = ['-01-01', '-12-31']

# callAPI() function
- Parameters:
    - timesIndex: index of the crawl run; it selects which period in `years` to fetch
        - The free API key limits this code to crawling 1000 records per day, so the crawl is split into several runs
    - location_str: location string used in the API request
    - location_name: location name used to name the backup file, in case an error occurs at runtime
- Idea: 
    - Call the API using the URL structure defined in its documentation
    - Get the data in binary form
    - Decode the binary data as UTF-8
    - Parse the data using the json module
    - Return the parsed JSON data
- Warnings:  
    - The API key must be stored in a 'key.txt' file in the same directory as this notebook

In [3]:
def callAPI(timesIndex: int, location_str: str, location_name: str) -> "dict | list":
    """Fetch the daily weather data for one location and one crawl period.

    The period starts on 1 January of ``years[timesIndex]``. When
    ``timesIndex == 2`` it ends on 1 January of the following year; otherwise
    it ends on 31 December of the following year.

    The decoded response body is also appended to the backup file
    ``jsonData<timesIndex><location_name>.txt``.

    Parameters
    ----------
    timesIndex : int
        Index into the module-level ``years`` list selecting the period.
    location_str : str
        Location string substituted into the request URL path.
    location_name : str
        Alias used in the backup file name.

    Returns
    -------
    dict or list
        The parsed JSON payload on success, or an empty list when the request
        fails or the server answers with a status other than 200.

    Notes
    -----
    The API key is read from ``key.txt`` in the current working directory.
    """
    # Strip surrounding whitespace: a trailing newline in the key file would
    # otherwise end up inside the request URL.
    with open('key.txt', 'r') as key_file:
        key_content = key_file.read().strip()

    # Build the date range for this crawl run.
    first_year = years[timesIndex]
    day1 = str(first_year) + day_base[0]
    # The third run only covers a single year.
    end_suffix = day_base[0] if timesIndex == 2 else day_base[1]
    day2 = str(first_year + 1) + end_suffix

    run_api = base_api.format(location=location_str,
                              date1=day1,
                              date2=day2,
                              api_key=key_content)

    # Call the API. Only the exception type is reported, because the exception
    # message may contain the request URL, which includes the API key.
    try:
        response = get(run_api, params=parameter)
    except requests.exceptions.RequestException as err:
        print('callAPI: request failed ({})'.format(type(err).__name__))
        return []
    if response.status_code != 200:
        print('callAPI: unexpected HTTP status {}'.format(response.status_code))
        return []

    # Decode the binary body to text.
    json_content = response.content.decode('utf-8')

    # Append the raw text to the backup file before parsing, so it is kept
    # even if parsing fails.
    backup_path = 'jsonData' + str(timesIndex) + location_name + '.txt'
    with open(backup_path, 'a', encoding='utf-8') as backup_file:
        backup_file.write(json_content)

    return json.loads(json_content)


# convert_data_frame() function
- Parameters
    - json_values: parsed JSON data returned by the callAPI() function
    - location_name: location name used in the output file name
    - times: crawl-run index used in the output file name
- Idea: 
    - Treat each day's data as an object
    - Append all values to a 2D array
    - Get all the column names
    - Pass the resulting Python dict to the pd.DataFrame function to build the dataframe

In [None]:
def convert_data_frame(json_values, location_name, times) -> pd.DataFrame:
    """Convert the ``'days'`` records of a parsed API response to a DataFrame.

    The column names and their order are taken from the first day record.
    The resulting frame is also written to ``<location_name><times>.csv``
    (comma-separated, including the index column).

    Parameters
    ----------
    json_values : dict
        Parsed JSON data containing a non-empty ``'days'`` list of dicts.
    location_name : str
        Prefix of the output CSV file name.
    times : int
        Value appended to ``location_name`` in the output CSV file name.

    Returns
    -------
    pd.DataFrame
        One row per day record, one column per key of the first record.

    Raises
    ------
    ValueError
        If ``json_values`` is not a dict or has no ``'days'`` records.
    """
    days = json_values.get('days') if isinstance(json_values, dict) else None
    if not days:
        raise ValueError("json_values contains no 'days' records to convert")

    # Column set and order come from the first day record.
    feature_name = list(days[0])

    # Look every value up by key rather than by iteration position, so a day
    # whose keys are ordered differently cannot shift values into the wrong
    # column; a key missing from a day yields None.
    df_map = {name: [day.get(name) for day in days] for name in feature_name}

    raw_df = pd.DataFrame(df_map)
    raw_df.to_csv(location_name + str(times) + '.csv', sep=',')
    return raw_df

# crawlData() function
- Calls the two functions above

In [5]:
def crawlData(location_link: str, location_alias: str, day: int = 0):
    """Fetch one crawl period for a location and convert it to a DataFrame.

    Parameters
    ----------
    location_link : str
        Location string passed to ``callAPI`` as ``location_str``.
    location_alias : str
        Location alias used by both helpers when naming their output files.
    day : int, default 0
        Crawl-run index passed to ``callAPI`` as ``timesIndex`` and to
        ``convert_data_frame`` as ``times``.

    Returns
    -------
    tuple
        ``(json_data, raw_df)``: the parsed JSON data and the DataFrame built
        from it. Both are returned so they can be inspected when debugging.

    Raises
    ------
    RuntimeError
        If ``callAPI`` returns no data.
    """
    json_data = callAPI(timesIndex=day, location_str=location_link, location_name=location_alias)
    if not json_data:
        # callAPI returns an empty list when the request fails; stop here with
        # a clear message instead of failing inside convert_data_frame.
        raise RuntimeError(
            'callAPI returned no data for ' + location_alias + ' (run ' + str(day) + ')'
        )

    raw_df = convert_data_frame(json_values=json_data, location_name=location_alias, times=day)
    print(raw_df.head())
    return json_data, raw_df


# Crawl the data, one run per day
- Note: the third run (day 2) of both locations can be crawled on the same day, because each of them covers only 1 year

In [None]:
# Son La, run 0: 2019-01-01 to 2020-12-31.
json_deb, df_deb = crawlData(location_link=location[0], location_alias=location_name[0], day=0)

In [None]:
# Son La, run 1: 2021-01-01 to 2022-12-31.
json_deb, df_deb = crawlData(location_link=location[0], location_alias=location_name[0], day=1)

In [None]:
# Son La, run 2: 2023-01-01 to 2024-01-01.
json_deb, df_deb = crawlData(location_link=location[0], location_alias=location_name[0], day=2)

In [None]:
# Lang Son, run 0: 2019-01-01 to 2020-12-31.
json_deb, df_deb = crawlData(location_link=location[1], location_alias=location_name[1], day=0)

In [None]:
# Lang Son, run 1: 2021-01-01 to 2022-12-31.
json_deb, df_deb = crawlData(location_link=location[1], location_alias=location_name[1], day=1)

In [None]:
# Lang Son, run 2: 2023-01-01 to 2024-01-01.
json_deb, df_deb = crawlData(location_link=location[1], location_alias=location_name[1], day=2)

# Merge the separate CSV files into a single CSV file per location

In [None]:
# Aliases of the locations whose per-run CSV files are merged.
local_list = ['LangSon', 'SonLa']

# The loop variable is deliberately not called `location`: that name is the
# module-level list of API location strings, and overwriting it would break
# the crawl cells if they are run again after this cell.
for local_name in local_list:
    # The first column of each per-run file is the saved DataFrame index;
    # read it back as the index so it does not become a data column.
    df_list = [
        pd.read_csv(local_name + str(run) + '.csv', index_col=0)
        for run in range(3)
    ]
    # Renumber the rows so the merged file has one continuous index instead
    # of three repeated ranges.
    full_df = pd.concat(df_list, ignore_index=True)
    full_df.to_csv(local_name + '.csv')
