# Historical Weather Data Scraper

This notebook has instructions for how to scrape historical weather data from https://www.timeanddate.com/weather/@3427761/historic?month=7&year=2010

## 0. Import Modules

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

## 1. Function to Retrieve Weather Data

`scrape_weather_data` takes the date to scrape as 3 parameters:  
1. Year
2. Month
3. Day

The idea is to retrieve data from 2009 to 2023 from the website's table and then generate a dataframe and csv file with the data.

In [3]:
def scrape_weather_data(year, month, day, timeout=30):
    """Scrape one day of historical weather observations into a DataFrame.

    Requests the timeanddate.com "historic" page for the given date, looks up
    the HTML table with id ``wt-his`` and turns each observation row into one
    DataFrame row.

    Parameters
    ----------
    year, month, day : int or str
        Date to request. The values are interpolated into the URL query string
        and parsed together with the ``'%Y-%m-%d'`` format.
    timeout : float or tuple, optional
        Passed to ``requests.get`` as ``timeout`` so that a stalled connection
        cannot block a long scraping loop indefinitely. Defaults to 30.

    Returns
    -------
    pandas.DataFrame
        Columns: Date, Time, Temperature, Weather, Wind Speed, Wind Direction,
        Humidity, Barometer, Visibility. ``Date`` holds ``datetime.date``
        objects; every other column holds the stripped cell text as found on
        the page. If the table is not found, a message is printed and a single
        placeholder row of zeros is returned.

    Raises
    ------
    ValueError
        If ``year``/``month``/``day`` do not form a valid date. This is checked
        before any network request is made.

    Notes
    -----
    Exceptions raised by ``requests.get`` (including timeouts) are not caught
    here and propagate to the caller.
    """
    columns = ['Date', 'Time', 'Temperature', 'Weather', 'Wind Speed',
               'Wind Direction', 'Humidity', 'Barometer', 'Visibility']

    # 0. Parse the date once: it is the same for every row of the page, and an
    #    invalid date should fail before we spend a network round-trip on it.
    date = datetime.strptime(f"{year}-{month}-{day}", '%Y-%m-%d').date()

    # 1. Set URL (month, year, and day are the query parameters)
    url = f'https://www.timeanddate.com/weather/@3427761/historic?month={month}&year={year}&day={day}'

    # 2. Request the page; the timeout keeps one hung connection from stalling the run
    response = requests.get(url, timeout=timeout)

    # 3. Find the table in the content of the response
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find(id='wt-his')

    # 3.1 Validate the existence of the table
    if table is None:
        print("The table was not found on the page.")
        # Keep the established contract: one placeholder row of zeros.
        return pd.DataFrame([[0] * len(columns)], columns=columns)

    # 4. Extract data from each row of the table
    data_rows = []
    for row in table.find_all('tr'):
        cells = row.find_all(['td', 'th'])

        # Only rows with the expected number of cells are observations
        if len(cells) != 9:
            continue

        # Skip rows whose first cell is not a clock time (e.g. header rows)
        time_text = cells[0].text.strip()
        if ':' not in time_text:
            continue

        # 5. cells[1] is deliberately skipped: it has no matching output column
        #    (presumably an icon cell -- confirm against the page markup).
        values = [cell.text.strip() for cell in cells[2:9]]
        data_rows.append([date, time_text, *values])

    # 6. Convert the list of rows into a DataFrame
    return pd.DataFrame(data_rows, columns=columns)

## 2. Call Function

To scrape data from several years, you can use a for loop to iterate over years and months.  

This is an example of how you can do it. In this case we retrieve data only from 2022 onwards (2022 and 2023), because we only had tide data for Tigre starting in 2022.  

Note: this can take a while

In [None]:
# Collect one DataFrame per scraped day; they are merged after the loop
all_data = []

# Walk every calendar day of 2022 and 2023 (the upper bound of range is exclusive)
for year in range(2022, 2024):
    for month in range(1, 13):

        # Let pandas work out the length of the month, leap years included
        days_in_month = pd.Timestamp(year=year, month=month, day=1).days_in_month

        for day in range(1, days_in_month + 1):
            df = scrape_weather_data(year, month, day)
            all_data.append(df)

            # Report progress, since the full run takes a while
            print(f"Scraped data for {year}-{month}-{day}")

# Stack the daily frames into a single table with a fresh 0..n-1 index
final_df = pd.concat(all_data, ignore_index=True)
final_df.head()

## 3. Save CSV file

Don't forget to save your CSV file once you have scraped your data.

In [7]:
# Write the combined table to a CSV file in the working directory.
# With to_csv's defaults the row index is written as the first column.
output_path = 'tigre_weather_2022-2023.csv'
final_df.to_csv(output_path)