# Weather Scraper

This notebook shows how to scrape historical weather data from https://www.timeanddate.com/weather/@3427761/historic?month=7&year=2010

## Step 1. Import Modules

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

## Step 2. Function to Retrieve Weather Data

scrape_weather_data has 3 parameters:  
1. Year
2. Month
3. Day

The idea is to retrieve data from 2009 to 2023 from the website's table and then generate a dataframe and csv file with the data.

In [None]:
def scrape_weather_data(year, month, day, timeout=30):
    """Scrape the hourly weather table for one day from timeanddate.com.

    Requests the "historic" page of location ``@3427761`` with ``year``,
    ``month`` and ``day`` as query parameters, looks up the element with id
    ``wt-his`` and converts every 9-cell row whose first cell holds a clock
    time into one dataframe row.

    Args:
        year: Year of the requested day (int or numeric string).
        month: Month of the requested day.
        day: Day of the month.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a single stalled connection would block a
            long scraping run indefinitely.

    Returns:
        pandas.DataFrame with the columns Date, Time, Temperature, Weather,
        Wind Speed, Wind Direction, Humidity, Barometer and Visibility.
        When the table is not present on the page, a message is printed and
        a single placeholder row of zeros is returned instead.

    Raises:
        ValueError: If year/month/day do not form a valid calendar date.
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    columns = ['Date', 'Time', 'Temperature', 'Weather', 'Wind Speed',
               'Wind Direction', 'Humidity', 'Barometer', 'Visibility']

    # The date is the same for every row, so parse it once. Doing it before
    # the request also rejects an invalid date without hitting the network.
    observation_date = datetime.strptime(f"{year}-{month}-{day}", '%Y-%m-%d').date()

    # 1. set URL (month, year, and day are the parameters)
    # NOTE(review): confirm the site actually honours the `day` parameter;
    # previously scraped output showed a label of a different day in the
    # first row.
    url = f'https://www.timeanddate.com/weather/@3427761/historic?month={month}&year={year}&day={day}'

    # 2. request
    response = requests.get(url, timeout=timeout)

    # 3. Find the table from the content of the response
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find(id='wt-his')

    # 3.1 Validate the existence of the table
    if table is None:
        print("The table was not found on the page.")
        # Assign default values of zero to the dataframe.
        return pd.DataFrame({column: [0] for column in columns})

    # 4. Extract data from each row
    data_rows = []
    for row in table.find_all('tr'):
        cells = row.find_all(['td', 'th'])
        if len(cells) != 9:  # Not a data row with the expected layout
            continue

        # Take only the first text fragment of the cell: the cell can carry an
        # extra nested label after the time, which `.text` would glue onto it
        # (e.g. "20:00do 15 feb").
        time = next(cells[0].stripped_strings, '')

        # Skip rows that do not contain valid time information (headers etc.)
        if ':' not in time:
            continue

        # 5. Remaining cells become the dataframe columns; cells[1] is not
        # used (presumably an icon column - verify against the page).
        measurements = [cell.text.strip() for cell in cells[2:9]]
        data_rows.append([observation_date, time, *measurements])

    # 6. Convert the list of rows into a DataFrame
    return pd.DataFrame(data_rows, columns=columns)


## Step 3. Call Function

Once the function is ready, we will loop over every day from 2009 to 2023 to collect all the available data.

In [None]:
# Scrape every day from 2009 through 2023 and combine the results.
import calendar

# Collects one dataframe per scraped day
all_data = []

# Loop from 2009 to 2023 inclusive. range() excludes its stop value, so the
# stop must be 2024 for 2023 to be scraped.
for year in range(2009, 2024):
    for month in range(1, 13):

        # monthrange returns (weekday of the 1st, number of days in the month)
        # and already accounts for leap years.
        days_in_month = calendar.monthrange(year, month)[1]

        for day in range(1, days_in_month + 1):
            df = scrape_weather_data(year, month, day)
            all_data.append(df)

            # Print progress
            print(f"Scraped data for {year}-{month}-{day}")

# Concatenate all dataframes into a single dataframe with a fresh 0..n-1 index
final_df = pd.concat(all_data, ignore_index=True)
final_df.head()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Scraped data for 2009-4-24
Scraped data for 2009-4-25
Scraped data for 2009-4-26
Scraped data for 2009-4-27
Scraped data for 2009-4-28
Scraped data for 2009-4-29
Scraped data for 2009-4-30
Scraped data for 2009-5-1
Scraped data for 2009-5-2
Scraped data for 2009-5-3
Scraped data for 2009-5-4
Scraped data for 2009-5-5
Scraped data for 2009-5-6
Scraped data for 2009-5-7
Scraped data for 2009-5-8
Scraped data for 2009-5-9
Scraped data for 2009-5-10
Scraped data for 2009-5-11
Scraped data for 2009-5-12
Scraped data for 2009-5-13
Scraped data for 2009-5-14
Scraped data for 2009-5-15
Scraped data for 2009-5-16
Scraped data for 2009-5-17
Scraped data for 2009-5-18
Scraped data for 2009-5-19
Scraped data for 2009-5-20
Scraped data for 2009-5-21
Scraped data for 2009-5-22
Scraped data for 2009-5-23
Scraped data for 2009-5-24
Scraped data for 2009-5-25
Scraped data for 2009-5-26
Scraped data for 2009-5-27
Scraped data for 2009-5-28

Unnamed: 0,Date,Time,Temperature,Weather,Wind Speed,Wind Direction,Humidity,Barometer,Visibility
0,2009-01-01,20:00do 15 feb,24 °C,Clear.,24 km/h,↑,65%,1017 mbar,16 km
1,2009-01-01,19:00,25 °C,Sunny.,24 km/h,↑,57%,1017 mbar,16 km
2,2009-01-01,18:00,25 °C,Sunny.,19 km/h,↑,54%,1017 mbar,16 km
3,2009-01-01,17:00,25 °C,Sunny.,19 km/h,↑,57%,1017 mbar,16 km
4,2009-01-01,16:00,26 °C,Passing clouds.,9 km/h,↑,51%,1017 mbar,


## Step 4. Save CSV file

In [None]:
# Write the combined dataframe to a CSV file in the current working directory
final_df.to_csv(path_or_buf='tigre_historical_weather_data.csv')

Choose whether you want to download the CSV file to your local machine or save it to your Google Drive.

In [None]:
# Code to download the CSV file written in Step 4 to your local machine

from google.colab import files

# Step 4 wrote the file to the current working directory, not to Google Drive.
# The Drive path only exists after Drive has been mounted and the file saved
# there, which is why pointing at it here failed with FileNotFoundError.
file_path = 'tigre_historical_weather_data.csv'

# Send the CSV file to the local machine
files.download(file_path)


FileNotFoundError: Cannot find file: /content/drive/MyDrive/tigre_historical_weather_data.csv

In [None]:
# Persist the scraped dataset to Google Drive from Google Colab
from google.colab import drive

# Attach Google Drive under /content/drive
drive.mount('/content/drive')

# Write the dataframe as a CSV file inside the mounted MyDrive folder
final_df.to_csv(path_or_buf='/content/drive/MyDrive/tigre_historical_weather_data.csv')

Mounted at /content/drive
