# Scraping Daily Weather Data

In [46]:
#Importing the encessary packages:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime

#### **Creating the necessary lists where the data scraped/fetched will reside**

In [49]:
Timestamp = []
Date = []
Time = []
Temperature = []
ReelFeel_Temperature = []
ReelFeel_Temperature_Shade = []
Max_UV_Index = []
Wind = []
Probability_of_Thunderstorms = []
Cloud_Cover = []

#### **Getting the needed webpage from where the scraping will be done**

In [52]:
#Requesting the webpage by using the webdriver:
webpage = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
webpage.get('https://www.accuweather.com/en/ae/dubai/323091/daily-weather-forecast/323091?day=1')

#### **Scraping the content of the webpage using BeatifulSoup**

In [54]:
#Getting the content:
content = webpage.page_source
#Parsing the content result as html using BeatifulSoup:
result = BeautifulSoup(content, 'html.parser')

#### **Fetching the needed data from the result/soup**

In [56]:
#Fetching the section that we will scrape from:
weather_data = result.find_all('div', {'class': 'page-content content-module'})

In [57]:
#Looping through the weather_data and appending the Time and Temperature lists:
for item in weather_data:
    for time in item.find_all('div', {'class': 'half-day-card-header__title'}):
        Time.append(time.find('h2', {'class': 'title'}).string)
        
    for temp in item.find_all('div', {'class': 'weather'}):
        Temperature.append(temp.find('div', {'class': 'temperature'}).text.replace('\n', '').replace('\t', ''))

In [58]:
#Creating the timestamp for each record:
Date_and_time = datetime.today()
Timestamp.append(Date_and_time) #during day
Timestamp.append(Date_and_time) #during night

#Based on the complexity of retrieving data using HTML, I am using XPATH which is a simpler approach:
Date.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[1]/div').text) # date matching the day
Date.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[1]/div').text) #date matching the night

Max_UV_Index.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/p[1]/span').text)
Max_UV_Index.append('') #appending null since this data is not relevant at night

Wind.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]/p[2]/span').text) #during day
Wind.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[4]/div[2]/div[2]/div[1]/p[1]/span').text) #during night

Probability_of_Thunderstorms.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[2]/div[2]/div[2]/div[2]/p[1]/span').text) #during day
Probability_of_Thunderstorms.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[4]/div[2]/div[2]/div[2]/p[1]/span').text) #during night

Cloud_Cover.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[2]/div[2]/div[2]/div[2]/p[3]/span').text) #during day
Cloud_Cover.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[4]/div[2]/div[2]/div[2]/p[3]/span').text) #during night

ReelFeel_Temperature.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[2]/div[1]/div[2]/div[2]/div[1]').text) #during day
ReelFeel_Temperature.append(webpage.find_element(By.XPATH,'/html/body/div/div[7]/div[1]/div[1]/div[4]/div[1]/div[2]/div[2]/div').text) #during night

ReelFeel_Temperature_Shade.append(webpage.find_element(By.XPATH, '/html/body/div/div[7]/div[1]/div[1]/div[2]/div[1]/div[2]/div[2]/div[2]/div').text)
ReelFeel_Temperature_Shade.append('') #appending null since this data is not relevant at night

#### **Wrangling and Showing the created lists**

In [65]:
Date

['TUESDAY, JULY 2', 'TUESDAY, JULY 2']

In [67]:
Time

['Day', 'Night']

In [69]:
Temperature

['44°Hi', '32°Lo']

In [71]:
ReelFeel_Temperature

['RealFeel® 46°', 'RealFeel® 36°']

In [73]:
#Cleaning the ReelFeel_Temperature list by keeping only the temperature measure:
ReelFeel = []
for i in ReelFeel_Temperature:
    ReelFeel.append(i.split(' ')[-1])

In [75]:
ReelFeel

['46°', '36°']

In [77]:
ReelFeel_Temperature_Shade

['RealFeel Shade™ 42°', '']

In [79]:
#Cleaning the ReelFeel_Temperature_Shade list by keeping only the temperature measure:
ReelFeel_Shade = []
for i in ReelFeel_Temperature_Shade:
    ReelFeel_Shade.append(i.split(' ')[-1])

In [81]:
ReelFeel_Shade

['42°', '']

In [83]:
Max_UV_Index

['13 Extreme', '']

In [85]:
#Splitting the Max_UV_Index list and creating an additional list that holds the severity of the UV index:
UV_Index = []
UV_Index_Severity = []
for i in Max_UV_Index:
    UV_Index.append(i.split(' ')[0])
    UV_Index_Severity.append(i[2:].lstrip(' '))                           

In [87]:
UV_Index

['13', '']

In [89]:
UV_Index_Severity

['Extreme', '']

In [91]:
Wind

['WSW 17 km/h', 'E 7 km/h']

In [93]:
Probability_of_Thunderstorms

['0%', '0%']

In [95]:
Cloud_Cover

['2%', '0%']

#### **Creating a dataframe out of the lists**

In [98]:
weather_df_day = pd.DataFrame({'Timestamp': Timestamp, 'Date': Date, 'Time': Time, 'Temperature': Temperature, 'ReelFeel': ReelFeel, 'ReelFeel_Shade': ReelFeel_Shade, 'UV_Index': UV_Index, \
                                 'UV_Index_Severity': UV_Index_Severity, 'Wind': Wind, 'Probability_of_Thunderstorms': Probability_of_Thunderstorms, 'Cloud_Cover': Cloud_Cover})

In [100]:
weather_df_day

Unnamed: 0,Timestamp,Date,Time,Temperature,ReelFeel,ReelFeel_Shade,UV_Index,UV_Index_Severity,Wind,Probability_of_Thunderstorms,Cloud_Cover
0,2024-07-02 13:35:23.382927,"TUESDAY, JULY 2",Day,44°Hi,46°,42°,13.0,Extreme,WSW 17 km/h,0%,2%
1,2024-07-02 13:35:23.382927,"TUESDAY, JULY 2",Night,32°Lo,36°,,,,E 7 km/h,0%,0%


#### **Converting the Dataframe into CSV Format and Storing it into the Local Machine**

In [103]:
weather_df_day.to_csv(r"C:\Users\cesar\OneDrive\Documents\Cesar documents\Data Science Projects\Weather Data Streaming Project\Streaming Datasets - CSV\weather_day1.csv", index=False)

# Sinking the Streamed CSV Files into a PostgreSQL table using COPY

#### **Connecting to PostgreSQL Database (<span style="color:blue">this section should run only once within the pipeline</span>)**

In [107]:
#Importing the necessary libraries in order to connect to the Database:
import psycopg2
from sqlalchemy import create_engine

In [109]:
#Establishing the connection to the Database:
conn = psycopg2.connect(host='127.0.0.1',
        port=5432,
        database='dbcesar',
        user='postgres',
        password='Trust_no1',
        connect_timeout=3)

In [111]:
#Creating the engine that will be responsible to execute sql queries in the Database:
engine = create_engine('postgresql://postgres:Trust_no1@127.0.0.1:5432/dbcesar')

In [113]:
#Creating the table in the Database using ONLY the first scraped dataframe - this approach is needed to avoid creating the table manually directly into the Database:
weather_df_day.to_sql('daily_weather_streaming', engine, schema='public', index=False, if_exists='replace')

2

In [115]:
#Committing the changes to the Databse:
conn.commit()

#### **Copying the streamed Dataframe one by one into the Database**

In [None]:
#Writing the datafrmae/file into Postgres using autocommit:
from sqlalchemy.sql import text

#write_df = engine.execute(text('''copy public.daily_weather_streaming ("Timestamp", "Date", "Time", "Temperature", "ReelFeel", "ReelFeel_Shade", "UV_Index", "UV_Index_Severity", "Wind", "Probability_of_Thunderstorms", "Cloud_Cover") 
                                #from 'C:\\Users\\cesar\\OneDrive\\Documents\\Cesar documents\\Data Science Projects\\Weather Data Streaming Project\\Streaming Datasets - CSV\\weather_day1.csv' CSV HEADER;''').execution_options(autocommit=True))

cur = conn.cursor()
with open('C:\\Users\\cesar\\OneDrive\\Documents\\Cesar documents\\Data Science Projects\\Weather Data Streaming Project\\Streaming Datasets - CSV\\weather_day1.csv') as f:
    cur.copy_expert('COPY public.daily_weather_streaming ("Timestamp", "Date", "Time", "Temperature", "ReelFeel", "ReelFeel_Shade", "UV_Index", "UV_Index_Severity", "Wind", "Probability_of_Thunderstorms", "Cloud_Cover") FROM STDIN WITH HEADER CSV', f)
    conn.commit()