# Scraping Daily Weather Data

In [17]:
#Importing the necessary packages:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

#### **Creating the necessary lists where the data scraped/fetched will reside**

In [202]:
# One independent, empty accumulator list per scraped field; later cells fill them.
(
    Date,
    Time,
    Temperature,
    ReelFeel_Temperature,
    ReelFeel_Temperature_Shade,
    Max_UV_Index,
    Wind,
    Probability_of_Thunderstorms,
    Cloud_Cover,
) = ([] for _ in range(9))

#### **Getting the needed webpage from where the scraping will be done**

In [203]:
# Launch Chrome through a service built from ChromeDriverManager().install(),
# then load the AccuWeather daily forecast page for Dubai (day=6).
chrome_service = ChromeService(ChromeDriverManager().install())
webpage = webdriver.Chrome(service=chrome_service)
webpage.get('https://www.accuweather.com/en/ae/dubai/323091/daily-weather-forecast/323091?day=6')

#### **Scraping the content of the webpage using BeautifulSoup**

In [204]:
# Take the HTML as currently held by the browser session.
content = webpage.page_source
# Parse that HTML with BeautifulSoup using the 'html.parser' backend.
result = BeautifulSoup(markup=content, features='html.parser')

#### **Fetching the needed data from the result/soup**

In [205]:
# Collect every <div> whose class is 'page-content content-module'; the next cell scrapes inside these.
weather_data = result.find_all(name='div', attrs={'class': 'page-content content-module'})

In [206]:
# Walk each content section: half-day card titles go to Time,
# and the newline/tab-stripped temperature text goes to Temperature.
for section in weather_data:
    for card_header in section.find_all('div', {'class': 'half-day-card-header__title'}):
        title_tag = card_header.find('h2', {'class': 'title'})
        Time.append(title_tag.string)

    for weather_card in section.find_all('div', {'class': 'weather'}):
        raw_temperature = weather_card.find('div', {'class': 'temperature'}).text
        Temperature.append(raw_temperature.replace('\n', '').replace('\t', ''))

In [207]:
# The remaining fields are read straight from the live page with absolute XPaths,
# which the author found simpler than navigating the parsed HTML.
# All paths share one prefix; the day values sit under div[2] and the night values under div[4].
_FORECAST_ROOT = '/html/body/div/div[7]/div[1]/div[1]'
_DAY_CARD = _FORECAST_ROOT + '/div[2]'
_NIGHT_CARD = _FORECAST_ROOT + '/div[4]'


def _text_at(xpath):
    """Return the text of the element found at `xpath` in the open `webpage` session."""
    return webpage.find_element(By.XPATH, xpath).text


# The same date element is read twice so that the day row and the night row each get a date.
Date.append(_text_at(_FORECAST_ROOT + '/div[1]/div'))  # date matching the day
Date.append(_text_at(_FORECAST_ROOT + '/div[1]/div'))  # date matching the night

Max_UV_Index.append(_text_at(_DAY_CARD + '/div[2]/div[2]/div[1]/p[1]/span'))
Max_UV_Index.append('')  # empty placeholder: this value is not relevant at night

Wind.append(_text_at(_DAY_CARD + '/div[2]/div[2]/div[1]/p[2]/span'))  # during day
Wind.append(_text_at(_NIGHT_CARD + '/div[2]/div[2]/div[1]/p[1]/span'))  # during night

Probability_of_Thunderstorms.append(_text_at(_DAY_CARD + '/div[2]/div[2]/div[2]/p[1]/span'))  # during day
Probability_of_Thunderstorms.append(_text_at(_NIGHT_CARD + '/div[2]/div[2]/div[2]/p[1]/span'))  # during night

Cloud_Cover.append(_text_at(_DAY_CARD + '/div[2]/div[2]/div[2]/p[3]/span'))  # during day
Cloud_Cover.append(_text_at(_NIGHT_CARD + '/div[2]/div[2]/div[2]/p[3]/span'))  # during night

ReelFeel_Temperature.append(_text_at(_DAY_CARD + '/div[1]/div[2]/div[2]/div[1]'))  # during day
ReelFeel_Temperature.append(_text_at(_NIGHT_CARD + '/div[1]/div[2]/div[2]/div'))  # during night

ReelFeel_Temperature_Shade.append(_text_at(_DAY_CARD + '/div[1]/div[2]/div[2]/div[2]/div'))
ReelFeel_Temperature_Shade.append('')  # empty placeholder: this value is not relevant at night

#### **Wrangling and Showing the created lists**

In [208]:
Date

['TUESDAY, 9 MAY', 'TUESDAY, 9 MAY']

In [209]:
Time

['Day', 'Night']

In [210]:
Temperature

['37°Hi', '27°Lo']

In [211]:
ReelFeel_Temperature

['RealFeel® Sun 40°', 'RealFeel® 28°']

In [212]:
# Keep only the last space-separated token of each entry (the temperature, e.g. '40°').
ReelFeel = [reading.split(' ')[-1] for reading in ReelFeel_Temperature]

In [213]:
ReelFeel

['40°', '28°']

In [214]:
ReelFeel_Temperature_Shade

['RealFeel Shade™ 36°', '']

In [215]:
# Keep only the last space-separated token of each entry; the empty night placeholder stays ''.
ReelFeel_Shade = [reading.split(' ')[-1] for reading in ReelFeel_Temperature_Shade]

In [216]:
ReelFeel_Shade

['36°', '']

In [217]:
Max_UV_Index

['12 Very Unhealthy', '']

In [218]:
# Split each Max_UV_Index entry (e.g. '12 Very Unhealthy') into the numeric index and its severity label.
# Both parts are cut at the first space, so the split does not depend on how many
# characters the index occupies; an empty entry yields '' for both lists.
UV_Index = []
UV_Index_Severity = []
for raw_uv in Max_UV_Index:
    uv_value, _, uv_severity = raw_uv.partition(' ')
    UV_Index.append(uv_value)
    UV_Index_Severity.append(uv_severity.lstrip(' '))

In [219]:
UV_Index

['12', '']

In [220]:
UV_Index_Severity

['Very Unhealthy', '']

In [221]:
Wind

['WSW 13 km/h', 'ENE 6 km/h']

In [222]:
Probability_of_Thunderstorms

['0%', '0%']

In [223]:
Cloud_Cover

['0%', '0%']

**Creating a dataframe out of the lists**

In [224]:
# Assemble the cleaned lists into one frame; each list becomes a column, in this order.
weather_columns = {
    'Date': Date,
    'Time': Time,
    'Temperature': Temperature,
    'ReelFeel': ReelFeel,
    'ReelFeel_Shade': ReelFeel_Shade,
    'UV_Index': UV_Index,
    'UV_Index_Severity': UV_Index_Severity,
    'Wind': Wind,
    'Probability_of_Thunderstorms': Probability_of_Thunderstorms,
    'Cloud_Cover': Cloud_Cover,
}
weather_df_day = pd.DataFrame(weather_columns)

In [225]:
weather_df_day

Unnamed: 0,Date,Time,Temperature,ReelFeel,ReelFeel_Shade,UV_Index,UV_Index_Severity,Wind,Probability_of_Thunderstorms,Cloud_Cover
0,"TUESDAY, 9 MAY",Day,37°Hi,40°,36°,12.0,Very Unhealthy,WSW 13 km/h,0%,0%
1,"TUESDAY, 9 MAY",Night,27°Lo,28°,,,,ENE 6 km/h,0%,0%


**Converting the Dataframe into CSV Format and Storing it into the Local Machine**

In [227]:
# Write the frame to a local CSV file, leaving out the row index.
weather_df_day.to_csv(path_or_buf=r"C:\Users\cesar\xxxx\weather_day6.csv", index=False)

# Sinking the Streamed CSV Files into a PostgreSQL table using COPY

#### **Connecting to PostgreSQL Database (<span style="color:blue">this section should run only once within the pipeline</span>)**

In [44]:
#Importing the necessary libraries in order to connect to the Database:
import psycopg2
from sqlalchemy import create_engine

In [4]:
# Open a psycopg2 connection to the local PostgreSQL server (connect_timeout set to 3).
connection_settings = {
    'host': '127.0.0.1',
    'port': 5432,
    'database': 'xxxx',
    'user': 'xxxx',
    'password': 'xxxxx',
    'connect_timeout': 3,
}
conn = psycopg2.connect(**connection_settings)

In [46]:
#Creating the engine that will be responsible to execute sql queries in the Database.
#The credentials are read from environment variables instead of being written into the notebook:
#PGPASSWORD is required (a KeyError is raised if it is not set); the other settings fall back
#to the values this notebook used before.
import os
from urllib.parse import quote_plus

pg_user = os.environ.get('PGUSER', 'postgres')
pg_password = os.environ['PGPASSWORD']
pg_host = os.environ.get('PGHOST', '127.0.0.1')
pg_port = os.environ.get('PGPORT', '5432')
pg_database = os.environ.get('PGDATABASE', 'dbcesar')

#User and password are URL-escaped so characters such as '@' or ':' cannot break the URL:
engine = create_engine(
    f'postgresql://{quote_plus(pg_user)}:{quote_plus(pg_password)}@{pg_host}:{pg_port}/{pg_database}'
)

In [48]:
# Let pandas create (or replace) public.daily_weather_streaming from the first scraped frame,
# so the table does not have to be created by hand in the database.
# NOTE(review): this also inserts the frame's rows; if the same data is later loaded from its
# CSV with COPY, those rows would appear twice - confirm this is intended.
weather_df_day.to_sql(
    name='daily_weather_streaming',
    con=engine,
    schema='public',
    if_exists='replace',
    index=False,
)

2

In [195]:
#Committing the changes to the Database.
#NOTE(review): `conn` is the psycopg2 connection, which is separate from `engine`; no statement
#in the visible cells is executed on `conn`, so this commit may have nothing pending - confirm.
conn.commit()

#### **Copying the streamed Dataframes one by one into the Database**

In [230]:
#Writing the dataframe/file into Postgres inside an explicit transaction.
#engine.begin() commits when the block exits normally and rolls back on an exception; it replaces
#Engine.execute() with the autocommit execution option, both of which were removed in SQLAlchemy 2.0.
from sqlalchemy.sql import text

#NOTE(review): COPY ... FROM '<file>' names a path on the database server side - confirm the
#server process can read this file.
copy_statement = text('''copy public.daily_weather_streaming ("Date", "Time", "Temperature", "ReelFeel", "ReelFeel_Shade", "UV_Index", "UV_Index_Severity", "Wind", "Probability_of_Thunderstorms", "Cloud_Cover")
                         from 'C:\\xxxx\\weather_day6.csv' CSV HEADER;''')

with engine.begin() as connection:
    write_df = connection.execute(copy_statement)