In [1]:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select

from bs4 import BeautifulSoup
import requests
import time
import datetime as dt 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas

import pickle
import os
import sys
import warnings

# --- Notebook-wide display / warning settings ---
warnings.filterwarnings('ignore')           # hide every warning in cell output
pd.set_option('display.max_columns', 200)   # show up to 200 columns when displaying frames
plt.style.use('ggplot')

# --- Scraping / storage configuration ---
# Root folder under which the notebook reads and writes its pickles and CSV files.
path=r"C:\Users\berid\python\Shell Stations Project"

# Browser-like User-Agent header (not referenced by the cells visible in this notebook).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

In [2]:
from selenium.webdriver.edge.options import Options

# Launch options for Edge: run headless and switch image loading off.
edge_options = Options()
for edge_argument in ('--headless', "--blink-settings=imagesEnabled=false"):
    edge_options.add_argument(edge_argument)

# Single browser session shared by every scraping cell in this notebook.
driver = webdriver.Edge(options=edge_options)
driver.maximize_window()

In [3]:
# Entry page of the station locator; it lists one link per country.
# The host is `find.shell.com` (every country link printed by the next cell
# lives on that host), not `find.pshell.com`.
url='https://find.shell.com/'
driver.get(url)

In [4]:
# Map every country name to the URL of its locations page.
# Only the part of the link text before the first "(" is kept as the name.
country_elements=driver.find_elements(By.CSS_SELECTOR,'main[class="locations-list"] li[class="geographical-list-item"] a[class="geographical-list-item__link"]')
country_dict={
    link.get_attribute('textContent').split('(')[0].strip(): link.get_attribute('href')
    for link in country_elements
}
country_dict
    

{'Argentina': 'https://find.shell.com/ar/fuel/locations',
 'Australia': 'https://find.shell.com/au/fuel/locations',
 'Austria': 'https://find.shell.com/at/fuel/locations',
 'Belgium': 'https://find.shell.com/be/fuel/locations',
 'Botswana': 'https://find.shell.com/bw/fuel/locations',
 'Brazil': 'https://find.shell.com/br/fuel/locations',
 'Bulgaria': 'https://find.shell.com/bg/fuel/locations',
 'Canada': 'https://find.shell.com/ca/fuel/locations',
 'Colombia': 'https://find.shell.com/co/fuel/locations',
 'Czechia': 'https://find.shell.com/cz/fuel/locations',
 'Denmark': 'https://find.shell.com/dk/fuel/locations',
 'Dominican Republic': 'https://find.shell.com/do/fuel/locations',
 'Ecuador': 'https://find.shell.com/ec/fuel/locations',
 'El Salvador': 'https://find.shell.com/sv/fuel/locations',
 'France': 'https://find.shell.com/fr/fuel/locations',
 'Germany': 'https://find.shell.com/de/fuel/locations',
 'Honduras': 'https://find.shell.com/hn/fuel/locations',
 'Hong Kong': 'https://find.

In [5]:
def return_state_dict(country_url):
    """Collect the links of the geographical list shown on a country page.

    Loads `country_url` in the shared `driver` and reads every link of the
    locations list.

    Parameters
    ----------
    country_url : str
        URL of the page whose list entries should be collected.

    Returns
    -------
    dict
        Maps ``"<position>_<name>"`` to the link URL, where ``position`` is the
        1-based index of the link on the page (this makes every key unique)
        and ``name`` is the link text before the first "(".
    """
    state_dict={}

    driver.get(country_url)

    state_elements=driver.find_elements(By.CSS_SELECTOR,'main[class="locations-list"] li[class="geographical-list-item"] a[class="geographical-list-item__link"]')
    for i,state_element in enumerate(state_elements,start=1):
        try:
            state_url=state_element.get_attribute('href')
            state_name=state_element.get_attribute('textContent').split('(')[0].strip()
        except Exception:
            # Best effort: skip a link that cannot be read. `Exception` (rather
            # than a bare except) still lets KeyboardInterrupt stop a long crawl.
            continue
        state_dict[f'{i}_{state_name}']=state_url

    return state_dict

In [6]:
def return_city_dict(country_url):
    """Collect the city links of the geographical list shown on a page.

    Loads `country_url` in the shared `driver` and reads every link of the
    locations list.

    Parameters
    ----------
    country_url : str
        URL of the page listing cities. Despite the parameter name, the crawl
        loop passes either a country URL or a state URL here.

    Returns
    -------
    dict
        Maps ``"<position>_<name>"`` to the link URL, where ``position`` is the
        1-based index of the link on the page (this makes every key unique)
        and ``name`` is the link text before the first "(".
    """
    city_dict={}

    driver.get(country_url)

    city_elements=driver.find_elements(By.CSS_SELECTOR,'main[class="locations-list"] li[class="geographical-list-item"] a[class="geographical-list-item__link"]')
    for i,city_element in enumerate(city_elements,start=1):
        try:
            city_url=city_element.get_attribute('href')
            city_name=city_element.get_attribute('textContent').split('(')[0].strip()
        except Exception:
            # Best effort: skip a link that cannot be read. `Exception` (rather
            # than a bare except) still lets KeyboardInterrupt stop a long crawl.
            continue
        city_dict[f'{i}_{city_name}']=city_url

    return city_dict

In [7]:
def return_station_url(city_url):
    """Return the station-page URLs listed on a locations page.

    Loads `city_url` in the shared `driver` and reads the `href` of the first
    link inside every station list item.

    Parameters
    ----------
    city_url : str
        URL of a page that lists stations.

    Returns
    -------
    list
        One `href` value per station link, in page order. A link without an
        `href` contributes whatever `get_attribute` returns for it.
    """
    driver.get(city_url)

    station_elements=driver.find_elements(By.CSS_SELECTOR,'main[class="locations-list"] li[class="station-list-item"] a:nth-child(1)')
    station_urls=[]
    for station_element in station_elements:
        try:
            station_urls.append(station_element.get_attribute('href'))
        except Exception:
            # Best effort: skip a link that cannot be read. `Exception` (rather
            # than a bare except) still lets KeyboardInterrupt stop a long crawl.
            continue

    return station_urls

In [8]:
# Names that already have a saved "<name>_URLs" file in the country_urls folder.
# The name is the part of the file name before its first underscore.
scraped_countries=[]
for file in os.listdir(os.path.join(path,'country_urls')):
    if '_URLs' in file:
        scraped_countries.append(file.partition('_')[0])
scraped_countries

['all',
 'Argentina',
 'Australia',
 'Austria',
 'Belgium',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Canada',
 'Colombia',
 'Czechia',
 'Denmark',
 'Dominican Republic',
 'Ecuador',
 'El Salvador',
 'France',
 'Germany',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'India',
 'Indonesia',
 'Italy',
 'Kenya',
 'Luxembourg',
 'Macao',
 'Malaysia',
 'Mauritius',
 'Mexico',
 'Morocco',
 'Netherlands',
 'Norway',
 'Oman',
 'Pakistan',
 'Philippines',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Singapore',
 'Slovakia',
 'Slovenia',
 'South Africa',
 'Spain',
 'Switzerland',
 'Thailand',
 'Tunisia',
 'Türkiye',
 'Uganda',
 'United Kingdom',
 'United States',
 'USA1',
 'USA2']

In [9]:
# Crawl every country that has no saved URL file yet and store the station
# URLs found for it as "<country>_URLs.pickle" (a list of per-page URL lists).
all_station_urls=[]

for country_name, country_url in country_dict.items():
    # Already saved in an earlier run.
    if country_name in scraped_countries:
        continue

    driver.get(country_url)

    # The list heading tells which level the country page shows.
    # Read it once instead of querying the page for each comparison.
    heading=driver.find_element(By.CSS_SELECTOR,'main[class="locations-list"] h2[class="locations-list__heading"]').text.replace('\n','').strip()

    if heading=='States':
        # country -> states -> cities -> stations
        state_dict=return_state_dict(country_url)
        for state_name, state_url in state_dict.items():
            city_dict=return_city_dict(state_url)
            for city_name, city_url in city_dict.items():
                station_urls=return_station_url(city_url)
                all_station_urls.append(station_urls)

    elif heading=='Cities':
        # country -> cities -> stations
        city_dict=return_city_dict(country_url)
        for city_name, city_url in city_dict.items():
            station_urls=return_station_url(city_url)
            all_station_urls.append(station_urls)
    else:
        # Any other heading: read the station links straight from the country page.
        station_urls=return_station_url(country_url)
        all_station_urls.append(station_urls)

    # 'country_urls' is the folder name and must be a string literal; the bare
    # name `country_urls` is not defined and raised NameError here.
    with open(os.path.join(path,'country_urls',f'{country_name}_URLs.pickle'),'wb') as out_file:
        pickle.dump(all_station_urls,out_file)
    all_station_urls=[]
    print(country_name,end='\r')

In [10]:
# Merge every per-country "<name>_URLs" pickle into one de-duplicated list and
# save it as all_URLs.pickle.
collected_urls=set()

for file in os.listdir(os.path.join(path,'country_urls')):
    # Skip the merged file itself: it is a flat list of URL strings, so reading
    # it as a list of lists only walked every URL character by character.
    if '_URLs' in file and file!='all_URLs.pickle':
        # These pickles are written by this notebook; never load untrusted ones.
        with open(os.path.join(path,'country_urls',file),'rb') as in_file:
            url_lists=pickle.load(in_file)
        for url_list in url_lists:
            collected_urls.update(url_list)

# Keep only real links (drops None and anything else without 'https').
all_urls=[station_url for station_url in collected_urls if 'https' in str(station_url)]

with open(os.path.join(path,'country_urls','all_URLs.pickle'),'wb') as out_file:
    pickle.dump(all_urls,out_file)

len(all_urls)



37964

In [11]:
# Reload the merged URL list and sort it so later cells see a stable order.
# The file is closed as soon as it has been read.
with open(os.path.join(path,'country_urls','all_URLs.pickle'),'rb') as in_file:
    all_urls=sorted(pickle.load(in_file))

In [12]:
# Sanity check: number of station URLs after reloading the merged list.
len(all_urls)

37964

In [13]:
def _node_text(soup, selector, separator=None):
    """Return the text of the first element matching `selector`, or None if nothing matches."""
    node=soup.select_one(selector)
    if node is None:
        return None
    if separator is None:
        return node.text
    return node.get_text(separator=separator)


def _joined_text(soup, selector, joiner, separator=None):
    """Join the text of every element matching `selector` with `joiner` ('' if nothing matches)."""
    return joiner.join(
        node.text if separator is None else node.get_text(separator=separator)
        for node in soup.select(selector)
    )


def return_dict(url):
    """Scrape one station page into a flat record of raw text fields.

    Loads `url` in the shared `driver`, parses the page source and extracts
    each section as text.

    Parameters
    ----------
    url : str
        URL of a station page.

    Returns
    -------
    dict
        Keys: 'URL', 'Country, State, City', 'Fuels', 'EV', 'Hydrogen',
        'Opening Hours', 'Services', 'More At Location', 'About', 'Address'.
        Single-element fields are None when the element is missing from the
        page. Multi-element fields ('Fuels', 'Opening Hours', 'Services',
        'More At Location') are an empty string when nothing matches.
        Table rows are joined with 'New Line' and their cells with '|'.
    """
    driver.get(url)
    soup=BeautifulSoup(driver.page_source,'html.parser')

    station_record={
        'URL':url,
        'Country, State, City':_node_text(soup,'div[class="breadcrumbs__links"]',separator='|'),
        'Fuels':_joined_text(soup,'table[class="station-page-fuel-prices__table"] tr','New Line',separator='|'),
        'EV':_node_text(soup,'section[class="section-with-title station-page-ev-charging"]',separator='|'),
        'Hydrogen':_node_text(soup,'section[class="section-with-title section-with-title--hydrogen"]',separator='|'),
        'Opening Hours':_joined_text(soup,'table[class="opening-hours__table opening-hours__table--single"] tr','New Line',separator='|'),
        'Services':_joined_text(soup,'section[id="features"] li','|'),
        'More At Location':_joined_text(soup,'section[id="more_at_location"] li','|'),
        'About':_node_text(soup,'article[class="station-page-about__text"]'),
        'Address':_node_text(soup,'section[id="details"]'),
    }

    return station_record

In [14]:
# Gather the records of every pickle in the all_data folder whose name contains 'all_'.
all_stations_data = []
for file in os.listdir(os.path.join(path, 'all_data')):
    if 'all_' in file:
        # These pickles are written by this notebook; never load untrusted ones.
        with open(os.path.join(path, 'all_data', file), 'rb') as in_file:
            all_stations_data.extend(pickle.load(in_file))

# Keep only records that carry a non-empty 'Address'. `.get` treats a record
# without the key like an empty address instead of raising KeyError.
all_stations_data=[station for station in all_stations_data if station.get('Address')]


# Remove duplicates based on 'URL' (the first record seen for a URL is kept)
unique_urls = set()
filtered_data = []
for entry in all_stations_data:
    if 'URL' in entry and entry['URL'] not in unique_urls:
        filtered_data.append(entry)
        unique_urls.add(entry['URL'])

# Save the filtered data
with open(os.path.join(path, 'all_data', 'all_stations_data.pickle'), 'wb') as out_file:
    pickle.dump(filtered_data, out_file)

# Print the number of unique URLs scraped
scraped_urls = [entry['URL'] for entry in filtered_data]
print(f'Number of URLs scraped: {len(scraped_urls)}')


Number of URLs scraped: 37955


In [15]:
# URLs that still have no scraped record. Membership is tested against a set:
# scanning the `scraped_urls` list for each of tens of thousands of URLs is quadratic.
already_scraped=set(scraped_urls)
urls_to_be_scraped=[url for url in all_urls if url not in already_scraped]
len(urls_to_be_scraped)

9

In [16]:
# Scrape every outstanding URL and write a checkpoint file after each 1000
# URLs and once more after the final URL.
all_data=[]
failed_urls=[]   # URLs whose page could not be loaded or parsed in this run

for i,url in enumerate(urls_to_be_scraped,start=1):

    try:
        # Append directly; a top-level variable named `dict` would shadow the
        # builtin for the rest of the notebook session.
        all_data.append(return_dict(url))
    except Exception:
        # Best effort: remember the failure and keep going. There is no
        # `continue` here on purpose: the checkpoint below must still run when
        # the failing URL is a multiple of 1000 or the very last one, otherwise
        # the collected batch would never be written. Failed URLs are missing
        # from the checkpoint files, so the `urls_to_be_scraped` cell lists
        # them again on the next run.
        failed_urls.append(url)

    if i%1000==0 or i==len(urls_to_be_scraped):
        with open(os.path.join(path,'all_data',f'all_data_{i}'),'wb') as out_file:
            pickle.dump(all_data,out_file)
        all_data=[]

    print(f'URLs Scraped : {i}',end='\r')

URLs Scraped : 9

# Cleaning

In [17]:
# Build the working table from the de-duplicated station records and write an
# untouched CSV copy of it before any cleaning step changes the columns.
df=pd.DataFrame(filtered_data)
df.to_csv(os.path.join(path,'Shell_Gas_Stations (raw).csv'),index=False)

In [18]:
def _split_lat_lon(address):
    """Return the (lat, lon) text pieces that follow the 'Lat / Lng' label of an address.

    A non-string value is passed through unchanged for both positions. A string
    with no comma after the label yields NaN for the longitude instead of
    raising IndexError.
    """
    if not isinstance(address,str):
        return address,address
    pieces=address.split('Lat / Lng')[-1].split('|')[-1].split(',')
    longitude=pieces[1] if len(pieces)>1 else float('nan')
    return pieces[0],longitude


# Anything that is not a number after parsing becomes NaN (errors='coerce').
df['Lat']=df['Address'].apply(lambda x: _split_lat_lon(x)[0]).apply(pd.to_numeric,errors='coerce')
df['Lon']=df['Address'].apply(lambda x: _split_lat_lon(x)[1]).apply(pd.to_numeric,errors='coerce')

df=df.drop(columns='Address')

In [19]:
def _breadcrumb_fields(breadcrumbs):
    """Split a '|'-separated breadcrumb string into (country, state, city, station).

    - country: second segment, or None when there are fewer than two segments
    - state:   third segment of a five-segment breadcrumb, otherwise None
    - city:    fourth segment of a five-segment breadcrumb, third segment of a
               four-segment breadcrumb, otherwise None
    - station: last segment
    A non-string value yields None for every field.
    """
    if not isinstance(breadcrumbs,str):
        return None,None,None,None
    segments=[segment.strip() for segment in breadcrumbs.split('|')]
    country_part=segments[1] if len(segments)>1 else None
    if len(segments)==5:
        state_part,city_part=segments[2],segments[3]
    elif len(segments)==4:
        state_part,city_part=None,segments[2]
    else:
        state_part,city_part=None,None
    return country_part,state_part,city_part,segments[-1]


country=df['Country, State, City'].apply(lambda x:_breadcrumb_fields(x)[0])
state=df['Country, State, City'].apply(lambda x:_breadcrumb_fields(x)[1])
city=df['Country, State, City'].apply(lambda x:_breadcrumb_fields(x)[2])
station=df['Country, State, City'].apply(lambda x:_breadcrumb_fields(x)[3])

# Place the four new columns directly after the breadcrumb column, in this order.
breadcrumb_position=df.columns.get_loc('Country, State, City')
new_columns=[('Country',country),('State',state),('City',city),('Station',station)]
for offset,(column_name,column_values) in enumerate(new_columns,start=1):
    df.insert(breadcrumb_position+offset,column_name,column_values)

df=df.drop(columns='Country, State, City')

In [20]:
# Split each station's raw fuel text into its table rows ('New Line' separates
# rows). Every non-empty string becomes a list of rows, so a value holding a
# single row is parsed as one row rather than being walked character by
# character. Empty or non-string values (None/NaN) give an empty list.
fuels=df['Fuels'].apply(lambda x:x.split('New Line') if isinstance(x,str) and x else [])

# For each row: first '|' cell is the fuel name, last '|' cell is the price.
# Neutral loop names are used so the builtins `list` and `dict` stay intact
# for the other cells of the notebook.
new_fuel_series=[]
for fuel_rows in fuels:
    fuel_records=[]
    for fuel_row in fuel_rows:
        cells=fuel_row.split('|')
        fuel_records.append({'Fuel':cells[0].strip(),'Price':cells[-1].strip()})
    new_fuel_series.append(fuel_records)

df['Fuels']=new_fuel_series

In [21]:
def _ev_point_text(ev_text):
    """Pick the charging-point count out of the raw EV section text.

    Returns 0 when the text carries the "no information" notice, the first
    space-delimited token of the second '|' segment for any other truthy
    value, and None for falsy values.
    """
    if 'No EV charging information available' in str(ev_text):
        return 0
    if ev_text:
        return ev_text.split('|')[1].split(' ')[0].strip()
    return None


# Convert the extracted tokens to numbers; anything non-numeric becomes NaN.
ev_pints=df['EV'].apply(_ev_point_text).apply(pd.to_numeric,errors='coerce')

# Put the numeric column right after the raw one, then drop the raw text.
ev_position=df.columns.get_loc('EV')+1
df.insert(ev_position,'EV Charging Points',ev_pints)
df=df.drop(columns='EV')


In [22]:
def _hydrogen_flag(section_text):
    """Return 'No' when the text carries the "no information" notice, otherwise 'Yes'."""
    if 'No Hydrogen information available' in str(section_text):
        return 'No'
    return 'Yes'


# NOTE(review): a missing section (None/NaN) also maps to 'Yes', because only
# the notice text produces 'No' — confirm this is intended.
df['Hydrogen']=df['Hydrogen'].apply(_hydrogen_flag)

In [23]:
from datetime import datetime, timedelta
time_format = "%H:%M"


def _open_duration_hours(time_range):
    """Return the whole hours between the two 'HH:MM' times of 'start - end'.

    An end time earlier than the start time is treated as falling on the next
    day. Returns None when the text has no '-' or a time does not match
    `time_format`.
    """
    bounds=time_range.split('-')
    try:
        start_hour=datetime.strptime(bounds[0].strip(), time_format)
        end_hour=datetime.strptime(bounds[1].strip(), time_format)
    except (ValueError,IndexError):
        return None

    if end_hour < start_hour:
        end_hour += timedelta(days=1)
    return round((end_hour - start_hour).total_seconds() / 3600)


# Rows are separated by 'New Line' and the first row is always dropped. The
# same expression is used for every string, so a value with a single row gives
# an empty list instead of a string that would be walked character by
# character. Non-string values (None/NaN) also give an empty list.
hours=df['Opening Hours'].apply(lambda x:x.replace('Forecourt','').split('New Line')[1:] if isinstance(x,str) else [])

# For each row: first '|' cell is the weekday, last '|' cell is the time range.
# Neutral loop names keep the builtins `list` and `dict` (and the `hours`
# Series being iterated) from being overwritten.
new_hour_series=[]
for day_rows in hours:
    day_records=[]
    for day_row in day_rows:
        cells=day_row.split('|')
        day_records.append({'Weekday':cells[0].strip(),'Hours':_open_duration_hours(cells[-1].strip())})

    new_hour_series.append(day_records)

df['Opening Hours']=new_hour_series

In [24]:
def _sorted_service_names(services_text):
    """Trim every '|'-separated name and return them re-joined in sorted order."""
    names=[name.strip() for name in services_text.split('|')]
    names.sort()
    return '|'.join(names)


df['Services']=df['Services'].apply(_sorted_service_names)

In [25]:
def _sorted_location_extras(extras_text):
    """Trim every '|'-separated entry and return them re-joined in sorted order."""
    entries=[entry.strip() for entry in extras_text.split('|')]
    entries.sort()
    return '|'.join(entries)


df['More At Location']=df['More At Location'].apply(_sorted_location_extras)

In [26]:
# The free-text 'About' column is not kept in the cleaned table.
df=df.drop('About',axis=1)

In [27]:
#df.to_csv(os.path.join(path,'Shell_Gas_Stations (cleaned).csv'),index=False)

In [28]:
# Reload the cleaned table from the project folder, the same location the
# (commented-out) export in the previous cell writes to, so the read does not
# depend on the kernel's current working directory.
df=pd.read_csv(os.path.join(path,'Shell_Gas_Stations (cleaned).csv'))