In [5]:

# Let's try to find the cheapest gas closest to where we live. We can screen scrape this data from Google Maps.
# In the process we will do some data cleaning: we will polish up messy addresses in a dataframe using Python and regular expressions 
# to extract relevant information.

# We will also do some geocoding: we will use geocoding to calculate distances from a home location ("4705 Center Blvd") to various cleaned 
# addresses using the Google Maps API.

# Finally, we will do some metric calculation: we will devise a metric to rank gas stations based on both distance and price. This involves
# normalizing prices, calculating a score combining distance and price, and ranking gas stations accordingly. The cheapest gas may be the closest
# to our home, but it almost certinly will NOT be. So, we will figure out if it's worth it to drive further to save a little more money.


In [3]:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import time
import regex as re
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
import pandas as pd
import googlemaps

# Set up Chrome WebDriver
driver = webdriver.Chrome()

# URL of Google Maps with gas prices.. The link looks a little weird however this is because we are sorting the results by 
# distance to get the closest stations by default. It makes no sense whatsoever to drive several miles further to save a penny on gas.
url = "https://www.google.com/maps/search/gas+prices/@40.7372786,-73.9701557,13z/data=!4m4!2m3!5m1!10e2!6e2?entry=ttu"

# Open the URL in the browser
driver.get(url)


# Wait for the page to load completely
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "bfdHYd")))

# Define the function to scroll down
def scroll_down(driver, scroll_times=10, pause_time=2):
    # Initialize ActionChains
    actions = ActionChains(driver)
    
    for _ in range(scroll_times):
        # Perform the scroll action
        actions.send_keys(Keys.PAGE_DOWN).perform()
        
        # Wait for new content to load
        time.sleep(pause_time)

# Scroll down to load more prices
scroll_down(driver, scroll_times=8)  # Adjust the number of scrolls as needed

# Get the page source after scrolling
page_source = driver.page_source


# Parse HTML with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')


# Find all gas station entries
gas_stations = soup.find_all('div', class_='bfdHYd')

data = []

# Function to extract addresses
def extract_address(elements):
    for element in elements:
        text = element.get_text(separator=" ", strip=True)
        if "Gas station" in text:
            address_parts = text.split("Gas station")
            if len(address_parts) > 1:
                address = address_parts[1].strip()
                address = " ".join(address.split(" · ")[:2]).strip()
                return address
    return "N/A"

# Iterate over each gas station entry
for station in gas_stations:
    try:
        # Extract gas station name
        name_element = station.find('div', class_='qBF1Pd')
        name = name_element.text.strip() if name_element else "N/A"

        # Extract gas station address
        address_elements = station.find_all('div', class_='W4Efsd')
        address = extract_address(address_elements)

        # Extract gas station price
        price_element = station.find('div', class_='ah5Ghc')
        price = price_element.find_next('span').text.strip() if price_element else "N/A"

        data.append([name, address, price])

    except Exception as e:
        print(f"Error: {e}")

driver.quit()


In [4]:

# Create a DataFrame
df = pd.DataFrame(data, columns=['Gas Station', 'Address', 'Price'])

# Let's drop NAs, because these gas stations don't have prices
print(df.shape)
df = df[df['Price'] != 'N/A']

# Also, clean some dirty data
df = df[df['Price'] != 'In-store shopping']
print(df.shape)


# Function to clean up the addresses
def clean_address(address):
    # Use regular expression to find the address part
    match = re.search(r'^[^a-zA-Z\d]*(.*?)(Open|Open now)', address)
    if match:
        return match.group(1).strip()
    else:
        return None
    

# Apply the cleaning function to the 'addresses' column
# Web data is notoriously dirty, so we need to do some cleaning at this point.
df['Cleaned_Address'] = df['Address'].apply(clean_address)
df


(40, 3)
(30, 3)


Unnamed: 0,Gas Station,Address,Price,Cleaned_Address
1,bp,· 2117 Jackson Ave Open 24 hours (718) 361-8848,$3.60,2117 Jackson Ave
2,bp,· 256 McGuinness Blvd Open 24 hours (718) 383-...,$3.60,256 McGuinness Blvd
4,Sonomax,· 278 Greenpoint Ave Open 24 hours (718) 389-9782,$3.62,278 Greenpoint Ave
5,Conoco,·  176 McGuinness Blvd Open 24 hours,$3.58,176 McGuinness Blvd
6,Global Gas 49-25 Van Dam,· 49-25 Van Dam St Open now (718) 392-2348,$3.50,49-25 Van Dam St
7,Citgo,· 36-20 Queens Blvd Open 24 hours (718) 729-9274,$3.60,36-20 Queens Blvd
11,Gulf,·  39-04 Skillman Ave Open 24 hours,$3.60,39-04 Skillman Ave
12,Citgo,· 3602 21st St Closes soon ⋅ 6 PM ⋅ Opens 9 AM...,$3.50,3602 21st St Closes soon ⋅ 6 PM ⋅
13,LUKOIL,·  34-02 31st St Open 24 hours,$3.60,34-02 31st St
15,Citgo,· 56-02 Broadway Open ⋅ Closes 8 PM (718) 478-...,$3.50,56-02 Broadway


In [5]:

# Initialize Google Maps API client
gmaps = googlemaps.Client(key='AIzaSyBzK5pYcr0W2C3TEdSWCE7AKCHSpFrGFqg')

# Function to geocode addresses and calculate distance
def calculate_distance(cleaned_address):
    try:
        # Geocode home address
        home_geocode = gmaps.geocode("4705 Center Blvd, Long Island City, NY 11109")

        # Geocode cleaned address
        cleaned_geocode = gmaps.geocode(cleaned_address + ", New York, USA")

        if home_geocode and cleaned_geocode:
            # Extract latitude and longitude
            home_coords = (home_geocode[0]['geometry']['location']['lat'], home_geocode[0]['geometry']['location']['lng'])
            cleaned_coords = (cleaned_geocode[0]['geometry']['location']['lat'], cleaned_geocode[0]['geometry']['location']['lng'])

            # Calculate distance
            return geodesic(home_coords, cleaned_coords).miles  # Distance in miles
        else:
            return None
    except Exception as e:
        print(f"Error geocoding address: {e}")
        return None


# Apply the function to calculate distance and create a new column
df['Distance_to_Home'] = df['Cleaned_Address'].apply(calculate_distance)

df=df.sort_values(by=['Price'], ascending=True)
df


Error geocoding address: unsupported operand type(s) for +: 'NoneType' and 'str'


Unnamed: 0,Gas Station,Address,Price,Cleaned_Address,Distance_to_Home
33,Conoco,·  4901 Northern Blvd Open ⋅ Closes 11 PM,$3.44,4901 Northern Blvd,2.346097
35,Gas Go Petroleum Inc,· 5035 Northern Blvd,$3.48,,
36,Mobil,· 50-92 Northern Blvd Open 24 hours (718) 205-...,$3.50,50-92 Northern Blvd,2.447081
6,Global Gas 49-25 Van Dam,· 49-25 Van Dam St Open now (718) 392-2348,$3.50,49-25 Van Dam St,1.249355
12,Citgo,· 3602 21st St Closes soon ⋅ 6 PM ⋅ Opens 9 AM...,$3.50,3602 21st St Closes soon ⋅ 6 PM ⋅,11.198756
15,Citgo,· 56-02 Broadway Open ⋅ Closes 8 PM (718) 478-...,$3.50,56-02 Broadway,9.616207
28,Citgo,· 3602 21st St Closes soon ⋅ 6 PM ⋅ Opens 9 AM...,$3.50,3602 21st St Closes soon ⋅ 6 PM ⋅,11.198756
32,CONOCO 24-Hr. Full-Service,· 451 Lorimer St Open 24 hours (718) 384-4880,$3.54,451 Lorimer St,2.469891
17,Husky Gas Station,· 3102 68th St Open 24 hours (718) 457-1003,$3.54,3102 68th St,3.103201
30,Mobil,· 49-21 Queens Blvd Open 24 hours (718) 457-2643,$3.56,49-21 Queens Blvd,2.176779


In [None]:

# Our findings show that some gas stations can be closer to us, but the gas is more expensive, or further away but 
# the gas is cheaper. So, let's answer the question of whether it is worth it to drive further to save some money on gas.

# To help you choose the best gas station based on both distance and price, you can calculate a combined metric 
# that balances these factors. One common approach is to use a weighted sum or a score that considers both distance 
# and price. 
    
# First we need to nNormalize the prices so that they are on the same scale as distances (which are typically 
# in miles). You can use min-max normalization for this purpose.

# Next, we need to define our metric for ranking gas stations (based on affordability and distance). One simple approach is to calculate 
# a score where lower scores indicate better options (cheaper price and closer distance).

# Finally, we will apply ranking/scoring and based on this metric to identify the best option.


In [10]:

# Normalize Price function
def normalize_price(price_str):
    return float(price_str.replace('$', ''))

# Calculate normalized price
df['Normalized_Price'] = df['Price'].apply(normalize_price)

# Define a metric (e.g., lower score is better)
# In this example, let's use a simple metric: Distance * Normalized Price
df['Score'] = df['Distance_to_Home'] * df['Normalized_Price']


# Rank gas stations by score (lower is better)
df = df.sort_values(by='Score')
display(df)


Unnamed: 0,Gas Station,Address,Price,Cleaned_Address,Distance_to_Home,Normalized_Price,Score
1,bp,· 2117 Jackson Ave Open 24 hours (718) 361-8848,$3.60,2117 Jackson Ave,0.450708,3.6,1.622547
2,bp,· 256 McGuinness Blvd Open 24 hours (718) 383-...,$3.60,256 McGuinness Blvd,1.068038,3.6,3.844938
4,Sonomax,· 278 Greenpoint Ave Open 24 hours (718) 389-9782,$3.62,278 Greenpoint Ave,1.154247,3.62,4.178373
6,Global Gas 49-25 Van Dam,· 49-25 Van Dam St Open now (718) 392-2348,$3.50,49-25 Van Dam St,1.249355,3.5,4.372743
5,Conoco,·  176 McGuinness Blvd Open 24 hours,$3.58,176 McGuinness Blvd,1.242762,3.58,4.449087
21,Quality Fuel & Go,· 3745 21st St Open 24 hours (845) 244-6601,$3.60,3745 21st St,1.241603,3.6,4.469771
26,bp,· 36-42 21st St Open 24 hours (718) 392-1183,$3.86,36-42 21st St,1.330875,3.86,5.137178
7,Citgo,· 36-20 Queens Blvd Open 24 hours (718) 729-9274,$3.60,36-20 Queens Blvd,1.483227,3.6,5.339616
22,Shell,· 3417 Northern Blvd Open 24 hours (917) 846-7195,$3.80,3417 Northern Blvd,1.489555,3.8,5.660311
20,Speedway,· 3902 Queens Blvd Gas-station convenience sto...,$3.60,3902 Queens Blvd Gas-station convenience store,1.616846,3.6,5.820645


In [None]:

# END!!! 
