In [None]:
pip install geopandas


In [None]:
pip install scikit-learn

In [None]:
pip install geopy


In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
import ipywidgets as widgets
from IPython.display import display
import folium
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster


In [13]:
import io

import pandas as pd

# Load the dataset from the Final Datasets folder
file_path = 'Final Datasets/N1_Cleaned_fueldata.csv'
data = pd.read_csv(file_path)

# DataFrame.info() prints its report and returns None, so the report is written
# to a buffer; data_info then holds the summary text instead of None.
_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
data_info = _info_buffer.getvalue()
data_head = data.head()

# Show both results explicitly: a bare expression is only rendered when it is
# the last statement of a cell, which it was not here.
print(data_info)
display(data_head)

# Putting data into another variable.
# NOTE: this is an alias, not a copy. In-place changes made through
# data_cleaned (such as the PUBLISH_DATE conversion) are visible through data.
data_cleaned = data


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 972174 entries, 0 to 972173
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   PUBLISH_DATE         972174 non-null  object 
 1   TRADING_NAME         972174 non-null  object 
 2   BRAND_DESCRIPTION    972174 non-null  object 
 3   PRODUCT_DESCRIPTION  972174 non-null  object 
 4   PRODUCT_PRICE        972174 non-null  float64
 5   ADDRESS              972174 non-null  object 
 6   LOCATION             972174 non-null  object 
 7   POSTCODE             972174 non-null  int64  
 8   AREA_DESCRIPTION     972174 non-null  object 
 9   REGION_DESCRIPTION   972174 non-null  object 
 10  latitude             972174 non-null  float64
 11  longitude            972174 non-null  float64
dtypes: float64(3), int64(1), object(8)
memory usage: 89.0+ MB


# Cluster Analysis
### Hypothesis: More fuel stations in one area will lead to higher fuel prices – Accepted

In [14]:
# Convert 'PUBLISH_DATE' to datetime format; unparseable values become NaT
data_cleaned['PUBLISH_DATE'] = pd.to_datetime(data_cleaned['PUBLISH_DATE'], errors='coerce')

# Prepare the dropdown options for years (including 'All Years').
# errors='coerce' can introduce NaT, whose year is NaN. A single NaN would turn
# every year into a float and make sorted() unreliable, so missing years are
# dropped and the remainder cast back to int before building the option list.
publish_years = data_cleaned['PUBLISH_DATE'].dt.year
years = sorted(publish_years.dropna().astype(int).unique().tolist())
years.append('All Years')

# Filter data based on 'AREA_DESCRIPTION' column
north_data = data_cleaned[data_cleaned['AREA_DESCRIPTION'] == 'North of River']
south_data = data_cleaned[data_cleaned['AREA_DESCRIPTION'] == 'South of River']
east_data = data_cleaned[data_cleaned['AREA_DESCRIPTION'] == 'East/Hills']

# Group data by latitude, longitude, and year: each table holds one row per
# coordinate pair per year with the mean PRODUCT_PRICE, and the year is stored
# in a column named 'PUBLISH_DATE'.
north_location_prices = north_data.groupby(['latitude', 'longitude', north_data['PUBLISH_DATE'].dt.year])['PRODUCT_PRICE'].mean().reset_index()
south_location_prices = south_data.groupby(['latitude', 'longitude', south_data['PUBLISH_DATE'].dt.year])['PRODUCT_PRICE'].mean().reset_index()
east_location_prices = east_data.groupby(['latitude', 'longitude', east_data['PUBLISH_DATE'].dt.year])['PRODUCT_PRICE'].mean().reset_index()

# Function to create clustering map for a region and year
def generate_cluster_map(region, selected_year):
    """Build, save and return a folium marker-cluster map of mean prices.

    Parameters
    ----------
    region : str
        'North of River' or 'South of River'; any other value selects the
        East/Hills price table.
    selected_year : int or str
        A year value from the 'PUBLISH_DATE' column of the per-region price
        tables, or the string 'All Years'.

    Returns
    -------
    folium.Map
        Map centred on Perth with one marker per coordinate pair. A marker is
        red when that location's mean price is above the mean over all plotted
        locations, and green otherwise.

    Side effects
    ------------
    Creates the 'Heat Map Data' directory if needed and writes the map to
    'Heat Map Data/<region>_heatmap_<selected_year>.html', with path separators
    in the region name replaced by '_'.
    """
    from pathlib import Path  # local import keeps this fix self-contained

    # Unknown regions fall back to East/Hills, as the original else-branch did.
    location_prices = {
        'North of River': north_location_prices,
        'South of River': south_location_prices,
    }.get(region, east_location_prices)

    if selected_year != 'All Years':
        location_prices = location_prices[location_prices['PUBLISH_DATE'] == selected_year]
    else:
        # The price tables hold one row per location per year. Plotting them
        # unchanged stacks several markers on every location and colours them
        # against a multi-year mean, so the colour mostly reflects the year.
        # Collapse to one row per location (mean of its yearly means) so the
        # colour compares locations with each other.
        location_prices = (
            location_prices
            .groupby(['latitude', 'longitude'], as_index=False)['PRODUCT_PRICE']
            .mean()
        )

    # Reference price for the colouring: mean over the rows being plotted.
    avg_price = location_prices['PRODUCT_PRICE'].mean()

    # Create a map centered on Perth
    region_map = folium.Map(location=[-31.9505, 115.8605], zoom_start=10)

    # Add a marker cluster to group nearby stations
    marker_cluster = MarkerCluster().add_to(region_map)

    # Iterating the three columns directly avoids building a Series per row,
    # which is what iterrows() does.
    coordinates_and_prices = zip(
        location_prices['latitude'],
        location_prices['longitude'],
        location_prices['PRODUCT_PRICE'],
    )
    for latitude, longitude, price in coordinates_and_prices:
        folium.Marker(
            location=[latitude, longitude],
            popup=f"Price: {price} cents",
            icon=folium.Icon(color='red' if price > avg_price else 'green')
        ).add_to(marker_cluster)

    # 'East/Hills' contains a path separator; used verbatim it would point into
    # a 'Heat Map Data/East/' sub-directory, so separators are replaced.
    output_dir = Path("Heat Map Data")
    output_dir.mkdir(parents=True, exist_ok=True)
    safe_region = region.replace('/', '_').replace('\\', '_')
    # NOTE(review): generate_heatmap saves under the same '<region>_heatmap_<year>'
    # name, so whichever map is written last overwrites the other. Confirm
    # whether the cluster map should get its own suffix.
    region_safe = output_dir / f"{safe_region}_heatmap_{selected_year}.html"
    region_map.save(str(region_safe))

    return region_map

# Create dropdowns for region and year selection
region_dropdown = widgets.Dropdown(
    description='Select Region:',
    options=['North of River', 'South of River', 'East/Hills'],
    value='North of River',
    disabled=False,
)

year_dropdown = widgets.Dropdown(
    description='Select Year:',
    options=years,
    value='All Years',
    disabled=False,
)

# Render both selectors, region first
display(region_dropdown, year_dropdown)

# Container the cluster map is drawn into
output = widgets.Output()

# Define the callback function to update the map based on dropdown selection
def update_cluster_map(change, region_widget=region_dropdown, year_widget=year_dropdown, out=output):
    """Redraw the cluster map after either dropdown changes.

    Parameters
    ----------
    change : dict
        Change notification passed by ``observe``; its content is not used.
    region_widget, year_widget, out : widgets, optional
        Dropdowns to read and Output area to draw into. They default to the
        widgets that exist when this function is defined. Binding them here
        matters because the heat-map cell reassigns the global names
        ``region_dropdown``, ``year_dropdown`` and ``output``; looking those
        names up at call time would read from and draw into the heat-map
        widgets instead.
    """
    with out:
        # wait=True keeps the previous map until the new one is ready, which
        # avoids the area flashing empty while the map is being built.
        out.clear_output(wait=True)
        region_map = generate_cluster_map(region_widget.value, year_widget.value)
        display(region_map)

# Attach the callback to the dropdowns
for _dropdown in (region_dropdown, year_dropdown):
    # Only 'value' changes should trigger a redraw
    _dropdown.observe(update_cluster_map, names='value')

# Render the default selection once so the output area is not empty on load
with output:
    display(generate_cluster_map('North of River', 'All Years'))

display(output)


Dropdown(description='Select Region:', options=('North of River', 'South of River', 'East/Hills'), value='Nort…

Dropdown(description='Select Year:', index=5, options=(2020, 2021, 2022, 2023, 2024, 'All Years'), value='All …

Output()

# Heat Map of Areas

In [None]:

# Convert 'PUBLISH_DATE' to datetime format; unparseable values become NaT
data_cleaned['PUBLISH_DATE'] = pd.to_datetime(data_cleaned['PUBLISH_DATE'], errors='coerce')

# Prepare the dropdown options for years (including 'All Years').
# errors='coerce' can introduce NaT, whose year is NaN. A single NaN would turn
# every year into a float and make sorted() unreliable, so missing years are
# dropped and the remainder cast back to int before building the option list.
publish_years = data_cleaned['PUBLISH_DATE'].dt.year
years = sorted(publish_years.dropna().astype(int).unique().tolist())
years.append('All Years')

# Filter data based on 'AREA_DESCRIPTION' column
north_data = data_cleaned[data_cleaned['AREA_DESCRIPTION'] == 'North of River']
south_data = data_cleaned[data_cleaned['AREA_DESCRIPTION'] == 'South of River']
east_data = data_cleaned[data_cleaned['AREA_DESCRIPTION'] == 'East/Hills']

# Group data by latitude, longitude, and year: each table holds one row per
# coordinate pair per year with the mean PRODUCT_PRICE, and the year is stored
# in a column named 'PUBLISH_DATE'.
north_location_prices = north_data.groupby(['latitude', 'longitude', north_data['PUBLISH_DATE'].dt.year])['PRODUCT_PRICE'].mean().reset_index()
south_location_prices = south_data.groupby(['latitude', 'longitude', south_data['PUBLISH_DATE'].dt.year])['PRODUCT_PRICE'].mean().reset_index()
east_location_prices = east_data.groupby(['latitude', 'longitude', east_data['PUBLISH_DATE'].dt.year])['PRODUCT_PRICE'].mean().reset_index()

# Function to generate heatmap for a region and year
def generate_heatmap(region, selected_year):
    """Build, save and return a folium heat map of station locations.

    Parameters
    ----------
    region : str
        'North of River' or 'South of River'; any other value selects the
        East/Hills price table.
    selected_year : int or str
        A year value from the 'PUBLISH_DATE' column of the per-region price
        tables, or the string 'All Years'.

    Returns
    -------
    folium.Map
        Map centred on Perth with a HeatMap layer built from the distinct
        latitude/longitude pairs of the selected rows. Prices are not used.

    Side effects
    ------------
    Creates the 'Heat Map Data' directory if needed and writes the map to
    'Heat Map Data/<region>_heatmap_<selected_year>.html', with path separators
    in the region name replaced by '_'.
    """
    from pathlib import Path  # local import keeps this fix self-contained

    # Unknown regions fall back to East/Hills, as the original else-branch did.
    location_prices = {
        'North of River': north_location_prices,
        'South of River': south_location_prices,
    }.get(region, east_location_prices)

    # Filter data for the selected year or use all years
    if selected_year != 'All Years':
        location_prices = location_prices[location_prices['PUBLISH_DATE'] == selected_year]

    # Create a map centered on Perth
    region_map = folium.Map(location=[-31.9505, 115.8605], zoom_start=10)

    # The price tables hold one row per location per year, so in 'All Years' a
    # location would contribute one heat point per year of data. De-duplicating
    # makes the heat reflect where stations are, not how long they reported.
    # For a single year the pairs are already unique, so this is a no-op.
    heat_data = (
        location_prices[['latitude', 'longitude']]
        .drop_duplicates()
        .values
        .tolist()
    )

    # Add heatmap layer
    HeatMap(heat_data).add_to(region_map)

    # 'East/Hills' contains a path separator; used verbatim it would point into
    # a 'Heat Map Data/East/' sub-directory, so separators are replaced.
    output_dir = Path("Heat Map Data")
    output_dir.mkdir(parents=True, exist_ok=True)
    safe_region = region.replace('/', '_').replace('\\', '_')
    region_safe = output_dir / f"{safe_region}_heatmap_{selected_year}.html"
    region_map.save(str(region_safe))

    return region_map

# Create dropdowns for region and year selection
region_dropdown = widgets.Dropdown(
    description='Select Region:',
    options=['North of River', 'South of River', 'East/Hills'],
    value='North of River',
    disabled=False,
)

year_dropdown = widgets.Dropdown(
    description='Select Year:',
    options=years,
    value='All Years',
    disabled=False,
)

# Render both selectors, region first
display(region_dropdown, year_dropdown)

# Container the heat map is drawn into
output = widgets.Output()

# Define the callback function to update the heatmap based on dropdown selection
def update_heatmap(change, region_widget=region_dropdown, year_widget=year_dropdown, out=output):
    """Redraw the heat map after either dropdown changes.

    Parameters
    ----------
    change : dict
        Change notification passed by ``observe``; its content is not used.
    region_widget, year_widget, out : widgets, optional
        Dropdowns to read and Output area to draw into. They default to the
        widgets that exist when this function is defined. Binding them here
        matters because the cluster-analysis cell assigns the same global
        names (``region_dropdown``, ``year_dropdown``, ``output``); if that
        cell is re-run, a call-time lookup would pick up its widgets instead.
    """
    with out:
        # wait=True keeps the previous map until the new one is ready, which
        # avoids the area flashing empty while the map is being built.
        out.clear_output(wait=True)
        region_map = generate_heatmap(region_widget.value, year_widget.value)
        display(region_map)

# Attach the callback to the dropdowns
for _dropdown in (region_dropdown, year_dropdown):
    # Only 'value' changes should trigger a redraw
    _dropdown.observe(update_heatmap, names='value')

# Render the default selection once so the output area is not empty on load
with output:
    display(generate_heatmap('North of River', 'All Years'))

display(output)


# Station Density vs Price Correlation Analysis

In [15]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd

# Convert latitude and longitude to a GeoDataFrame
def create_geodataframe(region_data):
    """Return ``region_data`` as a point GeoDataFrame in an estimated UTM CRS.

    Parameters
    ----------
    region_data : pandas.DataFrame
        Must contain 'longitude' and 'latitude' columns; they are interpreted
        as WGS84 coordinates (EPSG:4326).

    Returns
    -------
    geopandas.GeoDataFrame
        One point per input row, reprojected to the UTM zone chosen by
        ``estimate_utm_crs()`` so that lengths and areas are in metres.
    """
    # points_from_xy builds the geometry array in one vectorised call instead
    # of creating one shapely Point per row in a Python loop. Passing crs to
    # the constructor replaces the in-place set_crs call.
    gdf = gpd.GeoDataFrame(
        region_data,
        geometry=gpd.points_from_xy(region_data['longitude'], region_data['latitude']),
        crs='EPSG:4326',
    )

    # Convert to UTM for accurate area calculations (automatic zone detection)
    return gdf.to_crs(gdf.estimate_utm_crs())

# Calculate the area using the convex hull (in square kilometers)
def calculate_region_area(gdf):
    """Return the convex-hull area of all geometries in ``gdf`` in km².

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Geometries to enclose. The area is divided by 1e6, so the frame is
        expected to be in a metre-based CRS (create_geodataframe reprojects
        to UTM).

    Returns
    -------
    float
        Area of the convex hull around the union of the geometries, in km².
    """
    # GeoPandas 1.0 deprecated the `unary_union` property in favour of the
    # `union_all()` method. Prefer the new API and fall back to the old one so
    # the function works on both sides of that change.
    if hasattr(gdf, 'union_all'):
        merged = gdf.union_all()
    else:
        merged = gdf.unary_union

    convex_hull = merged.convex_hull  # Create the convex hull around the points
    area_sq_km = convex_hull.area / 1e6  # Convert area from square meters to square kilometers
    return area_sq_km

# Create GeoDataFrames for each region
north_gdf, south_gdf, east_gdf = (
    create_geodataframe(region_frame)
    for region_frame in (north_data, south_data, east_data)
)

# Convex-hull area (km²) of each region's station points
north_area, south_area, east_area = (
    calculate_region_area(region_gdf)
    for region_gdf in (north_gdf, south_gdf, east_gdf)
)

# Lookup used by the density calculation, keyed by AREA_DESCRIPTION value
region_areas = {
    'North of River': north_area,
    'South of River': south_area,
    'East/Hills': east_area
}

print(f"Exact area of North of River: {north_area:.2f} km²")
print(f"Exact area of South of River: {south_area:.2f} km²")
print(f"Exact area of East/Hills: {east_area:.2f} km²")

# Calculate station density (number of stations per square km) and average price for each region
def calculate_density_and_avg_price(region_data, region_name, areas=None):
    """Return station density and mean price for one region.

    Parameters
    ----------
    region_data : pandas.DataFrame
        Rows for the region; needs 'latitude', 'longitude' and
        'PRODUCT_PRICE' columns.
    region_name : str
        Key used to look up the region's area.
    areas : mapping of str to float, optional
        Region name -> area in km². Defaults to the module-level
        ``region_areas`` dict, so existing two-argument calls are unchanged.
        Passing it explicitly removes the hidden dependency on that global.

    Returns
    -------
    tuple of (float, float)
        ``(density, avg_price)``: distinct latitude/longitude pairs per km²,
        and the mean of 'PRODUCT_PRICE' over all rows.

    Raises
    ------
    KeyError
        If ``region_name`` is not a key of the area mapping.
    """
    if areas is None:
        areas = region_areas

    # A station is identified by its coordinate pair, so repeated price
    # records for the same location count once.
    num_stations = len(region_data[['latitude', 'longitude']].drop_duplicates())
    density = num_stations / areas[region_name]  # Stations per square kilometer
    avg_price = region_data['PRODUCT_PRICE'].mean()
    return density, avg_price

# Calculate for each region
(
    (north_density, north_avg_price),
    (south_density, south_avg_price),
    (east_density, east_avg_price),
) = (
    calculate_density_and_avg_price(region_frame, region_label)
    for region_frame, region_label in (
        (north_data, 'North of River'),
        (south_data, 'South of River'),
        (east_data, 'East/Hills'),
    )
)

# One row per region: station density next to the region's mean price
df_density_price = pd.DataFrame(
    [
        ('North of River', north_density, north_avg_price),
        ('South of River', south_density, south_avg_price),
        ('East/Hills', east_density, east_avg_price),
    ],
    columns=['Region', 'Density', 'Avg_Price'],
)

# Pearson correlation across the three regional rows
density_column = df_density_price['Density']
correlation = density_column.corr(df_density_price['Avg_Price'])
print(f"Correlation between station density and average price: {correlation}")


  convex_hull = gdf.unary_union.convex_hull  # Create the convex hull around the points


Exact area of North of River: 1206.27 km²
Exact area of South of River: 1422.59 km²
Exact area of East/Hills: 1050.81 km²
Correlation between station density and average price: 0.8637151251361079


Positive Correlation:

A correlation close to 1 suggests a strong relationship between higher station density and higher prices. Note, however, that this coefficient is computed from only three regional data points, so it indicates a direction rather than a statistically reliable result.
This runs counter to what many might expect, as more stations typically suggest competition, which should drive prices down. However, in this case, the data suggests the opposite: areas with more stations tend to have higher prices.


Price Control: 
In regions with high fuel station density, stations might be clustering in high-demand areas where consumers are less price-sensitive, allowing them to keep prices higher despite more competition.

High Traffic Areas: 
These regions might be more urban or have higher traffic, where demand remains high, regardless of how many stations there are. Consequently, stations may feel less pressure to lower prices.

Operational Costs: 
Urban or dense areas might also have higher operational costs (e.g., rent, labor), which could be passed on to consumers in the form of higher fuel prices.

Local Factors: 
There may be local regulations, taxes, or other constraints in more densely populated areas that contribute to higher prices.


In [16]:
import pandas as pd
from scipy.stats import ttest_ind

# Relies on df_density_price from the density/price correlation cell.
# The median regional density is the cut-off between the two classes.
density_threshold = df_density_price['Density'].median()

# Strictly above the cut-off -> 'High-Density', everything else -> 'Low-Density'
df_density_price['Density_Class'] = (
    df_density_price['Density'] > density_threshold
).map({True: 'High-Density', False: 'Low-Density'})

# Regional average prices, split by class
high_density_prices = df_density_price.loc[
    df_density_price['Density_Class'] == 'High-Density', 'Avg_Price'
]
low_density_prices = df_density_price.loc[
    df_density_price['Density_Class'] == 'Low-Density', 'Avg_Price'
]

# Mean of the regional averages per class
high_density_mean = high_density_prices.mean()
low_density_mean = low_density_prices.mean()

print(f"High-Density Avg Price: {high_density_mean:.2f} cents per litre")
print(f"Low-Density Avg Price: {low_density_mean:.2f} cents per litre")

# Independent two-sample t-test on the two groups of regional averages
t_stat, p_value = ttest_ind(high_density_prices, low_density_prices)
print(f"T-statistic: {t_stat}, P-value: {p_value}")


High-Density Avg Price: 162.24 cents per litre
Low-Density Avg Price: 161.34 cents per litre
T-statistic: 1.8012639448622718, P-value: 0.3226392597319173


# Yearly Price Dispersion 

In [17]:
import pandas as pd

# Make sure PUBLISH_DATE is a datetime column (written back to data_cleaned)
data_cleaned['PUBLISH_DATE'] = pd.to_datetime(data_cleaned['PUBLISH_DATE'])

# Per (year, area): spread and mean of the price, plus the number of distinct
# trading names. Named aggregation yields the final column names directly
# instead of renaming flattened MultiIndex columns by position.
publish_year = data_cleaned['PUBLISH_DATE'].dt.year.rename('Year')
price_variation_yearly = (
    data_cleaned
    .groupby([publish_year, 'AREA_DESCRIPTION'])
    .agg(
        Price_Std_Dev=('PRODUCT_PRICE', 'std'),
        Avg_Price=('PRODUCT_PRICE', 'mean'),
        Num_Stations=('TRADING_NAME', 'nunique'),  # unique stations by name
    )
    .reset_index()
)

# Order the rows chronologically (the sort key is the year, not the station count)
price_variation_yearly = price_variation_yearly.sort_values('Year', ascending=True)

# Show the yearly table
price_variation_yearly


Unnamed: 0,Year,AREA_DESCRIPTION,Price_Std_Dev,Avg_Price,Num_Stations
0,2020,East/Hills,16.306107,118.432711,54
1,2020,North of River,16.94611,119.486999,166
2,2020,South of River,16.802103,118.965935,190
3,2021,East/Hills,17.832225,145.129224,59
4,2021,North of River,18.475315,146.037248,198
5,2021,South of River,18.326949,145.465984,221
6,2022,East/Hills,17.468233,180.375608,58
7,2022,North of River,17.838685,180.486285,187
8,2022,South of River,17.887519,179.572958,223
9,2023,East/Hills,14.733387,183.402794,68
