# Fast Food Franchise Location Prediction in Sydney Using Geospatial and Machine Learning Analysis
"Identifying High-Potential Suburbs for New Outlets Based on Population Density, Competition, and Predictive Modeling"

In [None]:
import osmnx as ox

import pandas as pd
import geopandas as gpd
import os

from geopy.geocoders import Nominatim
import time

In [None]:

def is_kaggle():
    """Return True when the Kaggle input directory exists on this machine."""
    return os.path.exists("/kaggle/input")


# Base directory for the input data sets.
# Neither branch carries a trailing slash: later cells build file names as
# path + "/<file>", so a trailing slash here would produce a double "//".
path = "/kaggle/input" if is_kaggle() else "data"

**Cell 1:** Import all required libraries for geospatial analysis, data manipulation, and geocoding, then set the input data path for Kaggle or local runs.

In [None]:

# Silence OSMnx console logging so the download does not flood the cell output.
ox.settings.log_console = False

# OSM tag filter handed to OSMnx: the fast_food amenity plus three chain names.
# NOTE(review): OSMnx documents a multi-key tag dict as a union (a feature
# matching any single tag is returned) - confirm that is the intended query.
tags = {
    "amenity": "fast_food",
    "name": ["McDonald's", "KFC", "Subway"],
}

# Download the matching features for the Sydney place boundary.
gdf = ox.features_from_place(
    "Sydney, Australia",
    tags=tags,
)








**Cell 2:** Configure OSMnx settings, define tags for fast food venues, and download fast food locations in Sydney.

In [None]:
# Attributes used by the rest of the notebook; every other OSM column is dropped.
columns_to_keep = [
    'name',
    'geometry',
    'brand',
    'branch',
    'addr:street',
]
df = gdf[columns_to_keep]


**Cell 3:** Select and keep only relevant columns from the downloaded fast food data.

In [None]:
# # Visualize fast food locations on an interactive map

# df.explore(
#     column="name",  # Color points by name
#     tooltip=["name", "brand", "addr:street"],  # Show info on hover
#     marker_kwds={"radius": 6},  # Marker size
#     style_kwds={"fillOpacity": 0.7},
#     height=500,
#     width=800
# )

**Cell 4:** (Commented out) Example code for visualizing fast food locations on an interactive map.

In [None]:
# Franchise brand to analyse. This can be changed to any franchise brand
# that appears in the 'brand' column.
brand="McDonald's"

# Preview the data. A notebook only renders the value of a cell's last
# expression, so the preview has to come after the assignment above.
df.head()

**Cell 5:** Preview the fast food DataFrame and set the franchise brand to analyze (e.g., McDonald's).

In [None]:
# Distinct values of the 'brand' column, shown as a plain Python list.
brand_values = df['brand'].unique()
brand_values.tolist()


**Cell 6:** List all unique fast food brands present in the data.

In [None]:
# Keep only the rows whose 'brand' equals the franchise chosen above.
is_selected_brand = df['brand'] == brand
test_filtered = df[is_selected_brand]
test_filtered.head()

In [None]:
# Step 1: Centroids are only meaningful in a projected CRS, so leave a
# geographic (lat/lon) CRS for EPSG:3857 before computing them.
if test_filtered.crs.is_geographic:
    test_filtered = test_filtered.to_crs(epsg=3857)

# Step 2: One centroid per venue geometry, stored next to the original shape.
venue_centroids = test_filtered.geometry.centroid
test_filtered["geometry_centroid"] = venue_centroids

# Optional Step 3: The same centroids expressed as lat/lon (EPSG:4326).
centroids_latlon = test_filtered["geometry_centroid"].to_crs(epsg=4326)
test_filtered["geometry_centroid_latlon"] = centroids_latlon

In [None]:
# Load the SA2 (2021, GDA2020) boundary shapefile from the data directory.
sa2_shapefile = path+"/SA2_2021_AUST_SHP_GDA2020/SA2_2021_AUST_GDA2020.shp"
sa2_gdf = gpd.read_file(sa2_shapefile)

# Narrow the national layer to New South Wales ...
in_nsw = sa2_gdf['STE_NAME21'] == 'New South Wales'
sa2_nsw = sa2_gdf[in_nsw]

# ... and then to the Greater Sydney capital-city area.
in_greater_sydney = sa2_nsw['GCC_NAME21'] == 'Greater Sydney'
sa2 = sa2_nsw[in_greater_sydney]


In [None]:
# Distinct SA3 names found in the New South Wales subset.
sa3_names = sa2_nsw['SA3_NAME21'].unique()
sa3_names.tolist()



In [None]:
# sa2.explore(
#     column="SA2_NAME21",  # Color points by name
#     marker_kwds={"radius": 6},  # Marker size
#     style_kwds={"fillOpacity": 0.7},
#     height=500,
#     width=800
# )

In [None]:
# Read the 2021 Census G01 table for NSW SA2 areas and preview it.
census_csv = path+'/2021Census_G01_NSW_SA2.csv'
census_data = pd.read_csv(census_csv)
census_data.head()

In [None]:
# Keep only the SA2 identifier, name and geometry from the boundary layer.
sa2 = sa2[['SA2_CODE21', 'SA2_NAME21', 'geometry']]

# Take an explicit copy of the two census columns: adding a column to a bare
# slice of census_data triggers pandas' SettingWithCopyWarning.
census = census_data[['SA2_CODE_2021', 'Tot_P_P']].copy()

# Cast the census code to str under the boundary layer's column name so the
# join key has a matching name and a string dtype.
# NOTE(review): assumes sa2['SA2_CODE21'] is read as strings - confirm.
census['SA2_CODE21'] = census['SA2_CODE_2021'].astype(str)

# Left join keeps every SA2 polygon even when no census row matches.
sa2 = sa2.merge(census, on='SA2_CODE21', how='left')
sa2.head()


In [None]:
# Move to a projected CRS (Australian Albers, EPSG:3577) before measuring area.
sa2 = sa2.to_crs(epsg=3577)

# Polygon area in CRS units, scaled by 1e6 into square kilometres.
area_in_crs_units = sa2['geometry'].area
sa2['area_km2'] = area_in_crs_units / 1e6

# Density = Tot_P_P per square kilometre.
# NOTE(review): Tot_P_P is presumably the census total-persons count - confirm.
sa2['pop_density'] = sa2['Tot_P_P'] / sa2['area_km2']



In [None]:
# Step 0: Distances are only comparable within one CRS, so bring the suburb
# polygons into the venues' CRS whenever the two differ.
venue_crs = test_filtered.crs
if venue_crs != sa2.crs:
    sa2 = sa2.to_crs(venue_crs)

# Step 1: Helper that picks the suburb row closest to a point
def get_nearest_suburb(point, suburbs_gdf):
    """Return the row of ``suburbs_gdf`` whose geometry is closest to ``point``.

    The distance from every geometry in ``suburbs_gdf`` to ``point`` is
    computed, and the row stored under the index label of the smallest
    distance is returned.
    """
    gaps = suburbs_gdf.geometry.distance(point)
    closest_label = gaps.idxmin()
    return suburbs_gdf.loc[closest_label]

# Step 2: Look up the closest SA2 polygon for every venue centroid and keep
# its name. Another attribute of the matched row could be extracted instead.
centroid_points = test_filtered['geometry_centroid']
test_filtered['nearest_suburb'] = centroid_points.apply(
    lambda pt: get_nearest_suburb(pt, sa2)['SA2_NAME21']
)


In [None]:
# Working name for the suburb layer used by the following cells.
suburb_list = sa2

# Venue layer with only the columns needed downstream. The explicit copy
# detaches it from test_filtered, so adding columns to fast_food later does
# not raise pandas' SettingWithCopyWarning.
fast_food = test_filtered[
    ['name', 'geometry', 'nearest_suburb', 'geometry_centroid']
].copy()


In [None]:
# Count the venues per nearest suburb and return a two-column frame:
# 'suburb' (the suburb name) and 'fast_food_count' (number of venues).
suburb_counts = (
    fast_food["nearest_suburb"]
    .value_counts()
    .rename_axis("suburb")
    .reset_index(name="fast_food_count")
)

# View result


In [None]:
# Attach the per-suburb venue counts; the left join keeps every suburb.
suburb_list = suburb_list.merge(
    suburb_counts,
    how="left",
    left_on='SA2_NAME21',
    right_on='suburb',
)

# Step 3: Suburbs without any matched venue come back as NaN - store 0 instead
# and keep the column as integers.
filled_counts = suburb_list["fast_food_count"].fillna(0)
suburb_list["fast_food_count"] = filled_counts.astype(int)

In [None]:
import matplotlib.pyplot as plt

# Buffer around existing fast food locations (5 km).
# buffer() works in CRS units. The venue layer is left in EPSG:3857 by the
# earlier reprojection cell when the OSM data arrives in a geographic CRS, and
# Web Mercator units are not ground metres at Sydney's latitude. The buffer is
# therefore built in Australian Albers (EPSG:3577, metres) and converted back
# to the layer's own CRS so it still lines up with the other layers.
venue_crs = fast_food.crs
fast_food["buffer"] = (
    fast_food.geometry.to_crs(epsg=3577).buffer(5000).to_crs(venue_crs)
)

# Plot the suburbs as a backdrop, then the buffers, then the venues on top.
ax = suburb_list.plot(color='lightgrey', edgecolor='white')
fast_food.set_geometry("buffer").plot(ax=ax, color='red', alpha=0.3)
fast_food.set_geometry("geometry").plot(ax=ax, color='black', markersize=5)

plt.title("Existing Coverage ")
plt.show()

In [None]:
import geopandas as gpd
from sklearn.preprocessing import MinMaxScaler

# Step 1: Put both layers in the same CRS. EPSG:3857 stays the CRS of the
# frames themselves because the final map cell draws them on a Web Mercator
# basemap.
suburb_list_proj = suburb_list.to_crs(epsg=3857).copy()
fast_food = fast_food.to_crs(suburb_list_proj.crs)

# Web Mercator stretches lengths and areas away from the equator, so distances
# and areas measured in EPSG:3857 are overstated at Sydney's latitude. All
# measurements below are taken on Australian Albers (EPSG:3577) copies of the
# geometries - the same CRS the earlier area/density cell uses.
suburbs_albers = suburb_list_proj.geometry.to_crs(epsg=3577)
venues_albers = fast_food.geometry.to_crs(epsg=3577)

# Step 2: Centroid of each SA2 area (kept in the frame's CRS for plotting).
suburb_list_proj['centroid'] = suburb_list_proj.geometry.centroid

# Step 3: Minimum distance from each SA2 centroid to any existing venue,
# measured in metres on the Albers copies and converted to kilometres.
suburb_list_proj['min_dist_to_fastfood_km'] = suburbs_albers.centroid.apply(
    lambda pt: venues_albers.distance(pt).min() / 1000
)

# Step 4: Area in square kilometres, taken from the equal-area geometries.
suburb_list_proj.loc[:, 'area_km2'] = suburbs_albers.area / 1_000_000

# Step 5: Population density = Tot_P_P per square kilometre.
suburb_list_proj.loc[:, 'pop_density'] = (
    suburb_list_proj['Tot_P_P'] / suburb_list_proj['area_km2']
)


# Step 6: Get top 10 suggested SA2 areas


# Show results


In [None]:
# Suburbs that currently have exactly one matched venue.
# NOTE(review): `subset` is not used by the statistics below - confirm whether
# the summary was meant to describe this subset instead of every suburb.
has_single_venue = suburb_list['fast_food_count'] == 1
subset = suburb_list.loc[has_single_venue]

# describe() reports count, mean, std, min, 25%, 50%, 75% and max of the
# population density across all suburbs.
density_column = suburb_list['pop_density']
stats = density_column.describe()

print(stats)

In [None]:
# Step 4: Rescale density and distance so they are comparable.
# new_list is another name for suburb_list_proj, not a copy: the columns added
# here appear on both.
new_list = suburb_list_proj

scaler = MinMaxScaler()
raw_metrics = new_list[['pop_density', 'min_dist_to_fastfood_km']]
scaled_metrics = scaler.fit_transform(raw_metrics)
new_list[['norm_density', 'norm_dist']] = scaled_metrics

# Step 5: Composite score - 70% normalised density plus 30% normalised
# distance to the nearest existing venue.
density_part = new_list['norm_density'] * 0.7
distance_part = new_list['norm_dist'] * 0.3
new_list['location_score'] = density_part + distance_part


In [None]:
# Summary of the composite score: count, mean, std, min, quartiles and max.
score_column = new_list['location_score']
stats = score_column.describe()
print(stats)

In [None]:
# Step 1: Screening thresholds.
# Minimum population density, in the units of 'pop_density' (Tot_P_P per km2).
POP_DENSITY_THRESHOLD = 1000
# Minimum distance (km) from the suburb centroid to the nearest existing venue.
DISTANCE_TO_MCD_THRESHOLD_KM = 2.0

# Step 2: Filter the suburbs that meet both conditions
candidate_suburbs = new_list[
    (new_list['pop_density'] > POP_DENSITY_THRESHOLD) &
    (new_list['min_dist_to_fastfood_km'] > DISTANCE_TO_MCD_THRESHOLD_KM)
]

# Step 3: Keep the three densest candidates. The explicit copy detaches the
# result from candidate_suburbs, so adding the display columns below does not
# raise pandas' SettingWithCopyWarning.
# NOTE(review): the ranking uses pop_density, not location_score - confirm
# that is intended.
top_areas = (
    candidate_suburbs
    .sort_values(by='pop_density', ascending=False)
    .head(3)
    .copy()
)

# Reader-friendly duplicates of the key columns for the summary table.
top_areas['Nearest_franchise_location(in km)'] = top_areas['min_dist_to_fastfood_km']
top_areas['Suburb Name'] = top_areas['SA2_NAME21']
top_areas['Population density'] = top_areas['pop_density']
top_areas[['Suburb Name', 'Population density', 'Nearest_franchise_location(in km)', 'location_score']]


In [None]:
# Show the first five rows of the scored suburb table.
new_list.head(n=5)

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as cx
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
# adjustText is a separate third-party package; install it first
# (e.g. `pip install adjustText`) if this import fails.
from adjustText import adjust_text


# Step 1: The top 3 recommended SA2 areas come from `top_areas` (earlier cell).

# Step 2: Reproject all data to EPSG:3857, the CRS of the basemap tiles.
fast_food_wm = fast_food.to_crs(epsg=3857)
suburb_list_wm = suburb_list.to_crs(epsg=3857)
top_areas_wm = top_areas.to_crs(epsg=3857)

# Step 3: Combine venue and top-area geometries to compute the zoom bounds.
combined = pd.concat([fast_food_wm[['geometry']], top_areas_wm[['geometry']]], ignore_index=True)


minx, miny, maxx, maxy = combined.total_bounds
zoom_margin = 5000  # margin added on every side, in EPSG:3857 units
xlim = (minx - zoom_margin, maxx + zoom_margin)
ylim = (miny - zoom_margin, maxy + zoom_margin)

# Step 4: Plot
fig, ax = plt.subplots(figsize=(12, 10))

# Plot all suburbs
# suburb_list_wm.plot(ax=ax, facecolor='white', edgecolor='grey', linewidth=0.5)

# Plot fast food venues
fast_food_wm.plot(ax=ax, color='blue', markersize=5)

# Plot top 3 suggested SA2 locations. The reprojected layer is used so the
# outlines always share the CRS of the venues and the basemap, whatever CRS
# `top_areas` itself is in.
top_areas_wm.plot(ax=ax, edgecolor='red', facecolor='none', linewidth=2)

# Annotate each top area with its name, offset from the polygon centroid.
# Labels are drawn on `ax` explicitly rather than on pyplot's current axes.
texts = []
for _, area in top_areas_wm.iterrows():
    anchor = area.geometry.centroid
    texts.append(
        ax.text(
            anchor.x + 5000,
            anchor.y + 5000,
            area['SA2_NAME21'],
            fontsize=12, color='purple', ha='left', va='center',
            bbox=dict(facecolor='white', edgecolor='gray', boxstyle='round,pad=0.3')
        )
    )

# Nudge overlapping labels apart and draw arrows from the labels.
adjust_text(texts, arrowprops=dict(arrowstyle='->', color='gray'))

# Set axis limits
ax.set_xlim(xlim)
ax.set_ylim(ylim)

# Add basemap
cx.add_basemap(ax, source=cx.providers.OpenStreetMap.Mapnik)

# Legend. The 'Suburbs' entry only corresponds to a drawn layer when the
# commented-out suburb plot above is re-enabled. The suggested areas are drawn
# as red outlines, so their legend handle is an outlined patch, not a marker.
legend_elements = [
    Patch(facecolor='white', edgecolor='grey', label='Suburbs'),
    Line2D([0], [0], marker='o', color='w', label='Fast Food Venue', markerfacecolor='blue', markersize=6),
    Patch(facecolor='none', edgecolor='red', linewidth=2, label='Top 3 Suggested Areas')
]
ax.legend(handles=legend_elements)

# Title
ax.set_title("Top 3 Suggested  Locations for New Fast Food Venue,", fontsize=14)
plt.tight_layout()
plt.show()
