# Total population (actual and forecast) per district, and the growth between them.

In [None]:
import pandas as pd

# Load the population dataset (actual and forecast figures per district)
csv_path = r"KTZH_area_forecast.csv"
forecast_df = pd.read_csv(csv_path)

print(f"✅ Dataset loaded: {len(forecast_df)} rows")
print("Available years:", sorted(forecast_df['year'].unique()))
print("Data types:", forecast_df['data type'].unique())

# Years compared in this analysis (defined once so filter, message and
# column names cannot drift apart):
# 2023: latest actual data (historical), used as the baseline
# 2050: forecast horizon
analysis_years = [2023, 2050]
filtered_df = forecast_df[forecast_df['year'].isin(analysis_years)]

years_label = ', '.join(str(year) for year in analysis_years)
print(f"📊 Filtered dataset: {len(filtered_df)} rows for years {years_label}")

# Group by district, data type, and year, then sum the population
population_summary = filtered_df.groupby(['district', 'data type', 'year'])['number'].sum().reset_index()

# Pivot to have years as columns
population_pivot = population_summary.pivot_table(
    index=['district', 'data type'],
    columns='year',
    values='number',
    fill_value=0
)

# Force exactly one column per analysis year, in a fixed order. Without this,
# a year that has no rows in the data would be missing from the pivot and the
# positional rename below would fail with a length mismatch.
population_pivot = population_pivot.reindex(columns=analysis_years, fill_value=0).reset_index()

# Flatten column names
population_pivot.columns.name = None
population_pivot.columns = ['district', 'data_type'] + [f'pop_{year}' for year in analysis_years]

print("📈 Population data structure:")
print(population_pivot.head())

# Split the pivot into rows holding actual figures and rows holding forecasts
actual_data = population_pivot[population_pivot['data_type'] == 'Pop_Actual'].copy()
forecast_data = population_pivot[population_pivot['data_type'] == 'Pop_Forecast'].copy()

if not actual_data.empty and not forecast_data.empty:
    # Baseline (2023) comes from the actual rows, 2050 from the forecast rows.
    # The outer merge keeps districts present on only one side; their missing
    # value is filled with 0.
    population_analysis = actual_data[['district', 'pop_2023']].merge(
        forecast_data[['district', 'pop_2050']],
        on='district',
        how='outer'
    ).fillna(0)
else:
    # Fallback when one of the expected data-type labels is absent:
    # sum the filtered rows per district for each year directly.
    population_2023 = filtered_df[filtered_df['year'] == 2023].groupby('district')['number'].sum()
    population_2050 = filtered_df[filtered_df['year'] == 2050].groupby('district')['number'].sum()

    # Create analysis DataFrame (districts are taken from the 2023 data)
    population_analysis = pd.DataFrame({
        'district': population_2023.index,
        'pop_2023': population_2023.values,
        'pop_2050': population_2050.reindex(population_2023.index, fill_value=0).values
    })

# Relative growth 2023 -> 2050 in percent, computed once for both branches.
# A zero baseline is masked to NaN: dividing by 0 would yield +/-inf, which
# would in turn make the average growth reported later infinite.
baseline_2023 = population_analysis['pop_2023'].replace(0, float('nan'))
population_analysis['growth_rate_2050'] = (
    (population_analysis['pop_2050'] - population_analysis['pop_2023']) /
    baseline_2023
) * 100

# Text report: one entry per district with baseline, forecast and relative change
print("\n📊 Population Analysis by District:")
print("=" * 80)

entry_separator = "-" * 50
report_rows = zip(
    population_analysis['district'],
    population_analysis['pop_2023'],
    population_analysis['pop_2050'],
    population_analysis['growth_rate_2050'],
)
for district_name, baseline, forecast, growth_pct in report_rows:
    print(f"🏘️  {district_name}")
    print(f"   2023 (Actual): {baseline:,.0f}")
    print(f"   2050 (Forecast): {forecast:,.0f} ({growth_pct:+.1f}%)")
    print(entry_separator)

# Overall figure: mean of the per-district growth rates
average_growth = population_analysis['growth_rate_2050'].mean()
print("\n📈 Summary:")
print(f"Average growth 2023-2050: {average_growth:.1f}%")

# Attach each district's area (km²) from the lookup file; districts without a
# match keep their row and get NaN for the area
area = pd.read_csv('Area_size.csv')
population_analysis = population_analysis.merge(area, how='left', on='district')

# Absolute population change 2023 -> 2050, and that change per km² of area
absolute_change = population_analysis['pop_2050'] - population_analysis['pop_2023']
population_analysis['growth_abs'] = absolute_change
population_analysis['dencity_growth'] = absolute_change / population_analysis['Area (Area km²)']

# Districts with the largest growth per km² first
population_analysis = population_analysis.sort_values('dencity_growth', ascending=False)

# Number formats for the rendered table, keyed by column name
column_formats = {
    'pop_2023': '{:,.0f}',
    'pop_2050': '{:,.0f}',
    'growth_rate_2050': '{:+.2f}%',
    'Area (Area km²)': '{:,.1f}',
    'growth_abs': '{:,.1f}',
    'dencity_growth': '{:,.2f}',
}
styled_df = population_analysis.style.format(column_formats)

display(styled_df)

# SPAR Presence vs. Total Supermarkets

In [None]:
import pandas as pd

# Load the supermarket dataset (one row per store)
supermarkets_df = pd.read_csv('all_supermarkets_with_size_and_rating.csv')

# Count total supermarkets per district
total_supermarkets = supermarkets_df.groupby('district').size().reset_index(name='total_supermarkets')

# Count SPAR supermarkets per district (case-insensitive substring match on the name).
# na=False treats stores with a missing name as non-SPAR; without it the mask
# contains NA and the boolean indexing below raises.
is_spar = supermarkets_df['name'].str.contains('spar', case=False, na=False)
spar_supermarkets = supermarkets_df[is_spar]
spar_counts = spar_supermarkets.groupby('district').size().reset_index(name='spar_supermarkets')

# Merge both counts together; the left join keeps districts that have no SPAR at all
supermarket_coverage = pd.merge(total_supermarkets, spar_counts, on='district', how='left')

# Districts without any SPAR come out of the join as NaN -> count them as 0
supermarket_coverage['spar_supermarkets'] = supermarket_coverage['spar_supermarkets'].fillna(0).astype(int)

# Preview: the 12 districts with the fewest SPAR stores
supermarket_coverage.sort_values(by='spar_supermarkets', ascending=True).head(12)


# Identify High-Opportunity Districts for SPAR Expansion

In [None]:
# Merge population growth data with supermarket counts
# (left join: every district of the population table is kept)
opportunity_df = pd.merge(population_analysis, supermarket_coverage, on='district', how='left')

# Share of a district's supermarkets that are SPAR
opportunity_df['spar_coverage_rate'] = opportunity_df['spar_supermarkets'] / opportunity_df['total_supermarkets']

# Forecast residents per SPAR store; undefined (NaN) where a district has no SPAR.
# A float NaN is used instead of pd.NA so the column stays float64: pd.NA would
# turn it into an object column, which the '{:,.0f}' format below and the
# later bar charts cannot handle.
spar_count_or_nan = opportunity_df['spar_supermarkets'].replace(0, float('nan'))
opportunity_df['people_per_spar'] = opportunity_df['pop_2050'] / spar_count_or_nan

# Forecast residents per supermarket of any brand
opportunity_df['people_per_supermarket'] = opportunity_df['pop_2050'] / opportunity_df['total_supermarkets']

# Alias of the 2023-2050 growth rate (kept for convenience; not used in the ranking)
opportunity_df['growth_rate'] = opportunity_df['growth_rate_2050']

# Rank districts by highest growth per km², then by fewest people per supermarket
best_opportunities = opportunity_df.sort_values(by=['dencity_growth', 'people_per_supermarket'], ascending=[False, True])

# Render the ranked table: large numbers with thousands separators, rounded decimals
best_opportunities[['district', 'pop_2050', 'dencity_growth', 'total_supermarkets',
                    'spar_supermarkets', 'spar_coverage_rate', 'people_per_spar', 'people_per_supermarket']
].style.format({
    'pop_2050': '{:,.0f}',
    'dencity_growth': '{:.2f}',
    'spar_coverage_rate': '{:.2%}',
    'people_per_spar': '{:,.0f}',
    'people_per_supermarket': '{:,.0f}'
})


In [None]:
import matplotlib.pyplot as plt

# Order districts by residents per SPAR branch (fewest first) and keep the first 12
plot_df = best_opportunities.sort_values('people_per_spar', ascending=True).head(12)

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(plot_df['district'], plot_df['people_per_spar'], color='skyblue')

# Write each bar's value just above its top edge, centred horizontally
for bar in bars:
    bar_value = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.text(bar_center, bar_value, f'{bar_value:,.0f}', ha='center', va='bottom')

ax.set_title('Districts by People per SPAR Branch')
ax.set_ylabel('People per SPAR')
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
# Order districts by residents per supermarket (fewest first) and keep the first 12
plot_df = best_opportunities.sort_values('people_per_supermarket', ascending=True).head(12)


x = np.arange(len(plot_df['district']))  # the label locations
width = 0.35  # the width of the bars

plt.figure(figsize=(12, 6))
# Two bars per district, side by side around the tick position. `width` must be
# passed explicitly: with matplotlib's default bar width (0.8) the two series
# would overlap each other and the neighbouring districts.
plt.bar(x - width / 2, plot_df['people_per_spar'], width, color='skyblue', label='density per spar')
plt.bar(x + width / 2, plot_df['people_per_supermarket'], width, color='blue', label='density per super market')

plt.title('Density per supermarket by district')
plt.ylabel('Density per supermarket')
plt.xticks(x, plot_df['district'], rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Inspect the full merged table (population figures joined with supermarket counts per district)
opportunity_df

# Score and Rank Districts
We combine multiple key factors into a single score that helps SPAR identify the most strategic districts for expansion.

| Factor                   | Why it Matters                                                      | Scoring Direction |
| ------------------------ | ------------------------------------------------------------------- | ----------------- |
| `pop_2050`               | Forecast population; indicates future demand                        | Higher is better  |
| `dencity_growth`         | Measures how fast demand is growing (population growth per km²)     | Higher is better  |
| `spar_coverage_rate`     | Shows how much SPAR is already present (share of all supermarkets)  | Lower is better   |
| `people_per_supermarket` | Indicates how competitive the market is                             | Lower is better   |


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Work on a copy holding only the district name and the four scoring factors
scoring_df = opportunity_df[['district', 'pop_2050', 'dencity_growth', 'spar_coverage_rate', 'people_per_supermarket']].copy()

# Normalize every factor to the range 0..1 so the weights below are comparable.
# From here on these columns hold normalized scores, not raw values.
scaler = MinMaxScaler()
higher_is_better = ['pop_2050', 'dencity_growth']
lower_is_better = ['spar_coverage_rate', 'people_per_supermarket']
scoring_df[higher_is_better] = scaler.fit_transform(scoring_df[higher_is_better])
# Inverted (1 - x) so that a lower raw value produces a higher score.
# NOTE(review): for people_per_supermarket, few residents per store usually means
# a more saturated market — confirm that "lower is better" is really intended.
scoring_df[lower_is_better] = 1 - scaler.fit_transform(scoring_df[lower_is_better])

# Composite score (weights can be adjusted)
scoring_df['score'] = (
    scoring_df['pop_2050'] * 0.3 +
    scoring_df['dencity_growth'] * 0.3 +
    scoring_df['spar_coverage_rate'] * 0.2 +
    scoring_df['people_per_supermarket'] * 0.2
)

# Rank all districts, best score first
top_ranked = scoring_df.sort_values(by='score', ascending=False)

# Show the top 12. All factor columns are normalized 0..1 scores at this point,
# so they are formatted as plain decimals (count/percent formats would render
# them as misleading values such as "0" or "1").
top_ranked.head(12).style.format({
    'pop_2050': '{:.3f}',
    'dencity_growth': '{:.3f}',
    'spar_coverage_rate': '{:.3f}',
    'people_per_supermarket': '{:.3f}',
    'score': '{:.3f}'
})


📊 ***This table shows:*** 

Top-ranked districts for SPAR expansion based on a combined score calculated from: 

1- Forecasted population (`pop_2050`) — how big the future market is 

2- Density growth (`dencity_growth`) — how fast that market is growing, measured as population growth per km² 

3- SPAR coverage rate (`spar_coverage_rate`) — how saturated the district is with SPAR branches (lower is better) 

4- People per supermarket (`people_per_supermarket`) — how competitive the area is (lower is better)

In [None]:
import folium

# Mock lat/lon for demonstration — replace with real ones!
coordinates = {
    'Dietikon': [47.4, 8.4],
    'Bülach': [47.5, 8.5],
    'Uster': [47.3, 8.7],
    'Zürich': [47.37, 8.55],
    'Winterthur': [47.5, 8.7]
}

# Create map centered on Zurich
m = folium.Map(location=[47.4, 8.5], zoom_start=10)

# Get top 5 districts
top_5_districts = top_ranked.head(5)

# Add one marker per top district; remember districts that have no coordinates
# so they are reported instead of silently missing from the map
districts_without_coordinates = []
for _, row in top_5_districts.iterrows():
    district = row['district']
    location = coordinates.get(district)
    if location is None:
        districts_without_coordinates.append(str(district))
        continue
    folium.Marker(
        location=location,
        # Popups are rendered as HTML, so the line break must be <br>, not "\n"
        popup=f"{district}<br>Score: {row['score']:.3f}",
        tooltip=district,
        icon=folium.Icon(color='green')
    ).add_to(m)

if districts_without_coordinates:
    print(f"⚠️ No coordinates for: {', '.join(districts_without_coordinates)} (no marker drawn)")

# Show the map
m
