In [2]:
"""
Social Isolation Analysis - Living Alone Data
Cell 1: Setup and Download Data
"""

import pandas as pd
import numpy as np
import requests
import zipfile
from pathlib import Path
import sys

# Define paths
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR / "data"
POPULATION_DIR = DATA_DIR / "raw" / "population"
PROCESSED_DATA_DIR = DATA_DIR / "processed"

print("=" * 60)
print("SOCIAL ISOLATION ANALYSIS - LIVING ALONE DATA")
print("=" * 60)

print(f"\nüìÅ Data directory: {POPULATION_DIR}")
print(f"üìÅ Processed directory: {PROCESSED_DATA_DIR}")

# Download the Couples-Familles-M√©nages data
print("\nüì• Downloading household composition data (M√©nages 2020)...")

url = "https://www.insee.fr/fr/statistiques/fichier/7633206/base-cc-coupl-fam-men-2020_csv.zip"
destination = POPULATION_DIR / "menages_2020.zip"

try:
    response = requests.get(url, timeout=90)
    response.raise_for_status()
    
    with open(destination, 'wb') as f:
        f.write(response.content)
    
    print(f"‚úÖ Downloaded: {destination.name}")
    print(f"   Size: {destination.stat().st_size / 1_000_000:.2f} MB")
    
    # Extract
    print("\nüì¶ Extracting...")
    with zipfile.ZipFile(destination, 'r') as zip_ref:
        zip_ref.extractall(POPULATION_DIR / "menages_2020")
    
    print("‚úÖ Extracted!")
    
    # List files
    extracted_dir = POPULATION_DIR / "menages_2020"
    print(f"\nüìÅ Extracted files:")
    for file in sorted(extracted_dir.iterdir()):
        if file.is_file():
            size_mb = file.stat().st_size / 1_000_000
            print(f"  - {file.name} ({size_mb:.2f} MB)")
    
except Exception as e:
    print(f"‚ùå Error: {e}")
    print("\nüí° Manual download:")
    print("Go to: https://www.insee.fr/fr/statistiques/7633206")
    print("Download 'base-cc-couples-familles-menages-2020_csv.zip'")
    print(f"Extract to: {POPULATION_DIR / 'menages_2020'}")

SOCIAL ISOLATION ANALYSIS - LIVING ALONE DATA

üìÅ Data directory: /Users/antoineverhulst/Documents/Project/claude/heat_risk_france/data/raw/population
üìÅ Processed directory: /Users/antoineverhulst/Documents/Project/claude/heat_risk_france/data/processed

üì• Downloading household composition data (M√©nages 2020)...
‚úÖ Downloaded: menages_2020.zip
   Size: 31.58 MB

üì¶ Extracting...
‚úÖ Extracted!

üìÅ Extracted files:
  - FILO2020_DEC_COM.csv (48.75 MB)
  - FILO2020_DEC_PAUVRES_COM.csv (10.44 MB)
  - FILO2020_DISP_COM.csv (61.37 MB)
  - FILO2020_DISP_PAUVRES_COM.csv (10.54 MB)
  - FILO2020_TRDECILES_DEC_COM.csv (5.01 MB)
  - FILO2020_TRDECILES_DISP_COM.csv (9.00 MB)
  - base-cc-coupl-fam-men-2020.CSV (101.86 MB)
  - meta_FILO2020_DEC_COM.csv (2.91 MB)
  - meta_FILO2020_DEC_PAUVRES_COM.csv (2.86 MB)
  - meta_FILO2020_DISP_COM.csv (2.94 MB)
  - meta_FILO2020_DISP_PAUVRES_COM.csv (2.86 MB)
  - meta_FILO2020_TRDECILES_DEC_COM.csv (2.85 MB)
  - meta_FILO2020_TRDECILES_DISP_COM.csv

In [5]:
"""
Cell 4: Download POP4 - Population by age and couple status
"""

print("=" * 60)
print("DOWNLOADING POP4 - LIVING ARRANGEMENTS DATA")
print("=" * 60)

url = "https://www.insee.fr/fr/statistiques/fichier/7631680/TD_POP4_2020_csv.zip"
destination = POPULATION_DIR / "TD_POP4_2020.zip"

print(f"\nüì• Downloading POP4 (Population 15+ by couple status)...")
print("This has data on people living alone vs in couples")

try:
    response = requests.get(url, timeout=90)
    response.raise_for_status()
    
    with open(destination, 'wb') as f:
        f.write(response.content)
    
    print(f"‚úÖ Downloaded: {destination.name}")
    print(f"   Size: {destination.stat().st_size / 1_000_000:.2f} MB")
    
    # Extract
    print("\nüì¶ Extracting...")
    with zipfile.ZipFile(destination, 'r') as zip_ref:
        zip_ref.extractall(POPULATION_DIR / "POP4")
    
    print("‚úÖ Extracted!")
    
    # List files
    pop4_dir = POPULATION_DIR / "POP4"
    print(f"\nüìÅ Files in POP4:")
    for file in sorted(pop4_dir.iterdir()):
        if file.is_file():
            size_mb = file.stat().st_size / 1_000_000
            print(f"  - {file.name} ({size_mb:.2f} MB)")
    
    # Load and preview
    csv_file = list(pop4_dir.glob("*.csv"))[0]
    print(f"\nüëÄ Loading and previewing {csv_file.name}...")
    
    df = pd.read_csv(csv_file, sep=';', nrows=10, low_memory=False)
    
    print(f"\nüìã Dataset info:")
    print(f"   - Columns: {len(df.columns)}")
    print(f"\nüìã First 20 column names:")
    for i, col in enumerate(df.columns[:20]):
        print(f"   {i+1}. {col}")
    
    print(f"\nüëÄ First 3 rows:")
    print(df.head(3))
    
except Exception as e:
    print(f"‚ùå Error: {e}")
    print("\nPlease download manually from:")
    print("https://www.insee.fr/fr/statistiques/fichier/7631680/TD_POP4_2020_csv.zip")

DOWNLOADING POP4 - LIVING ARRANGEMENTS DATA

üì• Downloading POP4 (Population 15+ by couple status)...
This has data on people living alone vs in couples
‚úÖ Downloaded: TD_POP4_2020.zip
   Size: 13.87 MB

üì¶ Extracting...
‚úÖ Extracted!

üìÅ Files in POP4:
  - TD_POP4_2020.csv (78.24 MB)

üëÄ Loading and previewing TD_POP4_2020.csv...

üìã Dataset info:
   - Columns: 7

üìã First 20 column names:
   1. NIVGEO
   2. CODGEO
   3. LIBGEO
   4. SEXE
   5. AGEQ80_14
   6. COUPLE
   7. NB

üëÄ First 3 rows:
  NIVGEO  CODGEO                   LIBGEO  SEXE  AGEQ80_14  COUPLE         NB
0    COM    1001  L'Abergement-Cl√©menciat     1         15       1   0.955466
1    COM    1001  L'Abergement-Cl√©menciat     1         15       2  22.936741
2    COM    1001  L'Abergement-Cl√©menciat     1         20       1   0.955466


In [5]:
"""
Cell 5: Calculate percentage living alone by commune
Focus on elderly (65+) population
"""

print("=" * 60)
print("CALCULATING % ELDERLY LIVING ALONE BY COMMUNE")
print("=" * 60)

# Load full POP4 data
pop4_file = POPULATION_DIR / "POP4" / "TD_POP4_2020.csv"
print(f"\nüìÇ Loading: {pop4_file.name}")

pop4_data = pd.read_csv(pop4_file, sep=';', low_memory=False)

print(f"‚úÖ Loaded {len(pop4_data):,} rows")

# Check unique values
print(f"\nüîç Data structure:")
print(f"   - COUPLE values: {sorted(pop4_data['COUPLE'].unique())}")
print(f"   - AGEQ80_14 values: {sorted(pop4_data['AGEQ80_14'].unique())}")

# COUPLE: 1 = living in couple, 2 = not living in couple (alone)
# AGEQ80_14 age groups (quinquennal):
# We want 65+ which is: 065, 070, 075, 080

# Filter for elderly (65+) only
elderly_ages = [65, 70, 75, 80]
elderly_data = pop4_data[pop4_data['AGEQ80_14'].isin(elderly_ages)].copy()

print(f"\nüìä Filtered for elderly (65+): {len(elderly_data):,} rows")

# Calculate totals by commune and couple status
elderly_by_commune = elderly_data.groupby(['CODGEO', 'COUPLE'])['NB'].sum().reset_index()

# Pivot to get living alone vs in couple
elderly_pivot = elderly_by_commune.pivot(index='CODGEO', columns='COUPLE', values='NB')
elderly_pivot.columns = ['in_couple', 'living_alone']
elderly_pivot = elderly_pivot.fillna(0)

# Calculate total elderly and percentage
elderly_pivot['total_elderly'] = elderly_pivot['in_couple'] + elderly_pivot['living_alone']
elderly_pivot['pct_elderly_alone'] = (elderly_pivot['living_alone'] / elderly_pivot['total_elderly'] * 100).round(2)

# Reset index
elderly_alone = elderly_pivot.reset_index()

print(f"\n‚úÖ Calculated isolation for {len(elderly_alone):,} communes")

print(f"\nüìà Summary statistics:")
print(elderly_alone[['total_elderly', 'living_alone', 'pct_elderly_alone']].describe())

# Filter for Paris arrondissements
paris_alone = elderly_alone[
    (elderly_alone['CODGEO'].str.startswith('75')) & 
    (elderly_alone['CODGEO'] != '75056')  # Exclude overall Paris
].copy()

print(f"\nüóº Paris arrondissements: {len(paris_alone)}")
if len(paris_alone) > 0:
    print("\nParis elderly living alone:")
    paris_alone_sorted = paris_alone.sort_values('pct_elderly_alone', ascending=False)
    print(paris_alone_sorted[['CODGEO', 'total_elderly', 'living_alone', 'pct_elderly_alone']])
    
    print(f"\nüìä Paris statistics:")
    print(f"   - Average % elderly living alone: {paris_alone['pct_elderly_alone'].mean():.2f}%")
    print(f"   - Range: {paris_alone['pct_elderly_alone'].min():.1f}% - {paris_alone['pct_elderly_alone'].max():.1f}%")
    print(f"   - Highest: {paris_alone_sorted.iloc[0]['CODGEO']} ({paris_alone_sorted.iloc[0]['pct_elderly_alone']:.1f}%)")
    print(f"   - Lowest: {paris_alone_sorted.iloc[-1]['CODGEO']} ({paris_alone_sorted.iloc[-1]['pct_elderly_alone']:.1f}%)")

print("\n‚úÖ Social isolation data calculated!")

# Add % elderly of total population by loading our previous data
print("\n" + "=" * 60)
print("ADDING % ELDERLY OF TOTAL POPULATION")
print("=" * 60)

# Load the vulnerability data we created earlier
vuln_file = PROCESSED_DATA_DIR / "paris_vulnerability.csv"

if vuln_file.exists():
    print(f"\nüìÇ Loading existing vulnerability data...")
    vuln_data = pd.read_csv(vuln_file)
    
    # Convert CODGEO to string in both datasets for consistent merge
    elderly_alone['CODGEO'] = elderly_alone['CODGEO'].astype(str)
    vuln_data['CODGEO'] = vuln_data['CODGEO'].astype(str)
    
    # Merge with isolation data
    paris_combined = paris_alone.merge(
        vuln_data[['CODGEO', 'pct_65plus', 'pop_total', 'pop_65plus']],
        on='CODGEO',
        how='left'
    )
    
    print(f"‚úÖ Merged data for {len(paris_combined)} arrondissements")
    
    print(f"\nüìä Combined metrics:")
    print(paris_combined[['CODGEO', 'pct_65plus', 'pct_elderly_alone', 'total_elderly', 'living_alone']])
    
    print(f"\nüéØ Key insights:")
    print(f"   - Average % elderly (65+): {paris_combined['pct_65plus'].mean():.2f}%")
    print(f"   - Average % elderly living alone: {paris_combined['pct_elderly_alone'].mean():.2f}%")
    
    # Most isolated arrondissement
    most_isolated = paris_combined.loc[paris_combined['pct_elderly_alone'].idxmax()]
    print(f"\nüî¥ Most socially isolated: {most_isolated['CODGEO']}")
    print(f"   - {most_isolated['pct_65plus']:.1f}% of population is elderly")
    print(f"   - {most_isolated['pct_elderly_alone']:.1f}% of elderly live alone")
    print(f"   - {most_isolated['living_alone']:.0f} elderly people living alone")
    
    # Store combined data
    paris_combined_final = paris_combined
    
else:
    print(f"‚ö†Ô∏è  Vulnerability file not found: {vuln_file}")
    print("We have isolation data but not % elderly of total population")
    paris_combined_final = paris_alone

CALCULATING % ELDERLY LIVING ALONE BY COMMUNE

üìÇ Loading: TD_POP4_2020.csv
‚úÖ Loaded 1,706,581 rows

üîç Data structure:
   - COUPLE values: [1, 2]
   - AGEQ80_14 values: [15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]

üìä Filtered for elderly (65+): 504,244 rows

‚úÖ Calculated isolation for 34,966 communes

üìà Summary statistics:
       total_elderly   living_alone  pct_elderly_alone
count   34966.000000   34966.000000       34960.000000
mean      405.402417     168.353132          35.586217
std      2681.728750    1329.742242          10.351312
min         0.000000       0.000000           0.000000
25%        43.750724      14.796242          28.757500
50%        95.809148      31.389002          34.560000
75%       248.326857      85.276640          41.380000
max    367946.685877  185258.999257         100.000000

üóº Paris arrondissements: 20

Paris elderly living alone:
      CODGEO  total_elderly  living_alone  pct_elderly_alone
29288  75118   26357.750186  14