# City Scan Data Cleaning
##### June 2025

Basic data cleaning pipeline for appropriate CSV preparation necessary for City Scan JavaScript plots with Cartagena, Colombia as the case study example city for pipeline scaling

In [1]:
# standard library imports
import os
import sys

# add project root to Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# third-party imports
import numpy as np
import pandas as pd

# change to project root directory
os.chdir('../')
print("directory changes")
print(f"current working directory is:", os.getcwd())

# local imports (after changing directory)
from src.clean import clean_pg, clean_pas, clean_uba


directory changes
current working directory is: /Users/carolinecullinan/dev/wb/city-scan-csv-viz-prep


# POPULATION AND DEMOGRAPHIC TRENDS

### pg.csv preparation
### Observable Notebook functions/charts:
#### 1.) "plot_pga" / "chart_pga" ; and
#### 2.) "plot_pgp" / "chart_pg"

In [None]:
# POPULATION & DEMOGRAPHIC TRENDS - pg.csv preparation for Observable Notebook plot functions/charts:
# 1.) "plot_pga"/"chart_pga" (absolute population growth); and 
# 2.) "plot_pgp"/"chart_pgp" (population growth percentage)

# load "raw" (i.e. "dirty") tabular output data
# NOTE: right now, csv file is for Tunis, Tunisia because no access to population-growth.csv for Cartagena, Colombia - fix later with appropriate data - this is just a placeholder for pipeline purposes
raw_df_pg = pd.read_csv('data/raw/population-growth.csv')

# display basic info about the raw data
print("Raw population growth data info:")
print(f"Shape: {raw_df_pg.shape}")
print(f"Columns: {list(raw_df_pg.columns)}")
print(f"Date range: {raw_df_pg['Year'].min()} - {raw_df_pg['Year'].max()}")
print(f"Data preview:")
print(raw_df_pg.head())
print("\n" + "="*50 + "\n")

# clean the data using our clean_pg function
try:
    cleaned_df_pg = clean_pg('data/raw/population-growth.csv')
    print("✅ Population growth data cleaned successfully!")
    
    # display cleaned data info
    print(f"\nCleaned data shape: {cleaned_df_pg.shape}")
    print(f"Cleaned data columns: {list(cleaned_df_pg.columns)}")
    print(f"Sample of cleaned data:")
    print(cleaned_df_pg.head(10))
    
    # basic data validation
    print(f"\nData validation:")
    print(f"- Missing values: {cleaned_df_pg.isnull().sum().sum()}")
    print(f"- Year range: {cleaned_df_pg['yearName'].min()} - {cleaned_df_pg['yearName'].max()}")
    print(f"- Population range: {cleaned_df_pg['population'].min():,} - {cleaned_df_pg['population'].max():,}")
    print(f"- Growth rate range: {cleaned_df_pg['populationGrowthPercentage'].min():.3f}% - {cleaned_df_pg['populationGrowthPercentage'].max():.3f}%")
    
    # check for any potential data quality issues
    if cleaned_df_pg['populationGrowthPercentage'].isna().sum() > 0:
        print(f"⚠️  Note: {cleaned_df_pg['populationGrowthPercentage'].isna().sum()} missing growth rate values (expected for first year)")
    
except Exception as e:
    print(f"❌ Error cleaning population growth data: {e}")
    print("Check that 'data/raw/population-growth.csv' exists and has the correct format")

# save the cleaned data as a CSV file - pg.csv, and export
# (This is handled automatically by the clean_pg function, but confirming)
if 'cleaned_df_pg' in locals():
    print(f"\n📁 Cleaned data saved to: data/processed/pg.csv")
    print(f"✅ Ready for Observable visualization!")
else:
    print("❌ No cleaned data available to save")

Raw population growth data info:
Shape: (22, 7)
Columns: ['Group', 'Location', 'Country', 'Year', 'Population', 'Source', 'Method']
Date range: 2000 - 2021
Data preview:
   Group Location  Country  Year  Population  Source  Method
0  Tunis    Tunis  Tunisia  2000     1969032  Oxford  Oxford
1  Tunis    Tunis  Tunisia  2001     1984750  Oxford  Oxford
2  Tunis    Tunis  Tunisia  2002     2000614  Oxford  Oxford
3  Tunis    Tunis  Tunisia  2003     2016605  Oxford  Oxford
4  Tunis    Tunis  Tunisia  2004     2035590  Oxford  Oxford


Cleaned data saved to: data/processed/pg.csv
Years covered: 2000 - 2021
Total data points: 22
Population range: 1,969,032 - 2,696,439
✅ Population growth data cleaned successfully!

Cleaned data shape: (22, 3)
Cleaned data columns: ['yearName', 'population', 'populationGrowthPercentage']
Sample of cleaned data:
   yearName  population  populationGrowthPercentage
0      2000     1969032                         NaN
1      2001     1984750                      

### pas.csv preparation
### Observable Notebook functions/charts:
#### 1.) "plot_pas" / "chart_pas"

In [4]:
# POPULATION AGE SEX - pas.csv preparation for Observable Notebook plot functions/charts:
# 1.) "plot_pas"/"chart_pas" (population age sex, i.e., population by sex and age bracket, (i.e., Population Distribution by Age & Sex, xxxx))

# load "raw" (i.e. "dirty") tabular output data
raw_df_pas = pd.read_csv('data/raw/2025-04-colombia-cartagena_02-process-output_tabular_cartagena_demographics.csv')

# display basic info about the raw data
print("Raw population age structure data info:")
print(f"Shape: {raw_df_pas.shape}")
print(f"Columns: {list(raw_df_pas.columns)}")
print(f"Age groups: {sorted(raw_df_pas['age_group'].unique())}")
print(f"Sex categories: {raw_df_pas['sex'].unique()}")
print(f"Total population: {raw_df_pas['population'].sum():,.0f}")
print(f"Data preview:")
print(raw_df_pas.head())
print("\n" + "="*50 + "\n")

# clean the data using our clean_pas function
try:
    cleaned_df_pas = clean_pas('data/raw/2025-04-colombia-cartagena_02-process-output_tabular_cartagena_demographics.csv')
    print("✅ Population age structure data cleaned successfully!")
    
    # display cleaned data info
    print(f"\nCleaned data shape: {cleaned_df_pas.shape}")
    print(f"Cleaned data columns: {list(cleaned_df_pas.columns)}")
    print(f"Sample of cleaned data:")
    print(cleaned_df_pas.head(10))
    
    # basic data validation
    print(f"\nData validation:")
    print(f"- Missing values: {cleaned_df_pas.isnull().sum().sum()}")
    print(f"- Age brackets: {sorted(cleaned_df_pas['ageBracket'].unique())}")
    print(f"- Sex categories: {sorted(cleaned_df_pas['sex'].unique())}")
    print(f"- Population count range: {cleaned_df_pas['count'].min():,.0f} - {cleaned_df_pas['count'].max():,.0f}")
    print(f"- Percentage range: {cleaned_df_pas['percentage'].min():.3f}% - {cleaned_df_pas['percentage'].max():.3f}%")
    print(f"- Year: {cleaned_df_pas['yearName'].iloc[0]}")
    
    # data quality checks
    total_percentage = cleaned_df_pas['percentage'].sum()
    print(f"- Total percentage sum: {total_percentage:.3f}% (should be ~100%)")
    
    if abs(total_percentage - 100) > 0.1:
        print(f"⚠️  Warning: Percentage sum deviates from 100% by {abs(total_percentage - 100):.3f}%")
    
    # check for balanced sex representation
    sex_counts = cleaned_df_pas.groupby('sex')['count'].sum()
    print(f"- Population by sex: Female: {sex_counts.get('female', 0):,.0f}, Male: {sex_counts.get('male', 0):,.0f}")
    
    # check age bracket coverage
    expected_brackets = len(cleaned_df_pas['ageBracket'].unique())
    actual_records = len(cleaned_df_pas)
    print(f"- Age brackets: {expected_brackets}, Total records: {actual_records}")
    
    if actual_records != expected_brackets * 2:  # Should be 2 records per age bracket (male/female)
        print(f"⚠️  Note: Expected {expected_brackets * 2} records (2 per age bracket), found {actual_records}")
    
except Exception as e:
    print(f"❌ Error cleaning population age structure data: {e}")
    print("Check that the demographics CSV file exists and has the correct format")
    print("Expected columns: age_group, sex, population")

# save the cleaned data as a CSV file - pas.csv, and export
# (This is handled automatically by the clean_pas function, but confirming)
if 'cleaned_df_pas' in locals():
    print(f"\n📁 Cleaned data saved to: data/processed/pas.csv")
    print(f"✅ Ready for Observable visualization!")
    
    # quick preview of the structure for Observable
    print(f"\n📊 Data structure summary for Observable:")
    print(f"- Columns: {list(cleaned_df_pas.columns)}")
    print(f"- Records per sex: {len(cleaned_df_pas[cleaned_df_pas['sex'] == 'female'])}, {len(cleaned_df_pas[cleaned_df_pas['sex'] == 'male'])}")
    print(f"- Data types: {dict(cleaned_df_pas.dtypes)}")
else:
    print("❌ No cleaned data available to save")

Raw population age structure data info:
Shape: (36, 3)
Columns: ['age_group', 'sex', 'population']
Age groups: ['0-1', '1-4', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '5-9', '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80+']
Sex categories: ['f' 'm']
Total population: 1,319,285
Data preview:
  age_group sex    population
0       1-4   f  42167.329278
1       1-4   m  45029.741031
2       0-1   f  10386.217327
3       0-1   m  11119.716253
4       5-9   f  50745.108513


Cleaned data saved to: data/processed/pas.csv
Total population: 1,319,285
Age brackets: 17
Sex categories: 2
Total records: 34
✅ Population age structure data cleaned successfully!

Cleaned data shape: (34, 5)
Cleaned data columns: ['ageBracket', 'sex', 'count', 'percentage', 'yearName']
Sample of cleaned data:
  ageBracket     sex     count  percentage  yearName
0        0-4  female  52553.55    3.983486      2021
1        0-4    male  56149.46    4.256051      2021
2      10-14

# BUILT FORM

### uba.csv preparation
### Observable Notebook functions/charts:
#### 1.) "plot_ubaa" / "chart_ubaa" ; and
#### 2.) "plot_ubap" / "chart_ubap"

In [2]:
# URBAN BUILT AREA - uba.csv preparation for Observable Notebook plot functions/charts:
# 1.) "plot_ubaa"/"chart_ubaa" (absolute urban extenet and change)
# 2.) "plot_ubap"/"chart_ubap" (urban extent and change growth percentage)

# load "raw" (i.e. "dirty") tabular output data
raw_df_uba = pd.read_csv('data/raw/2025-04-colombia-cartagena_other_02-process-output_tabular_cartagena_other_wsf_stats.csv')

# display basic info about the raw data
print("Raw urban built area data info:")
print(f"Shape: {raw_df_uba.shape}")
print(f"Columns: {list(raw_df_uba.columns)}")
print(f"Year range: {raw_df_uba['year'].min()} - {raw_df_uba['year'].max()}")
print(f"UBA range: {raw_df_uba['cumulative sq km'].min():.2f} - {raw_df_uba['cumulative sq km'].max():.2f} sq km")
print(f"Total data points: {len(raw_df_uba)}")
print(f"Data preview:")
print(raw_df_uba.head())
print("\n" + "="*50 + "\n")

# clean the data using our clean_uba function
try:
    cleaned_df_uba = clean_uba('data/raw/2025-04-colombia-cartagena_other_02-process-output_tabular_cartagena_other_wsf_stats.csv')
    print("✅ Urban built area data cleaned successfully!")
    
    # display cleaned data info
    print(f"\nCleaned data shape: {cleaned_df_uba.shape}")
    print(f"Cleaned data columns: {list(cleaned_df_uba.columns)}")
    print(f"Sample of cleaned data:")
    print(cleaned_df_uba.head(10))
    
    # basic data validation
    print(f"\nData validation:")
    print(f"- Missing values: {cleaned_df_uba.isnull().sum().sum()}")
    print(f"- Year range: {cleaned_df_uba['yearName'].min()} - {cleaned_df_uba['yearName'].max()}")
    print(f"- UBA range: {cleaned_df_uba['uba'].min():.2f} - {cleaned_df_uba['uba'].max():.2f} sq km")
    print(f"- Growth rate range: {cleaned_df_uba['ubaGrowthPercentage'].min():.3f}% - {cleaned_df_uba['ubaGrowthPercentage'].max():.3f}%")
    print(f"- Total urban expansion: {cleaned_df_uba['uba'].max() - cleaned_df_uba['uba'].min():.2f} sq km over {cleaned_df_uba['yearName'].max() - cleaned_df_uba['yearName'].min()} years")
    
    # data quality checks
    print(f"\nUrban growth analysis:")
    # Calculate average annual growth rate
    avg_growth = cleaned_df_uba['ubaGrowthPercentage'].mean()
    print(f"- Average annual UBA growth rate: {avg_growth:.3f}%")
    
    # Check for any potential data quality issues
    if cleaned_df_uba['ubaGrowthPercentage'].isna().sum() > 0:
        print(f"⚠️  Note: {cleaned_df_uba['ubaGrowthPercentage'].isna().sum()} missing growth rate values (expected for first year)")
    
    # Check for negative growth (urban area should generally increase)
    negative_growth = cleaned_df_uba[cleaned_df_uba['ubaGrowthPercentage'] < 0]
    if len(negative_growth) > 0:
        print(f"⚠️  Warning: {len(negative_growth)} years with negative UBA growth detected")
        print(f"   Years with decline: {negative_growth['yearName'].tolist()}")
    
    # Check for extremely high growth rates (potential data errors)
    high_growth = cleaned_df_uba[cleaned_df_uba['ubaGrowthPercentage'] > 20]  # >20% annual growth
    if len(high_growth) > 0:
        print(f"⚠️  Note: {len(high_growth)} years with very high UBA growth (>20%)")
        print(f"   High growth years: {high_growth['yearName'].tolist()}")
    
except Exception as e:
    print(f"❌ Error cleaning urban built area data: {e}")
    print("Check that the UBA CSV file exists and has the correct format")
    print("Expected columns: year, cumulative sq km")

# save the cleaned data as a CSV file - uba.csv, and export
# (This is handled automatically by the clean_uba function, but confirming)
if 'cleaned_df_uba' in locals():
    print(f"\n📁 Cleaned data saved to: data/processed/uba.csv")
    print(f"✅ Ready for Observable visualization!")
    
    # quick preview of the structure for Observable
    print(f"\n📊 Data structure summary for Observable:")
    print(f"- Columns: {list(cleaned_df_uba.columns)}")
    print(f"- Time series length: {len(cleaned_df_uba)} years")
    print(f"- Data types: {dict(cleaned_df_uba.dtypes)}")
    print(f"- Urban expansion factor: {cleaned_df_uba['uba'].max() / cleaned_df_uba['uba'].min():.2f}x growth over period")
else:
    print("❌ No cleaned data available to save")

Raw urban built area data info:
Shape: (31, 2)
Columns: ['year', 'cumulative sq km']
Year range: 1985 - 2015
UBA range: 98.56 - 184.92 sq km
Total data points: 31
Data preview:
   year  cumulative sq km
0  1985         98.562692
1  1986        100.892675
2  1987        103.206772
3  1988        104.745090
4  1989        106.680565


Cleaned data saved to: data/processed/uba.csv
Years covered: 1985 - 2015
Total data points: 31
UBA range: 98.56 - 184.92 sq km
✅ Urban built area data cleaned successfully!

Cleaned data shape: (31, 4)
Cleaned data columns: ['year', 'yearName', 'uba', 'ubaGrowthPercentage']
Sample of cleaned data:
   year  yearName     uba  ubaGrowthPercentage
0     1      1985   98.56                  NaN
1     2      1986  100.89                2.364
2     3      1987  103.21                2.300
3     4      1988  104.75                1.492
4     5      1989  106.68                1.842
5     6      1990  109.99                3.103
6     7      1991  115.45            

### URBAN EXTENT AND CHANGE

In [10]:
# generate urban extent and change data for Tunis, Tunisia given "tabular" output from City Scan GCP process
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)
uba = [
      { "year": 1, "yearName": "1985", "uba": 166.08101814331798},
      { "year": 2, "yearName": "1986", "uba": 172.44786586480038},
      { "year": 3, "yearName": "1987", "uba": 185.11900630951914},
      { "year": 4, "yearName": "1988", "uba": 202.44611527369096},
      { "year": 5, "yearName": "1989", "uba": 209.3484337801595},
      { "year": 6, "yearName": "1990", "uba": 218.8655512133127},
      { "year": 7, "yearName": "1991", "uba": 226.97080233605266},
      { "year": 8, "yearName": "1992", "uba": 231.32775796388964},
      { "year": 9, "yearName": "1993", "uba": 235.31000915223282},
      { "year": 10, "yearName": "1994", "uba": 239.59690318203425},
      { "year": 11, "yearName": "1995", "uba": 244.28727695052265},
      { "year": 12, "yearName": "1996", "uba": 249.05396781687122},
      { "year": 13, "yearName": "1997", "uba": 253.29457114797057},
      { "year": 14, "yearName": "1998", "uba": 256.0263479213753},
      { "year": 15, "yearName": "1999", "uba": 258.9113844404829},
      { "year": 16, "yearName": "2000", "uba": 261.66818321318607},
      { "year": 17, "yearName": "2001", "uba": 264.4919158340125},
      { "year": 18, "yearName": "2002", "uba": 267.0854460612935},
      { "year": 19, "yearName": "2003", "uba": 270.3870988687197},
      { "year": 20, "yearName": "2004", "uba": 276.21722470525106},
      { "year": 21, "yearName": "2005", "uba": 281.41742170944474},
      { "year": 22, "yearName": "2006", "uba": 288.76825955333743},
      { "year": 23, "yearName": "2007", "uba": 294.66531923799204},
      { "year": 24, "yearName": "2008", "uba": 301.9654875333054},
      { "year": 25, "yearName": "2009", "uba": 310.2853023000293},
      { "year": 26, "yearName": "2010", "uba": 317.42908309972756},
      { "year": 27, "yearName": "2011", "uba": 322.2602056142696},
      { "year": 28, "yearName": "2012", "uba": 327.013134381004},
      { "year": 29, "yearName": "2013", "uba": 332.55550722560355},
      { "year": 30, "yearName": "2014", "uba": 337.01380195059915},
      { "year": 31, "yearName": "2015", "uba": 341.3507399789974}
]

# convert uba list to dataframe, uba_df
uba_df = pd.DataFrame(uba)

# create output CSV of df for plotting
uba_output_df = pd.DataFrame({
    'year': uba_df['year'],
    'yearName': uba_df['yearName'],
    'uba': uba_df['uba'].round(2),  # round the count to 2 decimal places
})

# calculate uba growth rate as a percentage for each year and round it to 3 decimal places
uba_output_df['ubaGrowthPercentage'] = uba_output_df['uba'].pct_change() * 100
uba_output_df['ubaGrowthPercentage'] = uba_output_df['ubaGrowthPercentage'].round(3)

# save uba_output_df for uba data to CSV
uba_output_df.to_csv('data/processed/uba.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [11]:
# uba data check
print("\nFirst 10 rows of the output:")
print(uba_output_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(uba_output_df)}")
print(f"Year names: {uba_output_df['yearName'].unique()}")
print(f"UBA values: {uba_output_df['uba'].unique()}")


First 10 rows of the output:
   year yearName     uba  ubaGrowthPercentage
0     1     1985  166.08                  NaN
1     2     1986  172.45                3.836
2     3     1987  185.12                7.347
3     4     1988  202.45                9.361
4     5     1989  209.35                3.408
5     6     1990  218.87                4.547
6     7     1991  226.97                3.701
7     8     1992  231.33                1.921
8     9     1993  235.31                1.720
9    10     1994  239.60                1.823

Total number of records: 31
Year names: ['1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015']
UBA values: [166.08 172.45 185.12 202.45 209.35 218.87 226.97 231.33 235.31 239.6
 244.29 249.05 253.29 256.03 258.91 261.67 264.49 267.09 270.39 276.22
 281.42 288.77 294.67 301.97 310.29 317.43 322.2

### DENSITY & POPULATION-URBAN GROWTH RATIO


In [12]:
# generate pug data (population urban growth) and population-urban growth for Tunis, Tunisia given pg.csv genergated via df and uba.csv generated via uba_output_df

# Note: the poulation growth rate / urban growth rate ratio can only be calculated if the yearName is the same in both pg.csv and uba.csv (i.e., for dataset years that overlap, 2000-2015 in the case of Tunis, Tunisia) - can we call MORE Oxford/WorldPop data for the years 1985-onward for more information regaridng this ratio (i.e., given that the UBA data goes back to 1985)?

# read pg.csv and uba.csv
pg_df = pd.read_csv('data/processed/pg.csv')
uba_df = pd.read_csv('data/processed/uba.csv')

# merge pg_df and uba_df on yearName to create pug
pug_df = pd.merge(pg_df, uba_df, on='yearName', how='inner')

# calculate density
pug_df['density'] = pug_df['population'] / pug_df['uba']
pug_df['density'] = pug_df['density'].round(3)

# calculate population-urban growth percentage ratio
pug_df['populationUrbanGrowthRatio'] = pug_df['populationGrowthPercentage'] / pug_df['ubaGrowthPercentage']
pug_df['populationUrbanGrowthRatio'] = pug_df['populationUrbanGrowthRatio'].round(3)


# save pug_df for population urban growth data to CSV
pug_df.to_csv('data/processed/pug.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [13]:
# pug data check
print("\nFirst 10 rows of the output:")
print(pug_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(pug_df)}")
print(f"Year names: {pug_df['yearName'].unique()}")
print(f"Population urban growth values: {pug_df['populationUrbanGrowthRatio'].unique()}")


First 10 rows of the output:
   yearName  population  populationGrowthPercentage  year     uba  \
0      2000     1969032                         NaN    16  261.67   
1      2001     1984750                       0.798    17  264.49   
2      2002     2000614                       0.799    18  267.09   
3      2003     2016605                       0.799    19  270.39   
4      2004     2035590                       0.941    20  276.22   
5      2005     2070274                       1.704    21  281.42   
6      2006     2105593                       1.706    22  288.77   
7      2007     2141514                       1.706    23  294.67   
8      2008     2178094                       1.708    24  301.97   
9      2009     2215198                       1.704    25  310.29   

   ubaGrowthPercentage   density  populationUrbanGrowthRatio  
0                1.066  7524.867                         NaN  
1                1.078  7504.064                       0.740  
2                0.98

### LAND COVER (need alternative to donut chart - do tree map)


In [14]:
# use the clean_land_cover_csv function from lc_cleanup.py to clean the tabular-output land cover csv file so that it can be plotted
clean_land_cover_csv('data/raw/2025-02-tunisia-tunis_02-process-output_tabular_tunis_lc.csv', 'data/processed/')


NameError: name 'clean_land_cover_csv' is not defined

In [29]:
# land cover data for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

## This needs to be automated given the tabular-output from the GCP - hard coded for now
# (edit lc so that percentage is calculated from pixelCount and pixelTotal)
lc = [
      { "lcType": "Built up", "pixelCount": 3438092.2117647, "pixelTotal": 6068006.37255, "percentage": 56.66 },
      { "lcType": "Grassland", "pixelCount": 1022591.69411765, "pixelTotal": 6068006.37255, "percentage": 16.85 },
      { "lcType": "Permanent water bodies", "pixelCount": 563266.486274511, "pixelTotal": 6068006.37255, "percentage": 9.28},
      { "lcType": "Tree cover", "pixelCount": 385035.090196078, "pixelTotal": 6068006.37255, "percentage": 6.345 },
      { "lcType": "Cropland", "pixelCount": 346731.71372549, "pixelTotal": 6068006.37255, "percentage": 5.71},
      { "lcType": "Bare sparse vegetation", "pixelCount": 168537.729411765, "pixelTotal": 6068006.37255, "percentage": 2.78},
      { "lcType": "Shrubland", "pixelCount": 15153.5294117647, "pixelTotal": 6068006.37255, "percentage": 2.49},
      { "lcType": "Herbaceous wetland", "pixelCount": 128597.917647059, "pixelTotal": 6068006.37255, "percentage": 2.12},
      { "lcType": "Snow and ice", "pixelCount": 0, "pixelTotal": 6068006.37255, "percentage": 0},
      { "lcType": "Mangroves", "pixelCount": 0, "pixelTotal": 6068006.37255, "percentage": 0},
      { "lcType": "Moss and lichens", "pixelCount": 0, "pixelTotal": 6068006.37255, "percentage": 0},
]

# convert lc list to dataframe, lc_df
lc_df = pd.DataFrame(lc)

# create output CSV of lc_df for plotting
lc_output_df = pd.DataFrame({
    'lcType': lc_df['lcType'],
    'pixelCount': lc_df['pixelCount'],
    'pixelTotal': lc_df['pixelTotal'], 
    'percentage': lc_df['percentage']
})

# save lc_output_df for lc data to CSV
lc_output_df.to_csv('data/processed/lc.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


# 6 CLIMATE CONDITIONS

### PHOTOVOLTAIC POWER POTENTIAL


In [52]:
# generate photovoltaic power potential data (i.e., seasonal availa bility of solar energy, plotting the "max" value ) for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

pv = [
      { "month": 1, "monthName": "Jan", "max": 3.31, "min": 3.04, "mean": 3.20},
      { "month": 2, "monthName": "Feb", "max": 3.94, "min": 3.72, "mean": 3.84},
      { "month": 3, "monthName": "Mar", "max": 4.53, "min": 4.32, "mean": 4.44},
      { "month": 4, "monthName": "Apr", "max": 4.87, "min": 4.70, "mean": 4.79},
      { "month": 5, "monthName": "May", "max": 5.17, "min": 4.99, "mean": 5.09},
      { "month": 6, "monthName": "Jun", "max": 5.47, "min": 5.30, "mean": 5.39},
      { "month": 7, "monthName": "Jul", "max": 5.68, "min": 5.52, "mean": 5.60},
      { "month": 8, "monthName": "Aug", "max": 5.38, "min": 5.25, "mean": 5.31},
      { "month": 9, "monthName": "Sep", "max": 4.59, "min": 4.40, "mean": 4.52},
      { "month": 10, "monthName": "Oct", "max": 4.11, "min": 3.91, "mean": 4.03},
      { "month": 11, "monthName": "Nov", "max": 3.44, "min": 3.13, "mean": 3.32},
      { "month": 12, "monthName": "Dec", "max": 3.14, "min": 2.84, "mean": 3.03}
]

# convert pv list to dataframe, pv_df
pv_df = pd.DataFrame(pv)

# create output CSV of pv_df for plotting
pv_output_df = pd.DataFrame({
    'month': pv_df['month'],
    'monthName': pv_df['monthName'],
    'maxPv': pv_df['max'].round(2),  # round the count to 2 decimal places
})

# save pv_output_df for pv data to CSV
pv_output_df.to_csv('data/processed/pv.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [53]:
# pv data check
print("\nFirst 10 rows of the output:")
print(pv_output_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(pv_output_df)}")
print(f"Month names: {pv_output_df['monthName'].unique()}")
print(f"PV values: {pv_output_df['maxPv'].unique()}")


First 10 rows of the output:
   month monthName  maxPv
0      1       Jan   3.31
1      2       Feb   3.94
2      3       Mar   4.53
3      4       Apr   4.87
4      5       May   5.17
5      6       Jun   5.47
6      7       Jul   5.68
7      8       Aug   5.38
8      9       Sep   4.59
9     10       Oct   4.11

Total number of records: 12
Month names: ['Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun' 'Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec']
PV values: [3.31 3.94 4.53 4.87 5.17 5.47 5.68 5.38 4.59 4.11 3.44 3.14]


# 7 RISK IDENTIFICATION

### URBAN BUILT-UP AREA EXPOSED TO RIVER FLOODING


In [23]:
# generate built-up area exposed to river flooding data (i.e., built-up area exposed to fluvial flooding) for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

fu = [
      { "year": 1, "yearName": "1985", "fu": 3.351537941381818},
      { "year": 2, "yearName": "1986", "fu": 3.4746915665244567},
      { "year": 3, "yearName": "1987", "fu": 3.6828798375989176},
      { "year": 4, "yearName": "1988", "fu": 3.9775688691902324},
      { "year": 5, "yearName": "1989", "fu": 4.116849754768217},
      { "year": 6, "yearName": "1990", "fu": 4.237071150740793},
      { "year": 7, "yearName": "1991", "fu": 4.441594135352676},
      { "year": 8, "yearName": "1992", "fu": 4.598468395951038},
      { "year": 9, "yearName": "1993", "fu": 4.801525265977889},
      { "year": 10, "yearName": "1994", "fu": 5.022908568317633},
      { "year": 11, "yearName": "1995", "fu": 5.185647287256121},
      { "year": 12, "yearName": "1996", "fu": 5.427556193786304},
      { "year": 13, "yearName": "1997", "fu": 5.631346121105671},
      { "year": 14, "yearName": "1998", "fu": 5.777957579608812},
      { "year": 15, "yearName": "1999", "fu": 6.023531772601575},
      { "year": 16, "yearName": "2000", "fu": 6.172342402982264},
      { "year": 17, "yearName": "2001", "fu": 6.3563397834037065},
      { "year": 18, "yearName": "2002", "fu": 6.53667187736257},
      { "year": 19, "yearName": "2003", "fu": 6.648829643117474},
      { "year": 20, "yearName": "2004", "fu": 6.747059320314579},
      { "year": 21, "yearName": "2005", "fu": 6.824763393321244},
      { "year": 22, "yearName": "2006", "fu": 6.928857528858474},
      { "year": 23, "yearName": "2007", "fu": 7.034417778980736},
      { "year": 24, "yearName": "2008", "fu": 7.114321023864949},
      { "year": 25, "yearName": "2009", "fu": 7.191292039579098},
      { "year": 26, "yearName": "2010", "fu": 7.28658948760614},
      { "year": 27, "yearName": "2011", "fu": 7.3327720970346295},
      { "year": 28, "yearName": "2012", "fu": 7.450061263837143},
      { "year": 29, "yearName": "2013", "fu": 7.5050405607758215},
      { "year": 30, "yearName": "2014", "fu": 7.6149991546531774},
      { "year": 31, "yearName": "2015", "fu": 7.672177623469403},  

]

# convert fu list to dataframe, fu_df
fu_df = pd.DataFrame(fu)

# create output CSV of fu_df for plotting
fu_output_df = pd.DataFrame({
    'year': fu_df['year'],
    'yearName': fu_df['yearName'],
    'fu': fu_df['fu'].round(2),  # round the count to 2 decimal places
})

# save fu_output_df for fu data to CSV
fu_output_df.to_csv('data/processed/fu.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [24]:
# fu data check
print("\nFirst 10 rows of the output:")
print(fu_output_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(fu_output_df)}")
print(f"Year names: {fu_output_df['yearName'].unique()}")
print(f"fu values: {fu_output_df['fu'].unique()}")


First 10 rows of the output:
   year yearName    fu
0     1     1985  3.35
1     2     1986  3.47
2     3     1987  3.68
3     4     1988  3.98
4     5     1989  4.12
5     6     1990  4.24
6     7     1991  4.44
7     8     1992  4.60
8     9     1993  4.80
9    10     1994  5.02

Total number of records: 31
Year names: ['1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015']
fu values: [3.35 3.47 3.68 3.98 4.12 4.24 4.44 4.6  4.8  5.02 5.19 5.43 5.63 5.78
 6.02 6.17 6.36 6.54 6.65 6.75 6.82 6.93 7.03 7.11 7.19 7.29 7.33 7.45
 7.51 7.61 7.67]


### URBAN BUILT-UP AREA EXPOSED TO RAINWATER FLOODING


In [25]:
# generate built-up area exposed to rainwater flooding data (i.e., built-up area exposed to pluvial flooding) for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

pu = [
      { "year": 1, "yearName": "1985", "pu": 29.78265168032819},
      { "year": 2, "yearName": "1986", "pu": 31.410038869713063},
      { "year": 3, "yearName": "1987", "pu": 33.22875401244453},
      { "year": 4, "yearName": "1988", "pu": 34.975629540509466},
      { "year": 5, "yearName": "1989", "pu": 36.07668159386806},
      { "year": 6, "yearName": "1990", "pu": 37.00913046994804},
      { "year": 7, "yearName": "1991", "pu": 38.23993366408192},
      { "year": 8, "yearName": "1992", "pu": 39.27867584757668},
      { "year": 9, "yearName": "1993", "pu": 40.277099879983076},
      { "year": 10, "yearName": "1994", "pu": 41.435330402157895},
      { "year": 11, "yearName": "1995", "pu": 42.235095908292536},
      { "year": 12, "yearName": "1996", "pu": 43.18880344585547},
      { "year": 13, "yearName": "1997", "pu": 44.074336655214445},
      { "year": 14, "yearName": "1998", "pu": 44.960602921865934},
      { "year": 15, "yearName": "1999", "pu": 45.93630217820434},
      { "year": 16, "yearName": "2000", "pu": 46.66862641342754},
      { "year": 17, "yearName": "2001", "pu": 47.94121387323481},
      { "year": 18, "yearName": "2002", "pu": 49.155156749640824},
      { "year": 19, "yearName": "2003", "pu": 50.08247422467319},
      { "year": 20, "yearName": "2004", "pu": 50.62566967842733},
      { "year": 21, "yearName": "2005", "pu": 51.11975029358292},
      { "year": 22, "yearName": "2006", "pu": 51.73405230471108},
      { "year": 23, "yearName": "2007", "pu": 52.47737239932201},
      { "year": 24, "yearName": "2008", "pu": 53.22948918144312},
      { "year": 25, "yearName": "2009", "pu": 53.95374978644864},
      { "year": 26, "yearName": "2010", "pu": 54.567318740284286},
      { "year": 27, "yearName": "2011", "pu": 54.98369528243321},
      { "year": 28, "yearName": "2012", "pu": 55.5386196528676},
      { "year": 29, "yearName": "2013", "pu": 56.0422300128259},
      { "year": 30, "yearName": "2014", "pu": 56.627209732253434},
      { "year": 31, "yearName": "2015", "pu": 56.92336487842978},  

]

# convert pu list to dataframe, pu_df
pu_df = pd.DataFrame(pu)

# create output CSV of pu_df for plotting
pu_output_df = pd.DataFrame({
    'year': pu_df['year'],
    'yearName': pu_df['yearName'],
    'pu': pu_df['pu'].round(2),  # round the count to 2 decimal places
})

# save pu_output_df for pu data to CSV
pu_output_df.to_csv('data/processed/pu.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [26]:
# pu data check
print("\nFirst 10 rows of the output:")
print(pu_output_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(pu_output_df)}")
print(f"Year names: {pu_output_df['yearName'].unique()}")
print(f"pu values: {pu_output_df['pu'].unique()}")


First 10 rows of the output:
   year yearName     pu
0     1     1985  29.78
1     2     1986  31.41
2     3     1987  33.23
3     4     1988  34.98
4     5     1989  36.08
5     6     1990  37.01
6     7     1991  38.24
7     8     1992  39.28
8     9     1993  40.28
9    10     1994  41.44

Total number of records: 31
Year names: ['1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015']
pu values: [29.78 31.41 33.23 34.98 36.08 37.01 38.24 39.28 40.28 41.44 42.24 43.19
 44.07 44.96 45.94 46.67 47.94 49.16 50.08 50.63 51.12 51.73 52.48 53.23
 53.95 54.57 54.98 55.54 56.04 56.63 56.92]


### URBAN BUILT-UP AREA EXPOSED TO COASTAL FLOODING


In [27]:
# generate built-up area exposed to coastal flooding data for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

cu = [
      { "year": 1, "yearName": "1985", "cu": 2.2739437213837266},
      { "year": 2, "yearName": "1986", "cu": 2.5451749196145386},
      { "year": 3, "yearName": "1987", "cu": 2.833266435573212},
      { "year": 4, "yearName": "1988", "cu": 2.955687003423335},
      { "year": 5, "yearName": "1989", "cu": 3.063446425423144},
      { "year": 6, "yearName": "1990", "cu": 3.1396843838447777},
      { "year": 7, "yearName": "1991", "cu": 3.2349818318718198},
      { "year": 8, "yearName": "1992", "cu": 3.317084248633579},
      { "year": 9, "yearName": "1993", "cu": 3.3815932903749615},
      { "year": 10, "yearName": "1994", "cu": 3.4636957071367207},
      { "year": 11, "yearName": "1995", "cu": 3.53040392075565},
      { "year": 12, "yearName": "1996", "cu": 3.600777420837158},
      { "year": 13, "yearName": "1997", "cu": 3.6652864625785404},
      { "year": 14, "yearName": "1998", "cu": 3.7437235928777213},
      { "year": 15, "yearName": "1999", "cu": 3.811164863789166},
      { "year": 16, "yearName": "2000", "cu": 3.8639449888502972},
      { "year": 17, "yearName": "2001", "cu": 3.942382119149478},
      { "year": 18, "yearName": "2002", "cu": 4.03254816612891},
      { "year": 19, "yearName": "2003", "cu": 4.176960452754504},
      { "year": 20, "yearName": "2004", "cu": 4.2385372653258235},
      { "year": 21, "yearName": "2005", "cu": 4.319906624795067},
      { "year": 22, "yearName": "2006", "cu": 4.385881781121481},
      { "year": 23, "yearName": "2007", "cu": 4.469450312468272},
      { "year": 24, "yearName": "2008", "cu": 4.6079981407537405},
      { "year": 25, "yearName": "2009", "cu": 4.754609599256882},
      { "year": 26, "yearName": "2010", "cu": 4.844042588943799},
      { "year": 27, "yearName": "2011", "cu": 4.96499704220889},
      { "year": 28, "yearName": "2012", "cu": 5.050764745433228},
      { "year": 29, "yearName": "2013", "cu": 5.168053912235742},
      { "year": 30, "yearName": "2014", "cu": 5.225232381051967},
      { "year": 31, "yearName": "2015", "cu": 5.311733141568821},  

]

# convert cu list to dataframe, cu_df
cu_df = pd.DataFrame(cu)

# create output CSV of cu_df for plotting
cu_output_df = pd.DataFrame({
    'year': cu_df['year'],
    'yearName': cu_df['yearName'],
    'cu': cu_df['cu'].round(2),  # round the count to 2 decimal places
})

# save cu_output_df for cu data to CSV
cu_output_df.to_csv('data/processed/cu.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [28]:
# cu data check
print("\nFirst 10 rows of the output:")
print(cu_output_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(cu_output_df)}")
print(f"Year names: {cu_output_df['yearName'].unique()}")
print(f"cu values: {cu_output_df['cu'].unique()}")


First 10 rows of the output:
   year yearName    cu
0     1     1985  2.27
1     2     1986  2.55
2     3     1987  2.83
3     4     1988  2.96
4     5     1989  3.06
5     6     1990  3.14
6     7     1991  3.23
7     8     1992  3.32
8     9     1993  3.38
9    10     1994  3.46

Total number of records: 31
Year names: ['1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015']
cu values: [2.27 2.55 2.83 2.96 3.06 3.14 3.23 3.32 3.38 3.46 3.53 3.6  3.67 3.74
 3.81 3.86 3.94 4.03 4.18 4.24 4.32 4.39 4.47 4.61 4.75 4.84 4.96 5.05
 5.17 5.23 5.31]


### URBAN BUILT-UP AREA EXPOSED TO COMBINED RIVER, RAINWATER, AND COASTAL FLOODING


In [29]:
# generate built-up area exposed to combined flooding data for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

comb = [
      { "year": 1, "yearName": "1985", "comb": 32.76106345981951},
      { "year": 2, "yearName": "1986", "comb": 34.641355415122305},
      { "year": 3, "yearName": "1987", "comb": 36.74669595922742},
      { "year": 4, "yearName": "1988", "comb": 38.72814982089738},
      { "year": 5, "yearName": "1989", "comb": 39.97801250463666},
      { "year": 6, "yearName": "1990", "comb": 41.01895386000896},
      { "year": 7, "yearName": "1991", "comb": 42.41176271578881},
      { "year": 8, "yearName": "1992", "comb": 43.57512463901124},
      { "year": 9, "yearName": "1993", "comb": 44.71063038511807},
      { "year": 10, "yearName": "1994", "comb": 46.004476506408295},
      { "year": 11, "yearName": "1995", "comb": 46.92373035122299},
      { "year": 12, "yearName": "1996", "comb": 48.01525265977888},
      { "year": 13, "yearName": "1997", "comb": 49.01514280677031},
      { "year": 14, "yearName": "1998", "comb": 50.03116021419708},
      { "year": 15, "yearName": "1999", "comb": 51.17326347593655},
      { "year": 16, "yearName": "2000", "comb": 52.02727522171735},
      { "year": 17, "yearName": "2001", "comb": 53.43621133793255},
      { "year": 18, "yearName": "2002", "comb": 54.78357064157642},
      { "year": 19, "yearName": "2003", "comb": 55.887554924105075},
      { "year": 20, "yearName": "2004", "comb": 56.53850979985902},
      { "year": 21, "yearName": "2005", "comb": 57.14034983701442},
      { "year": 22, "yearName": "2006", "comb": 57.87780547328522},
      { "year": 23, "yearName": "2007", "comb": 58.721554416970804},
      { "year": 24, "yearName": "2008", "comb": 59.591693423186946},
      { "year": 25, "yearName": "2009", "comb": 60.43690848145756},
      { "year": 26, "yearName": "2010", "comb": 61.1435757114427},
      { "year": 27, "yearName": "2011", "comb": 61.66038110266628},
      { "year": 28, "yearName": "2012", "comb": 62.301073176325005},
      { "year": 29, "yearName": "2013", "comb": 62.89631569784776},
      { "year": 30, "yearName": "2014", "comb": 63.53700777150649},
      { "year": 31, "yearName": "2015", "comb": 63.918197563614655},  

]

# convert comb list to dataframe, comb_df
comb_df = pd.DataFrame(comb)

# create output CSV of comb_df for plotting
comb_output_df = pd.DataFrame({
    'year': comb_df['year'],
    'yearName': comb_df['yearName'],
    'comb': comb_df['comb'].round(2),  # round the count to 2 decimal places
})

# save comb_output_df for comb data to CSV
comb_output_df.to_csv('data/processed/comb.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [30]:
# comb data check
print("\nFirst 10 rows of the output:")
print(comb_output_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(comb_output_df)}")
print(f"Year names: {comb_output_df['yearName'].unique()}")
print(f"comb values: {comb_output_df['comb'].unique()}")


First 10 rows of the output:
   year yearName   comb
0     1     1985  32.76
1     2     1986  34.64
2     3     1987  36.75
3     4     1988  38.73
4     5     1989  39.98
5     6     1990  41.02
6     7     1991  42.41
7     8     1992  43.58
8     9     1993  44.71
9    10     1994  46.00

Total number of records: 31
Year names: ['1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015']
comb values: [32.76 34.64 36.75 38.73 39.98 41.02 42.41 43.58 44.71 46.   46.92 48.02
 49.02 50.03 51.17 52.03 53.44 54.78 55.89 56.54 57.14 57.88 58.72 59.59
 60.44 61.14 61.66 62.3  62.9  63.54 63.92]


### ELEVATION (need alternative to donut chart)


In [31]:
# elevation data for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

## This needs to be automated given the tabular-output from the GCP - hard coded for now

elevation = [
      { "bin": "-5-40m", "count": 413599, "total": 549697, "percentage": 75.24},
      { "bin": "40-90m", "count": 94379, "total": 549697, "percentage": 17.17 },
      { "bin": "90-135m", "count": 32786 , "total": 549697, "percentage": 5.96 },
      { "bin": "135-185m", "count": 8043, "total": 549697, "percentage": 1.46 },
      { "bin": "135-235", "count": 890, "total": 549697, "percentage": 0.16 },
]

# convert elevation list to dataframe, elevation_df
elevation_df = pd.DataFrame(elevation)

# create output CSV of elevation_df for plotting
elevation_output_df = pd.DataFrame({
    'bin': elevation_df['bin'],
    'count': elevation_df['count'],
    'total': elevation_df['total'], 
    'percentage': elevation_df['percentage']
})

# save elevation_output_df for elevation data to CSV
elevation_output_df.to_csv('data/processed/elevation.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


### SLOPE (need alternative to donut chart)


In [32]:
# slope data for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

## This needs to be automated given the tabular-output from the GCP - hard coded for now

slope = [
      { "bin": "0-2", "count": 428343, "total": 549702, "percentage": 77.92 },
      { "bin": "2-5", "count": 79034, "total": 549702, "percentage": 14.38 },
      { "bin": "5-10", "count": 31121, "total": 549702, "percentage": 5.66 },
      { "bin": "10-20", "count": 10147, "total": 549702, "percentage": 1.85 },
      { "bin": "20+", "count": 1057, "total": 549702, "percentage": 0.19 },
]

# convert slope list to dataframe, slope_df
slope_df = pd.DataFrame(slope)

# create output CSV of slope_df for plotting
slope_output_df = pd.DataFrame({
    'bin': slope_df['bin'],
    'count': slope_df['count'],
    'total': slope_df['total'], 
    'percentage': slope_df['percentage']
})

# save slope_output_df for slope data to CSV
slope_output_df.to_csv('data/processed/slope.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


### EARTHQUAKE EVENTS


In [37]:
# generate earthquake event data for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

# manually added earthquake event data for Tunis, Tunisia
ee = [
    {"begin_year": 1903, "distance": 316, "eqMagnitude": 5.3, "severity": "Moderate", "text": "MAY 1903; MNA; 316 km away; NA damage", "line1": "MAY 1903", "line2": "M5.3, 316 km away", "line3": "Palermo"},
    {"begin_year": 1906, "distance": 335, "eqMagnitude": 5.6, "severity": "Large", "text": "SEPTEMBER 1906; MNA; 336 km away; NA damage", "line1": "SEPTEMBER 1906", "line2": "M5.6, 335 km away", "line3": "Sicily"},
    {"begin_year": 1907, "distance": 335, "eqMagnitude": 5.1, "severity": "Moderate", "text": "FEBRUARY 1907; MNA; 336 km away; NA damage", "line1": "FEBRUARY 1907", "line2": "M5.1, 335 km away", "line3": "Sicily"},
    {"begin_year": 1908, "distance": 497, "eqMagnitude": 7.0, "severity": "Very Large", "text": "DECEMBER 1908; M7; 498 km away; Extreme damage; 78,000 fatalities", "line1": "DECEMBER 1908", "line2": "M7.0, 498 km away, 78,000 fatalities", "line3": "Messina, Sicily, Calabria"},
    {"begin_year": 1909, "distance": 452, "eqMagnitude": 5.3, "severity": "Moderate", "text": "OCTOBER 1909; MNA; 453 km away; NA damage", "line1": "OCTOBER 1909", "line2": "M5.3, 453 km away", "line3": "Sicily"},
    {"begin_year": 1911, "distance": 652, "eqMagnitude": 4.3, "severity": "Small", "text": "OCTOBER 1911; M4.3; 453 km away; NA damage", "line1": "OCTOBER 1911", "line2": "M4.3, 453 km away", "line3": "Etna"},
    {"begin_year": 1914, "distance": 452, "eqMagnitude": 4.9, "severity": "Small", "text": "MAY 1914; M4.9; 453 km away; Severe damage; 120 fatalities", "line1": "MAY 1914", "line2": "M4.9, 453 km away, 120 fatalities", "line3": "Catania, Etna"},
    {"begin_year": 1916, "distance": 494, "eqMagnitude": 5.1, "severity": "Moderate", "text": "JULY 1916; M5.1; 494 km away; Limited damage", "line1": "JULY 1916", "line2": "M5.1, 494 km away", "line3": "Stromboli Island"},
    {"begin_year": 1924, "distance": 489, "eqMagnitude": 5.6, "severity": "Large", "text": "MARCH 1924; M5.6; 489 km away; NA damage", "line1": "MARCH 1924", "line2": "M5.6, 489 km away", "line3": "Batna"},
    {"begin_year": 1926, "distance": 451, "eqMagnitude": 5.3, "severity": "Moderate", "text": "AUGUST 1926; M5.3; 451 km away; Severe damage", "line1": "AUGUST 1926", "line2": "M5.3, 451 km away", "line3": "Salina Island"},
    {"begin_year": 1930, "distance": 434, "eqMagnitude": 5.0, "severity": "Moderate", "text": "MARCH 1930; MNA; 434 km away; Moderate damage", "line1": "MARCH 1930", "line2": "M5.0, 434 km away", "line3": "Filicudi Island"},
    {"begin_year": 1931, "distance": 444, "eqMagnitude": 5.0, "severity": "Moderate", "text": "JULY 1931; MNA; 444 km away; NA damage", "line1": "JULY 1931", "line2": "M5.0, 444 km away", "line3": "Sicily"},
    {"begin_year": 1933, "distance": 256, "eqMagnitude": 5.0, "severity": "Moderate", "text": "FEBRUARY 1933; MNA; 256 km away; NA damage", "line1": "FEBRUARY 1933", "line2": "M5.0, 256 km away", "line3": "Sicily"},
    {"begin_year": 1939, "distance": 437, "eqMagnitude": 5.0, "severity": "Moderate", "text": "JANUARY 1939; MNA; 438 km away; NA damage", "line1": "JANUARY 1939", "line2": "M5.0, 438 km away", "line3": "Calabria"},
    {"begin_year": 1940, "distance": 323, "eqMagnitude": 4.8, "severity": "Small", "text": "JANUARY 1940; M4.8; 324 km away; NA damage", "line1": "JANUARY 1940", "line2": "M4.8, 324 km away", "line3": "NA"},
    {"begin_year": 1941, "distance": 240, "eqMagnitude": 5.0, "severity": "Moderate", "text": "MARCH 1941; MNA; 241 km away; NA damage", "line1": "MARCH 1941", "line2": "M5.0, 241 km away", "line3": "Calabria"},
    {"begin_year": 1946, "distance": 481, "eqMagnitude": 5.6, "severity": "Large", "text": "FEBRUARY 1946; M5.6; 481 km away; Severe damage; 264 fatalities", "line1": "FEBRUARY 1946", "line2": "M5.6, 481 km away, 264 fatalities", "line3": "Hodna Mountains"},
    {"begin_year": 1947, "distance": 196, "eqMagnitude": 5.3, "severity": "Moderate", "text": "AUGUST 1947; M5.3; 197 km away; Moderate damage; 3 fatalities", "line1": "AUGUST 1947", "line2": "M5.3, 197 km away, 3 fatalities", "line3": "NA"},
    {"begin_year": 1957, "distance": 135, "eqMagnitude": 5.6, "severity": "Large", "text": "FEBRUARY 1957; M5.6; 135 km away; Moderate damage; 13 fatalities", "line1": "FEBRUARY 1957", "line2": "M5.6, 135 km away, 13 fatalities", "line3": "Sidi Abd,Sidi Toul"},
    {"begin_year": 1961, "distance": 468, "eqMagnitude": 5.0, "severity": "Moderate", "text": "MARCH 1961; MNA; 469 km away; Moderate damage; 15 fatalities", "line1": "MARCH 1961", "line2": "M5.0, 469 km away, 15 fatalities", "line3": "Sicily"},
    {"begin_year": 1962, "distance": 141, "eqMagnitude": 5.3, "severity": "Moderate", "text": "FEBRUARY 1962; M5.3; 141 km away; Moderate damage", "line1": "FEBRUARY 1962", "line2": "M5.3, 141 km away", "line3": "Gafour,OUM-Zid,EL Akhouат"},
    {"begin_year": 1968, "distance": 282, "eqMagnitude": 6.0, "severity": "Very Large", "text": "JANUARY 1968; M6; 283 km away; Extreme damage; 216 fatalities", "line1": "JANUARY 1968", "line2": "M6.0, 283 km away, 216 fatalities", "line3": "Sicily"},
    {"begin_year": 1968, "distance": 466, "eqMagnitude": 4.9, "severity": "Small", "text": "FEBRUARY 1968; M4.9; 466 km away; Moderate damage; 1 fatality", "line1": "FEBRUARY 1968", "line2": "M4.9, 466 km away, 1 fatality", "line3": "El Asn (BABORD)"},
    {"begin_year": 1975, "distance": 446, "eqMagnitude": 4.3, "severity": "Small", "text": "JULY 1975; M4.3; 446 km away; Moderate damage; 1 fatality", "line1": "JULY 1975", "line2": "M4.3, 446 km away, 1 fatality", "line3": "Djebel Babor"},
    {"begin_year": 1978, "distance": 462, "eqMagnitude": 5.7, "severity": "Large", "text": "APRIL 1978; M5.7; 463 km away; Moderate damage; 5 fatalities", "line1": "APRIL 1978", "line2": "M5.7, 463 km away, 5 fatalities", "line3": "Sicily"},
    {"begin_year": 1990, "distance": 467, "eqMagnitude": 5.3, "severity": "Moderate", "text": "DECEMBER 1990; M5.3; 468 km away; Extreme damage; 19 fatalities", "line1": "DECEMBER 1990", "line2": "M5.3, 468 km away, 19 fatalities", "line3": "Sicily: Carlentini"},
    {"begin_year": 2002, "distance": 553, "eqMagnitude": 6.0, "severity": "Very Large", "text": "SEPTEMBER 2002; M6; 354 km away; Extreme damage; 2 fatalities", "line1": "SEPTEMBER 2002", "line2": "M6.0, 354 km away, 2 fatalities", "line3": "Sicily: Palermo"},
    {"begin_year": 2018, "distance": 442, "eqMagnitude": 5.0, "severity": "Moderate", "text": "DECEMBER 2018; M5; 443 km away; Extreme damage", "line1": "DECEMBER 2018", "line2": "M5.0, 443 km away", "line3": "Sicily: Catpana"},
    {"begin_year": 2021, "distance": 444, "eqMagnitude": 6.0, "severity": "Very Large", "text": "MARCH 2021; M6; 445 km away; Limited damage", "line1": "MARCH 2021", "line2": "M6.0, 445 km away", "line3": "NA"}
]

# convert ee list to dataframe, ee_df
ee_df = pd.DataFrame(ee)


# save ee_output_df for ee data to CSV
ee_df.to_csv('data/processed/ee.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [36]:
# ee data check
print("\nFirst 10 rows of the output:")
print(ee_df.head(10))


First 10 rows of the output:
   begin_year  distance  eqMagnitude    severity           line1  \
0        1903       316          5.3    Moderate        MAY 1903   
1        1906       335          5.6       Large  SEPTEMBER 1906   
2        1907       335          5.1    Moderate   FEBRUARY 1907   
3        1908       497          7.0  Very Large   DECEMBER 1908   
4        1909       452          5.3    Moderate    OCTOBER 1909   
5        1911       652          4.3       Small    OCTOBER 1911   
6        1914       452          4.9       Small        MAY 1914   
7        1916       494          5.1    Moderate       JULY 1916   
8        1924       489          5.6       Large      MARCH 1924   
9        1926       451          5.3    Moderate     AUGUST 1926   

                                  line2                      line3  
0                     M5.3, 316 km away                    Palermo  
1                     M5.6, 335 km away                     Sicily  
2             

### HISTORICAL BURNT AREA & FIRE WEATHER INDEX


In [33]:
# generate historical burnt area & fire weather index data for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

fwi = [
      { "week": 1, "monthName": "Jan", "fwi": 36.03855628967285},
      { "week": 2, "monthName": "Jan", "fwi": 28.35186767578125},
      { "week": 3, "monthName": "Jan", "fwi": 34.48613758087156},
      { "week": 4, "monthName": "Jan", "fwi": 35.160119628906244},
      { "week": 5, "monthName": "Feb", "fwi": 41.2155460357666},
      { "week": 6, "monthName": "Feb", "fwi": 42.91906299591064},
      { "week": 7, "monthName": "Feb", "fwi": 40.35708732604981},
      { "week": 8, "monthName": "Feb", "fwi": 35.13482322692871},
      { "week": 9, "monthName": "Feb", "fwi": 43.7328405380249},
      { "week": 10, "monthName": "Mar", "fwi": 55.629536437988264},
      { "week": 11, "monthName": "Mar", "fwi": 51.963145637512206},
      { "week": 12, "monthName": "Mar", "fwi": 48.64410858154295},
      { "week": 13, "monthName": "Mar", "fwi": 48.45940856933592},
      { "week": 14, "monthName": "Apr", "fwi": 42.525428390502924},
      { "week": 15, "monthName": "Apr", "fwi": 48.989934921264634},
      { "week": 16, "monthName": "Apr", "fwi": 47.94815864562989},
      { "week": 17, "monthName": "Apr", "fwi": 59.693392562866215},
      { "week": 18, "monthName": "May", "fwi": 53.26485347747803},
      { "week": 19, "monthName": "May", "fwi": 67.04015121459962},
      { "week": 20, "monthName": "May", "fwi": 66.2925880432129},
      { "week": 21, "monthName": "May", "fwi": 63.51103172302246},
      { "week": 22, "monthName": "May", "fwi": 57.59551124572754},
      { "week": 23, "monthName": "Jun", "fwi": 66.97727813720704},
      { "week": 24, "monthName": "Jun", "fwi": 75.7531303405762},
      { "week": 25, "monthName": "Jun", "fwi": 80.30134506225586},
      { "week": 26, "monthName": "Jun", "fwi": 90.69736862182619},
      { "week": 27, "monthName": "Jul", "fwi": 75.26012268066407},
      { "week": 28, "monthName": "Jul", "fwi": 95.59054870605469},
      { "week": 29, "monthName": "Jul", "fwi": 82.06852722167967},
      { "week": 30, "monthName": "Jul", "fwi": 81.8968620300293},
      { "week": 31, "monthName": "Aug", "fwi": 81.7047821044922},
      { "week": 32, "monthName": "Aug", "fwi": 81.58447265625001}, 
      { "week": 33, "monthName": "Aug", "fwi": 65.29224243164063},  
      { "week": 34, "monthName": "Aug", "fwi": 67.29769515991212},  
      { "week": 35, "monthName": "Aug", "fwi": 64.21281738281252},  
      { "week": 36, "monthName": "Sep", "fwi": 69.20558013916019},  
      { "week": 37, "monthName": "Sep", "fwi": 59.376176834106474},  
      { "week": 38, "monthName": "Sep", "fwi": 50.01441955566406},  
      { "week": 39, "monthName": "Sep", "fwi": 40.38814010620118 },  
      { "week": 40, "monthName": "Oct", "fwi": 48.369334793090815},  
      { "week": 41, "monthName": "Oct", "fwi": 43.82190437316895},  
      { "week": 42, "monthName": "Oct", "fwi": 37.03949813842773},  
      { "week": 43, "monthName": "Oct", "fwi": 50.04811096191406},  
      { "week": 44, "monthName": "Nov", "fwi": 47.38101158142093},  
      { "week": 45, "monthName": "Nov", "fwi": 37.50416679382325},  
      { "week": 46, "monthName": "Nov", "fwi": 29.76080322265625},  
      { "week": 47, "monthName": "Nov", "fwi": 36.063685607910124},  
      { "week": 48, "monthName": "Nov", "fwi": 34.42437210083008},  
      { "week": 49, "monthName": "Dec", "fwi": 32.008924865722626},  
      { "week": 50, "monthName": "Dec", "fwi": 33.579549407958986},  
      { "week": 51, "monthName": "Dec", "fwi": 31.927024841308594},  
      { "week": 52, "monthName": "Dec", "fwi": 34.5278169631958},  
      { "week": 53, "monthName": "Dec", "fwi": 31.089004516601562}  

]

# convert fwi list to dataframe, fwi_df
fwi_df = pd.DataFrame(fwi)

# create output CSV of fwi_df for plotting
fwi_output_df = pd.DataFrame({
    'week': fwi_df['week'],
    'monthName': fwi_df['monthName'],
    'fwi': fwi_df['fwi'].round(2),  # round the count to 2 decimal places
})

# save fwi_output_df for fwi data to CSV
fwi_output_df.to_csv('data/processed/fwi.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [34]:
# fwi data check
print("\nFirst 10 rows of the output:")
print(fwi_output_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(fwi_output_df)}")
print(f"Month names: {fwi_output_df['monthName'].unique()}")
print(f"fwi values: {fwi_output_df['fwi'].unique()}")


First 10 rows of the output:
   week monthName    fwi
0     1       Jan  36.04
1     2       Jan  28.35
2     3       Jan  34.49
3     4       Jan  35.16
4     5       Feb  41.22
5     6       Feb  42.92
6     7       Feb  40.36
7     8       Feb  35.13
8     9       Feb  43.73
9    10       Mar  55.63

Total number of records: 53
Month names: ['Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun' 'Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec']
fwi values: [36.04 28.35 34.49 35.16 41.22 42.92 40.36 35.13 43.73 55.63 51.96 48.64
 48.46 42.53 48.99 47.95 59.69 53.26 67.04 66.29 63.51 57.6  66.98 75.75
 80.3  90.7  75.26 95.59 82.07 81.9  81.7  81.58 65.29 67.3  64.21 69.21
 59.38 50.01 40.39 48.37 43.82 37.04 50.05 47.38 37.5  29.76 36.06 34.42
 32.01 33.58 31.93 34.53 31.09]
