# Tunis, Tunisia - Data Cleaning
##### May 2025

Basic data cleaning to prepare the CSV files required by the City Scan JavaScript plots.

In [32]:
# standard library imports
import os
import sys

# add project root to Python path
# The project root is the parent of the directory the notebook starts in.
# Resolve it only once per kernel: this cell changes the working directory
# below, so recomputing "parent of cwd" on a re-run would climb one level
# further every time the cell is executed.
if '_project_root' not in globals():
    _project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
project_root = _project_root
if project_root not in sys.path:
    sys.path.append(project_root)

# third-party imports
import numpy as np
import pandas as pd

# change to project root directory
# Use the absolute path resolved above rather than a relative '../' so that
# re-running the cell is a no-op instead of another step up the tree.
os.chdir(project_root)
print("directory changes")
print("current working directory is:", os.getcwd())

# local imports (after changing directory)
# from src. import 

directory changes
current working directory is: /Users/carolinecullinan/dev/wb


# 3 POPULATION AND DEMOGRAPHIC TRENDS

### POPULATION GROWTH & DENSITY

In [4]:
# create year range from 2000 to 2021 (inclusive; range() excludes the stop value)
years = range(2000, 2022)

# population growth data for Tunis, Tunisia, transcribed from the scan-calculations.html file
# TODO: the intention is to write the scan-calculations population growth chart data
# to a csv file and then read it into the notebook. To do this, a closer look at
#   1) the City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)
#      and the demographics.py call to the Oxford API; and
#   2) the scan-calculations.html file
# is necessary. Basically: where is the scan-calculations.html population growth data
# coming from, and how can we get it into a csv file?
pop_growth = [
    1969032, 1984750, 2000614, 2016605, 2035590, 2070274, 2105593, 2141514,
    2178094, 2215198, 2252984, 2291413, 2330547, 2370243, 2410667, 2449753,
    2489472, 2529836, 2570854, 2612428, 2654378, 2696439
]

# create dataframe from pop_growth list (one row per year)
df = pd.DataFrame({
    'year': years,
    'population': pop_growth
})

# make sure the output directory exists; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout
os.makedirs('data/processed', exist_ok=True)

# save to csv (no index)
df.to_csv('data/processed/pop_growth.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


### POPULATION DISTRIBUTION BY AGE & SEX

In [9]:
# population distribution data by age and sex for Tunis, Tunisia, transcribed from the scan-calculations.html file
# TODO: the intention is to write the scan-calculations population distribution chart data
# to a csv file and then read it into the notebook. To do this, a closer look at
#   1) the City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)
#      and the demographics.py call to the Oxford/WorldPop API; and
#   2) the scan-calculations.html file
# is necessary. Basically: where is the scan-calculations.html age/sex distribution data
# coming from, and how can we get it into a csv file?
# "percentage" is stored here as a fraction of the total; it is scaled by 100 below.
pop_age_sex = [
    {"ageBracket": "0-4", "sex": "female", "count": 84255.76, "percentage": 0.037317174},
    {"ageBracket": "0-4", "sex": "male", "count": 90688.25, "percentage": 0.040166147},
    {"ageBracket": "5-9", "sex": "female", "count": 83410.27, "percentage": 0.036942703},
    {"ageBracket": "5-9", "sex": "male", "count": 90365.87, "percentage": 0.040023362},
    {"ageBracket": "10-14", "sex": "female", "count": 68791.52, "percentage": 0.030468006},
    {"ageBracket": "10-14", "sex": "male", "count": 72744.49, "percentage": 0.032218791},
    {"ageBracket": "15-19", "sex": "female", "count": 62009.60, "percentage": 0.027464272},
    {"ageBracket": "15-19", "sex": "male", "count": 65126.86, "percentage": 0.028844915},
    {"ageBracket": "20-24", "sex": "female", "count": 74367.26, "percentage": 0.032937522},
    {"ageBracket": "20-24", "sex": "male", "count": 75255.84, "percentage": 0.033331078},
    {"ageBracket": "25-29", "sex": "female", "count": 94742.76, "percentage": 0.041961902},
    {"ageBracket": "25-29", "sex": "male", "count": 90570.88, "percentage": 0.040114164},
    {"ageBracket": "30-34", "sex": "female", "count": 106160.84, "percentage": 0.047019010},
    {"ageBracket": "30-34", "sex": "male", "count": 103540.29, "percentage": 0.045858363},
    {"ageBracket": "35-39", "sex": "female", "count": 95786.56, "percentage": 0.042424208},
    {"ageBracket": "35-39", "sex": "male", "count": 94503.85, "percentage": 0.041856091},
    {"ageBracket": "40-44", "sex": "female", "count": 83400.61, "percentage": 0.036938427},
    {"ageBracket": "40-44", "sex": "male", "count": 84275.39, "percentage": 0.037325869},
    {"ageBracket": "45-49", "sex": "female", "count": 73773.45, "percentage": 0.032674520},
    {"ageBracket": "45-49", "sex": "male", "count": 71356.75, "percentage": 0.031604158},
    {"ageBracket": "50-54", "sex": "female", "count": 75568.57, "percentage": 0.033469589},
    {"ageBracket": "50-54", "sex": "male", "count": 75557.95, "percentage": 0.033464882},
    {"ageBracket": "55-59", "sex": "female", "count": 65577.59, "percentage": 0.029044548},
    {"ageBracket": "55-59", "sex": "male", "count": 68573.22, "percentage": 0.030371323},
    {"ageBracket": "60-64", "sex": "female", "count": 54934.92, "percentage": 0.024330870},
    {"ageBracket": "60-64", "sex": "male", "count": 58307.57, "percentage": 0.025824630},
    {"ageBracket": "65-69", "sex": "female", "count": 44470.28, "percentage": 0.019696046},
    {"ageBracket": "65-69", "sex": "male", "count": 42850.35, "percentage": 0.018978570},
    {"ageBracket": "70-74", "sex": "female", "count": 24648.85, "percentage": 0.010917065},
    {"ageBracket": "70-74", "sex": "male", "count": 22636.52, "percentage": 0.010025793},
    {"ageBracket": "75-79", "sex": "female", "count": 15277.18, "percentage": 0.006766316},
    {"ageBracket": "75-79", "sex": "male", "count": 13950.16, "percentage": 0.006178577},
    {"ageBracket": "80+", "sex": "female", "count": 16808.83, "percentage": 0.007444692},
    {"ageBracket": "80+", "sex": "male", "count": 13538.88, "percentage": 0.005996419},
]

# convert pop_age_sex list to dataframe
pop_age_sex_df = pd.DataFrame(pop_age_sex)

# build the output dataframe for plotting
pop_age_sex_output_df = pd.DataFrame({
    'ageBracket': pop_age_sex_df['ageBracket'],
    'sex': pop_age_sex_df['sex'],
    'count': pop_age_sex_df['count'].round(2),  # round the count to 2 decimal places
    'percentage': pop_age_sex_df['percentage'] * 100  # fraction -> percent (multiply by 100)
})

# sort by age bracket and sex to ensure proper ordering
# An ordered categorical is needed because a plain string sort would place
# "10-14" before "5-9".
age_order = ["0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34",
             "35-39", "40-44", "45-49", "50-54", "55-59", "60-64",
             "65-69", "70-74", "75-79", "80+"]

pop_age_sex_output_df['ageBracket'] = pd.Categorical(pop_age_sex_output_df['ageBracket'], categories=age_order, ordered=True)
pop_age_sex_output_df = pop_age_sex_output_df.sort_values(['ageBracket', 'sex'])

# make sure the output directory exists; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout
os.makedirs('data/processed', exist_ok=True)

# save output_df for pop_age_sex data to CSV
pop_age_sex_output_df.to_csv('data/processed/pop_age_sex.csv', index=False)

print("csv file created successfully.")



csv file created successfully.


In [10]:
# pop_age_sex data check: preview the frame that was written to CSV
print("\nFirst 10 rows of the output:")
print(pop_age_sex_output_df.head(10))

# summary statistics
print(f"\nTotal number of records: {len(pop_age_sex_output_df)}")
print(f"Age brackets: {pop_age_sex_output_df['ageBracket'].unique()}")
print(f"Sex categories: {pop_age_sex_output_df['sex'].unique()}")

# verify pop_age_sex data
# Sum the percentage column of pop_age_sex_output_df. The previous reference to
# `output_df` is not defined anywhere in this notebook and raises NameError on
# a fresh kernel (it only worked through leftover kernel state).
total_percentage = pop_age_sex_output_df['percentage'].sum()
print(f"\nTotal percentage: {total_percentage:.2f}% (should be ~100%)")


First 10 rows of the output:
  ageBracket     sex     count  percentage
0        0-4  female  84255.76    3.731717
1        0-4    male  90688.25    4.016615
2        5-9  female  83410.27    3.694270
3        5-9    male  90365.87    4.002336
4      10-14  female  68791.52    3.046801
5      10-14    male  72744.49    3.221879
6      15-19  female  62009.60    2.746427
7      15-19    male  65126.86    2.884491
8      20-24  female  74367.26    3.293752
9      20-24    male  75255.84    3.333108

Total number of records: 34
Age brackets: ['0-4', '5-9', '10-14', '15-19', '20-24', ..., '60-64', '65-69', '70-74', '75-79', '80+']
Length: 17
Categories (17, object): ['0-4' < '5-9' < '10-14' < '15-19' ... '65-69' < '70-74' < '75-79' < '80+']
Sex categories: ['female' 'male']

Total percentage: 100.00% (should be ~100%)


# 5 BUILT FORM

### URBAN EXTENT AND CHANGE

In [None]:
# urban extent and change data for Tunis, Tunisia, taken from the "tabular" output of the City Scan GCP process
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)
# "year" is a sequential index (1-31); "yearName" is the calendar year as a string.
# NOTE(review): "uba" is presumably urban built-up area - confirm meaning and units against the source.
uba = [
      { "year": 1, "yearName": "1985", "uba": 166.08101814331798},
      { "year": 2, "yearName": "1986", "uba": 172.44786586480038},
      { "year": 3, "yearName": "1987", "uba": 185.11900630951914},
      { "year": 4, "yearName": "1988", "uba": 202.44611527369096},
      { "year": 5, "yearName": "1989", "uba": 209.3484337801595},
      { "year": 6, "yearName": "1990", "uba": 218.8655512133127},
      { "year": 7, "yearName": "1991", "uba": 226.97080233605266},
      { "year": 8, "yearName": "1992", "uba": 231.32775796388964},
      { "year": 9, "yearName": "1993", "uba": 235.31000915223282},
      { "year": 10, "yearName": "1994", "uba": 239.59690318203425},
      { "year": 11, "yearName": "1995", "uba": 244.28727695052265},
      { "year": 12, "yearName": "1996", "uba": 249.05396781687122},
      { "year": 13, "yearName": "1997", "uba": 253.29457114797057},
      { "year": 14, "yearName": "1998", "uba": 256.0263479213753},
      { "year": 15, "yearName": "1999", "uba": 258.9113844404829},
      { "year": 16, "yearName": "2000", "uba": 261.66818321318607},
      { "year": 17, "yearName": "2001", "uba": 264.4919158340125},
      { "year": 18, "yearName": "2002", "uba": 267.0854460612935},
      { "year": 19, "yearName": "2003", "uba": 270.3870988687197},
      { "year": 20, "yearName": "2004", "uba": 276.21722470525106},
      { "year": 21, "yearName": "2005", "uba": 281.41742170944474},
      { "year": 22, "yearName": "2006", "uba": 288.76825955333743},
      { "year": 23, "yearName": "2007", "uba": 294.66531923799204},
      { "year": 24, "yearName": "2008", "uba": 301.9654875333054},
      { "year": 25, "yearName": "2009", "uba": 310.2853023000293},
      { "year": 26, "yearName": "2010", "uba": 317.42908309972756},
      { "year": 27, "yearName": "2011", "uba": 322.2602056142696},
      { "year": 28, "yearName": "2012", "uba": 327.013134381004},
      { "year": 29, "yearName": "2013", "uba": 332.55550722560355},
      { "year": 30, "yearName": "2014", "uba": 337.01380195059915},
      { "year": 31, "yearName": "2015", "uba": 341.3507399789974}
]

# convert uba list to dataframe
uba_df = pd.DataFrame(uba)

# build the output dataframe for plotting
uba_output_df = pd.DataFrame({
    'year': uba_df['year'],
    'yearName': uba_df['yearName'],
    'uba': uba_df['uba'].round(2),  # round the uba value to 2 decimal places
})

# make sure the output directory exists; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout
os.makedirs('data/processed', exist_ok=True)

# save output_df for uba data to CSV
uba_output_df.to_csv('data/processed/uba.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [22]:
# uba data check: preview the first rows of the frame that was written to CSV
uba_preview = uba_output_df.head(10)
print("\nFirst 10 rows of the output:")
print(uba_preview)

# summary statistics: row count plus the distinct values of each data column
uba_record_count = len(uba_output_df)
uba_year_names = uba_output_df['yearName'].unique()
uba_values = uba_output_df['uba'].unique()

print(f"\nTotal number of records: {uba_record_count}")
print(f"Year names: {uba_year_names}")
print(f"UBA values: {uba_values}")


First 10 rows of the output:
   year yearName     uba
0     1     1985  166.08
1     2     1986  172.45
2     3     1987  185.12
3     4     1988  202.45
4     5     1989  209.35
5     6     1990  218.87
6     7     1991  226.97
7     8     1992  231.33
8     9     1993  235.31
9    10     1994  239.60

Total number of records: 31
Year names: ['1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015']
UBA values: [166.08 172.45 185.12 202.45 209.35 218.87 226.97 231.33 235.31 239.6
 244.29 249.05 253.29 256.03 258.91 261.67 264.49 267.09 270.39 276.22
 281.42 288.77 294.67 301.97 310.29 317.43 322.26 327.01 332.56 337.01
 341.35]


# 6 CLIMATE CONDITIONS

### PHOTOVOLTAIC POWER POTENTIAL


In [30]:
# photovoltaic power potential data (i.e., seasonal availability of solar energy, plotting the "max" value) for Tunis, Tunisia
# Note: City Scan GitHub (https://github.com/rosemaryturtle/city-scan-automation/)

## NOTE: pv data is not available for Tunis, Tunisia (current data is just a placeholder)
## NOTE(review): the Feb row is internally inconsistent - its "max" (3.39) is below its
## "min" (3.71) and "mean" (3.98). Only "max" is exported below, so re-check this value
## when the placeholder is replaced with real data.
pv = [
      { "month": 1, "monthName": "Jan", "max": 3.38, "min": 2.96, "mean": 3.09},
      { "month": 2, "monthName": "Feb", "max": 3.39, "min": 3.71, "mean": 3.98},
      { "month": 3, "monthName": "Mar", "max": 4.62, "min": 4.4, "mean": 4.54},
      { "month": 4, "monthName": "Apr", "max": 4.93, "min": 4.66, "mean": 4.83},
      { "month": 5, "monthName": "May", "max": 5.00, "min": 4.76, "mean": 4.92},
      { "month": 6, "monthName": "Jun", "max": 5.35, "min": 5.16, "mean": 5.27},
      { "month": 7, "monthName": "Jul", "max": 5.37, "min": 5.2, "mean": 5.29},
      { "month": 8, "monthName": "Aug", "max": 5.2, "min": 5.01, "mean": 5.12},
      { "month": 9, "monthName": "Sep", "max": 4.72, "min": 4.51, "mean": 4.63},
      { "month": 10, "monthName": "Oct", "max": 4.26, "min": 3.96, "mean": 4.17},
      { "month": 11, "monthName": "Nov", "max": 3.37, "min": 2.98, "mean": 3.29},
      { "month": 12, "monthName": "Dec", "max": 3.19, "min": 2.77, "mean": 3.11}
]

# convert pv list to dataframe
pv_df = pd.DataFrame(pv)

# build the output dataframe for plotting (only the monthly "max" is exported, as maxPv)
pv_output_df = pd.DataFrame({
    'month': pv_df['month'],
    'monthName': pv_df['monthName'],
    'maxPv': pv_df['max'].round(2),  # round the max pv value to 2 decimal places
})

# make sure the output directory exists; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout
os.makedirs('data/processed', exist_ok=True)

# save output_df for pv data to CSV
pv_output_df.to_csv('data/processed/pv.csv', index=False)

print("csv file created successfully.")

csv file created successfully.


In [31]:
# pv data check: preview the first rows of the frame that was written to CSV
pv_preview = pv_output_df.head(10)
print("\nFirst 10 rows of the output:")
print(pv_preview)

# summary statistics: row count plus the distinct values of each data column
pv_record_count = len(pv_output_df)
pv_month_names = pv_output_df['monthName'].unique()
pv_max_values = pv_output_df['maxPv'].unique()

print(f"\nTotal number of records: {pv_record_count}")
print(f"Month names: {pv_month_names}")
print(f"PV values: {pv_max_values}")


First 10 rows of the output:
   month monthName  maxPv
0      1       Jan   3.38
1      2       Feb   3.39
2      3       Mar   4.62
3      4       Apr   4.93
4      5       May   5.00
5      6       Jun   5.35
6      7       Jul   5.37
7      8       Aug   5.20
8      9       Sep   4.72
9     10       Oct   4.26

Total number of records: 12
Month names: ['Jan' 'Feb' 'Mar' 'Apr' 'May' 'Jun' 'Jul' 'Aug' 'Sep' 'Oct' 'Nov' 'Dec']
PV values: [3.38 3.39 4.62 4.93 5.   5.35 5.37 5.2  4.72 4.26 3.37 3.19]
