In [1]:
import warnings

import pandas as pd
import numpy as np

# import the library of geopandas to make geojson file for making map
import geopandas as gpd

In [2]:
# Read the four raw indicator tables (one CSV each), in this order:
#   1. consumption-based CO2 emissions per capita (tonnes per capita)
#   2. total CO2 emissions
#   3. population
#   4. GDP (file name: total_gdp_us_inflation_adjusted)
per_emissions_df, total_emissions_df, population_df, gdp_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/raw_data/co2_pcap_cons.csv",
        "data/raw_data/co2_cons.csv",
        "data/raw_data/pop.csv",
        "data/raw_data/total_gdp_us_inflation_adjusted.csv",
    )
)

# Analysis window, inclusive at both ends.
start_year = 2000
end_year = 2019

# Year labels as strings, so they can be compared with the CSV column headers.
years = np.arange(start_year, end_year + 1).astype(str)


def _select_country_and_years(wide_df, year_labels):
    """Keep only the 'country' column and the requested year columns of a wide table.

    Parameters
    ----------
    wide_df : pandas.DataFrame
        Wide table: a 'country' column plus one column per year.
    year_labels : sequence of str
        Column labels of the years to keep.

    Returns
    -------
    pandas.DataFrame
        The matching columns of ``wide_df``, in their original column order.

    Warns
    -----
    UserWarning
        When a requested year has no column in ``wide_df``. The column filter
        simply skips such years, and the inner merges further down then drop
        that year for every indicator, so the gap is reported here.
    """
    absent = [str(label) for label in year_labels if label not in wide_df.columns]
    if absent:
        warnings.warn(
            "Requested year column(s) not found and therefore skipped: "
            + ", ".join(absent),
            stacklevel=2,
        )
    wanted = np.append('country', year_labels)
    return wide_df.loc[:, wide_df.columns.isin(wanted)]


def _wide_to_long(wide_df, value_name):
    """Melt a wide (one column per year) table into one row per country and year.

    The former column headers end up in a 'year' column and the cell values in
    a column called ``value_name``.
    """
    return wide_df.melt(id_vars=['country'], var_name='year', value_name=value_name)


# Restrict every table to 'country' plus the years of the analysis window.
filtered_peremission_df = _select_country_and_years(per_emissions_df, years)
filtered_totalemission_df = _select_country_and_years(total_emissions_df, years)
filtered_population_df = _select_country_and_years(population_df, years)
filtered_gdp_df = _select_country_and_years(gdp_df, years)

# Reshape each table from wide to long format.
new_per_emissions = _wide_to_long(filtered_peremission_df, 'CO2 emissions per capita')
new_total_emissions = _wide_to_long(filtered_totalemission_df, 'CO2 total emission')
new_population = _wide_to_long(filtered_population_df, 'population')
new_gdp = _wide_to_long(filtered_gdp_df, 'GDP')

# Combine the four indicators. Inner joins keep only the (country, year) pairs
# that are present in every table.
merged_df = (
    new_per_emissions
    .merge(new_total_emissions, on=['country', 'year'], how='inner')
    .merge(new_population, on=['country', 'year'], how='inner')
    .merge(new_gdp, on=['country', 'year'], how='inner')
)

# Country labels used in the CO2 tables that are spelled differently in the
# Natural Earth shapefile (its ADMIN field is the join key of the geo merge).
# Key: label in the CO2 tables; value: label in the shapefile.
name_mapping = {
    'Bahamas': 'The Bahamas',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Congo, Rep.': 'Republic of the Congo',
    'Czech Republic': 'Czechia',
    'Eswatini': 'eSwatini',
    'Hong Kong, China': 'Hong Kong S.A.R.',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Serbia': 'Republic of Serbia',
    'Slovak Republic': 'Slovakia',
    'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
    'Tanzania': 'United Republic of Tanzania',
    'UAE': 'United Arab Emirates',
    'UK': 'United Kingdom',
    'USA': 'United States of America',
}

# Standardize the country names; labels without an entry above stay unchanged.
standardized_country = merged_df['country'].replace(name_mapping)
merged_df['country'] = standardized_country

# Export the CO2 table as a JSON array with one object per (country, year) row.
merged_df.to_json(
    path_or_buf='data/geo_data/CO2_data.json',
    orient='records',
    indent=1,
)

# Polygon and MultiPolygon from shapely.
# NOTE(review): neither name is referenced in the visible part of the file;
# the import is kept in case other cells rely on it.
from shapely.geometry import MultiPolygon, Polygon

# Load the Natural Earth country shapes and expose the ADMIN field as
# 'country', so that both tables share the join key used by the merge below.
gdf = gpd.read_file(
    'data/geo_data/Natural Earth/ne_10m_admin_0_countries.shp'
).rename(columns={'ADMIN': 'country'})

# Read the CO2 table back from the JSON export.
# NOTE(review): the round trip through JSON lets read_json re-infer the column
# dtypes (presumably turning 'year' into a number) - confirm this is intended
# before replacing it with a direct use of merged_df.
CO2_df = pd.read_json('data/geo_data/CO2_data.json')

# The inner merge below drops every CO2 row whose country name has no polygon
# with the same name. Report those names instead of losing them silently, so
# that missing entries in name_mapping can be spotted.
unmatched_countries = sorted(
    str(name)
    for name in CO2_df.loc[~CO2_df['country'].isin(gdf['country']), 'country']
    .dropna()
    .unique()
)
if unmatched_countries:
    warnings.warn(
        f"{len(unmatched_countries)} country name(s) in CO2_data.json have no "
        "match in the Natural Earth 'ADMIN' column and are dropped by the "
        "inner merge: " + ", ".join(unmatched_countries)
    )

# Merge the CO2 data with the geo data (one row per country and year).
merged_gdf = gdf.merge(CO2_df, on='country', how='inner')

# Simplify a shape to shrink the exported files, because the un-simplified
# file was too big.
def simplify_geometry(geom, tolerance=0.05, preserve_topology=True):
    """Return a simplified version of ``geom``.

    Parameters
    ----------
    geom : geometry object or None
        Any object exposing ``simplify(tolerance, preserve_topology=...)``,
        e.g. a shapely geometry. ``None`` (a missing geometry) is passed through.
    tolerance : float, default 0.05
        Forwarded unchanged to ``geom.simplify``.
    preserve_topology : bool, default True
        Forwarded unchanged to ``geom.simplify``.

    Returns
    -------
    The result of ``geom.simplify(...)``, or ``None`` when ``geom`` is ``None``.
    """
    # A missing geometry has nothing to simplify; returning it unchanged avoids
    # an AttributeError when the function is applied to a whole column.
    if geom is None:
        return None
    return geom.simplify(tolerance, preserve_topology=preserve_topology)

# Simplify every geometry with simplify_geometry (default tolerance).
# NOTE(review): merged_gdf holds one row per country and year, so the same
# country shape is simplified once per year; simplifying gdf before the merge
# would do each shape only once.
merged_gdf['geometry'] = merged_gdf['geometry'].apply(simplify_geometry)

# Keep only the columns that are needed, to keep the exported files small.
export_columns = [
    'country',
    'year',
    'CO2 emissions per capita',
    'CO2 total emission',
    'population',
    'GDP',
    'geometry',
]
gdf_simplified = merged_gdf[export_columns]

gdf_simplified_df = gpd.GeoDataFrame(gdf_simplified)

# Split the data into one GeoJSON file per distinct year.
# No "'year' in columns" check is needed here: the column selection above
# already raises KeyError when the 'year' column is missing, so such a check
# could never be False.
unique_years = gdf_simplified_df['year'].unique()
for year in unique_years:
    # Rows of this year only.
    year_gdf = gdf_simplified_df[gdf_simplified_df['year'] == year]

    # Export to a GeoJSON file named after the year.
    file_name = f"data/geo_data/simplified_geo_data_{year}.geojson"
    year_gdf.to_file(file_name, driver='GeoJSON')