### Primary Data Cleaning

This notebook cleans and processes the primary trade data. It involves the following steps:

1. Standardizing country names
2. Cleaning the data (dropping rows that have no values for any year)
3. Converting wide format to long format
4. Merging Parts, Passenger Vehicles and Trucks
5. Writing final data to a csv to be merged with secondary data

In [37]:
# import all the modules needed
import pandas as pd
import country_converter as coco

In [38]:
# Passenger-vehicle export values: one row per partner country, one column per
# year. The raw file is tab-separated and UTF-16 encoded.
pv_export_df = pd.read_csv(
    "./../../data/raw/primary/New_PV_Countries_Export_Value.csv",
    sep="\t",
    header=0,
    encoding='utf-16',
)
pv_export_df.head(5)

Unnamed: 0,Partner,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Canada,17859000199,12652204453,16995719535,17902530002,18776536663,20348261635,21865353045,21023836427,22206354040,23537578145,22743027693,23932510142,18595463497,23706809603,26845856308,25301461937,23242066773
1,Germany,7271930736,4359799043,3633810812,5146003729,5857684381,4745975309,5272829307,5910717531,6356555766,5721504301,5416613521,6651557562,6498904046,7258458199,7205388022,8496591494,7538936258
2,Mexico,4053831183,1987600598,2842508188,3174225028,3607341712,3631801923,3540262939,3084575996,3604195477,3449264042,3254667672,3227885847,2088545024,2874156753,3606171154,4839937174,5273695629
3,China,681596584,754526827,2921382708,4306360159,4959652931,7531612003,9942175219,8264903454,8211990469,9526202379,6238289558,6984084476,5957317167,6278802297,5024574204,6063085338,4775689891
4,United Arab Emirates,1799478615,550584794,1045120624,1192574437,1818622363,2141941944,2162718008,2122690664,1605293038,1366056839,1244937033,1154707351,811436264,1070647419,1687900533,2166286395,2160723518


In [39]:
def standardize_country(countries):
    """
    Convert a country name or code to its standardized short name.

    Parameters
    ----------
    countries : str or list
        A single country name/code, or a list of them, passed straight to
        ``coco.convert``.

    Returns
    -------
    str or list
        Whatever ``coco.convert(..., to='name_short')`` returns, with two
        exceptions that both fall back to the unchanged input:

        * the conversion result equals the string ``'not found'``;
        * the conversion raises an exception (a message is printed).

    Notes
    -----
    An unrecognized single name is therefore returned as-is, NOT as
    ``'not found'``.

    NOTE(review): the ``'not found'`` check compares the whole result, so it
    only applies when the result is a single string. For list input the
    converted result is returned unchanged, presumably with ``'not found'``
    entries left in place -- confirm against the `coco` documentation.
    """
    fallback = countries
    try:
        converted = coco.convert(to='name_short', names=countries)
        # Keep the raw input rather than propagating the 'not found' marker.
        result = fallback if converted == 'not found' else converted
    except Exception as e:
        # Best effort: report the failure and keep the raw input.
        print(f"Error converting country name {countries}: {e}")
        result = fallback
    return result

In [40]:
def clean_primary_data(df, country_col, value_col, cols_to_keep,
                       start_year=2008, end_year=2024, not_found_label='not found'):
    """
    Clean a wide-format trade DataFrame and reshape it into long format.

    Parameters
    ----------
    df : pd.DataFrame
        Wide-format input with one column per year (named by the year as a
        string, e.g. ``'2008'``) plus a country column.
    country_col : str
        The name of the column in `df` containing the raw country names.
    value_col : str
        The name given to the value column of the melted output.
    cols_to_keep : list of str
        Columns retained before melting. Entries that are not all-digit
        strings are treated as identifier columns; the remaining (year)
        columns are melted into rows.
    start_year : int, optional
        The first year to consider when dropping rows with missing data.
    end_year : int, optional
        The last year to consider when dropping rows with missing data.
        Every year from `start_year` to `end_year` must exist as a column.
    not_found_label : str, optional (default="not found")
        Rows whose standardized country equals this label are dropped.

    Returns
    -------
    pd.DataFrame
        Long-format DataFrame with the identifier columns from
        `cols_to_keep`, a ``'year'`` column holding the former column names
        (strings), and `value_col` holding the values.

    Notes
    -----
    Steps performed:
    1. Drop rows that have no data in any year between `start_year` and
       `end_year` (rows with data for only some years are kept).
    2. Add a ``'standardized_country'`` column by applying
       `standardize_country` to `country_col`.
    3. Drop rows where the standardized name equals `not_found_label`.
    4. Retain only the specified columns (`cols_to_keep`).
    5. Melt the year columns into ``'year'`` / `value_col` rows.

    NOTE(review): `standardize_country`, as defined in this notebook,
    returns the original name when a country is not recognized, so step 3
    only removes rows whose raw name is literally `not_found_label`.
    Unrecognized names stay in the output under their raw spelling.
    """

    # Drop rows with no data in the given year range. The explicit copy
    # makes the result an independent frame, so adding a column below
    # cannot raise SettingWithCopyWarning when rows were filtered out.
    year_cols = [str(year) for year in range(start_year, end_year + 1)]
    df = df.dropna(subset=year_cols, how="all").copy()

    # Standardize country names
    df['standardized_country'] = df[country_col].apply(standardize_country)

    # Drop rows where country name could not be standardized
    df = df[df['standardized_country'] != not_found_label]

    # Keep relevant columns
    df = df.loc[:, cols_to_keep]

    # Identifier columns are the kept columns that are not year columns.
    # For the default ['standardized_country', 'Category', <years>] layout
    # this yields exactly ['standardized_country', 'Category'].
    id_vars = [col for col in cols_to_keep if not str(col).isdigit()]

    # Melt the dataframe to long format
    melted_df = df.melt(id_vars=id_vars, var_name='year', value_name=value_col)

    return melted_df


In [41]:
def get_cols_to_keep(start_year, end_year, country_col='standardized_country',
                     category_col='Category'):
    """
    Build the ordered list of columns to retain during data cleaning.

    Parameters
    ----------
    start_year : int
        The first year to include in the list of columns.
    end_year : int
        The last year to include in the list of columns (inclusive).
    country_col : str, optional
        Name of the country column, placed first in the result.
    category_col : str, optional
        Name of the category column, placed second in the result.

    Returns
    -------
    list of str
        ``[country_col, category_col]`` followed by every year from
        `start_year` to `end_year` as a string. If `start_year` is greater
        than `end_year`, no year columns are added.
    """
    columns = [country_col, category_col]
    columns.extend(map(str, range(start_year, end_year + 1)))
    return columns



In [42]:
# Tag every row with its product category so the rows can still be told apart
# after the passenger-vehicle, truck and parts frames are stacked together.
pv_export_df["Category"] = "Passenger Vehicles"
pv_export_df.head(5)

Unnamed: 0,Partner,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,Category
0,Canada,17859000199,12652204453,16995719535,17902530002,18776536663,20348261635,21865353045,21023836427,22206354040,23537578145,22743027693,23932510142,18595463497,23706809603,26845856308,25301461937,23242066773,Passenger Vehicles
1,Germany,7271930736,4359799043,3633810812,5146003729,5857684381,4745975309,5272829307,5910717531,6356555766,5721504301,5416613521,6651557562,6498904046,7258458199,7205388022,8496591494,7538936258,Passenger Vehicles
2,Mexico,4053831183,1987600598,2842508188,3174225028,3607341712,3631801923,3540262939,3084575996,3604195477,3449264042,3254667672,3227885847,2088545024,2874156753,3606171154,4839937174,5273695629,Passenger Vehicles
3,China,681596584,754526827,2921382708,4306360159,4959652931,7531612003,9942175219,8264903454,8211990469,9526202379,6238289558,6984084476,5957317167,6278802297,5024574204,6063085338,4775689891,Passenger Vehicles
4,United Arab Emirates,1799478615,550584794,1045120624,1192574437,1818622363,2141941944,2162718008,2122690664,1605293038,1366056839,1244937033,1154707351,811436264,1070647419,1687900533,2166286395,2160723518,Passenger Vehicles


In [43]:
# Columns that survive cleaning: the two identifier columns plus one column
# per year from 2008 to 2024.
required_cols = get_cols_to_keep(
    start_year=2008,
    end_year=2024,
    country_col='standardized_country',
    category_col='Category',
)

# Standardize partner names and reshape to long format
# (one row per country, category and year).
cleaned_pv_export_df = clean_primary_data(
    df=pv_export_df,
    country_col='Partner',
    value_col='Export_value',
    cols_to_keep=required_cols,
)
cleaned_pv_export_df.head(5)

Netherlands Antilles not found in regex
French S. Antarctic Terr not found in regex


Unnamed: 0,standardized_country,Category,year,Export_value
0,Canada,Passenger Vehicles,2008,17859000199
1,Germany,Passenger Vehicles,2008,7271930736
2,Mexico,Passenger Vehicles,2008,4053831183
3,China,Passenger Vehicles,2008,681596584
4,United Arab Emirates,Passenger Vehicles,2008,1799478615


## Similarly load all other export datasets into individual dataframes

In [44]:
# Medium/heavy truck export values (tab-separated, UTF-16 encoded file).
trucks_export_df = pd.read_csv(
    "./../../data/raw/primary/MediumHeavy_Export_Countries_Value.csv",
    sep="\t",
    header=0,
    encoding='utf-16',
)
trucks_export_df.head(5)

Unnamed: 0,Partner,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,Canada,3804919203,2563237026,3970350074,5013353082,5557753481,5181614688,4998684617,4568516286,3960625475,4926195306,6239555595,6427563893,4498169550,5418459380,6567662285,7626994363,7476019988
1,Mexico,183709107,96483975,137243219,147360365,211011092,271544253,394054683,326924602,369550136,227108247,268254721,320500812,202235506,207795149,304330318,459197717,806845166
2,Australia,214093395,64854255,143821288,227811124,347639980,272875584,188736657,120182897,87600541,92942497,115485977,75428802,77403523,117370471,96648954,152683161,230509977
3,Ukraine,595900,278635,69052,114717,0,0,0,100243,704135,455140,434764,85152,211145,1409910,1272213,1757591,91807324
4,Israel,13833891,17717653,21323322,20667328,11039346,10025167,36893269,30213641,19284799,23806464,31877228,47485279,59502369,16481390,17251195,19894564,91349134


In [45]:
# Tag the rows with their product category before cleaning.
trucks_export_df["Category"] = "Trucks"

# Standardize partner names and reshape to long format.
cleaned_trucks_export_df = clean_primary_data(
    df=trucks_export_df,
    country_col='Partner',
    value_col='Export_value',
    cols_to_keep=required_cols,
)
cleaned_trucks_export_df.head(5)


Netherlands Antilles not found in regex


Unnamed: 0,standardized_country,Category,year,Export_value
0,Canada,Trucks,2008,3804919203
1,Mexico,Trucks,2008,183709107
2,Australia,Trucks,2008,214093395
3,Ukraine,Trucks,2008,595900
4,Israel,Trucks,2008,13833891


In [46]:
# Parts export values (tab-separated, UTF-16 encoded file).
parts_export_df = pd.read_csv(
    "./../../data/raw/primary/Parts_Exports_Countries.csv",
    sep="\t",
    header=0,
    encoding='utf-16',
)

# Tag the rows with their product category before cleaning.
parts_export_df["Category"] = "Parts"

# Standardize partner names and reshape to long format.
cleaned_parts_export_df = clean_primary_data(
    df=parts_export_df,
    country_col='Partner',
    value_col='Export_value',
    cols_to_keep=required_cols,
)
cleaned_parts_export_df.head(5)

French S. Antarctic Terr not found in regex
Netherlands Antilles not found in regex


Unnamed: 0,standardized_country,Category,year,Export_value
0,Mexico,Parts,2008,13971074278
1,Canada,Parts,2008,28258965666
2,Australia,Parts,2008,958541210
3,Germany,Parts,2008,1706706846
4,China,Parts,2008,913973328


#### Merge Export dataframes for passenger vehicles, trucks and parts

In [47]:
# Stack the three cleaned export frames (passenger vehicles, trucks, parts)
# on top of each other; ignore_index renumbers the rows from 0.
merged_exports_df = pd.concat(
    objs=[
        cleaned_pv_export_df,
        cleaned_trucks_export_df,
        cleaned_parts_export_df,
    ],
    ignore_index=True,
)

merged_exports_df.head()

Unnamed: 0,standardized_country,Category,year,Export_value
0,Canada,Passenger Vehicles,2008,17859000199
1,Germany,Passenger Vehicles,2008,7271930736
2,Mexico,Passenger Vehicles,2008,4053831183
3,China,Passenger Vehicles,2008,681596584
4,United Arab Emirates,Passenger Vehicles,2008,1799478615


In [48]:
# Check all unique years in the merged dataframe (the values are strings,
# because they come from the melted year column headers)
merged_exports_df['year'].unique()

array(['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015',
       '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023',
       '2024'], dtype=object)

#### Now load csv files for PV, trucks and parts imports.
#### Then merge the cleaned dataframes to get a single dataframe for imports value

In [49]:
# Passenger-vehicle import values (tab-separated, UTF-16 encoded file).
pv_import_df = pd.read_csv(
    "./../../data/raw/primary/New_PV_Countries_Import_Value.csv",
    sep="\t",
    header=0,
    encoding='utf-16',
)

# Tag the rows with their product category before cleaning.
pv_import_df["Category"] = "Passenger Vehicles"

# Identifier columns plus one column per year from 2008 to 2024.
required_cols = get_cols_to_keep(
    start_year=2008,
    end_year=2024,
    country_col='standardized_country',
    category_col='Category',
)

# Standardize partner names and reshape to long format.
cleaned_pv_import_df = clean_primary_data(
    df=pv_import_df,
    country_col='Partner',
    value_col='Import_value',
    cols_to_keep=required_cols,
)

cleaned_pv_import_df.head(5)


Unnamed: 0,standardized_country,Category,year,Import_value
0,Mexico,Passenger Vehicles,2008,19795290562
1,Japan,Passenger Vehicles,2008,40975127629
2,South Korea,Passenger Vehicles,2008,7413089740
3,Canada,Passenger Vehicles,2008,33575452102
4,Germany,Passenger Vehicles,2008,18282476127


In [50]:
# Parts import values (tab-separated, UTF-16 encoded file).
parts_import_df = pd.read_csv(
    "./../../data/raw/primary/Parts_Imports_Countries.csv",
    sep="\t",
    header=0,
    encoding='utf-16',
)

# Tag the rows with their product category before cleaning.
parts_import_df["Category"] = "Parts"

# Standardize partner names and reshape to long format.
cleaned_parts_import_df = clean_primary_data(
    df=parts_import_df,
    country_col='Partner',
    value_col='Import_value',
    cols_to_keep=required_cols,
)

cleaned_parts_import_df.head(5)

French S. Antarctic Terr not found in regex
Netherlands Antilles not found in regex


Unnamed: 0,standardized_country,Category,year,Import_value
0,Mexico,Parts,2008,25654723740
1,Canada,Parts,2008,16126706233
2,China,Parts,2008,8877534735
3,Japan,Parts,2008,13157308988
4,South Korea,Parts,2008,3893417871


In [51]:
# Medium/heavy truck import values (tab-separated, UTF-16 encoded file).
trucks_import_df = pd.read_csv(
    "./../../data/raw/primary/MediumHeavy_Import_Countries_Value.csv",
    sep="\t",
    header=0,
    encoding='utf-16',
)

# Tag the rows with their product category before cleaning.
trucks_import_df["Category"] = "Trucks"

# Standardize partner names and reshape to long format.
cleaned_trucks_import_df = clean_primary_data(
    df=trucks_import_df,
    country_col='Partner',
    value_col='Import_value',
    cols_to_keep=required_cols,
)

cleaned_trucks_import_df.head(5)

Unnamed: 0,standardized_country,Category,year,Import_value
0,Mexico,Trucks,2008,2248425964
1,Canada,Trucks,2008,2864371851
2,Japan,Trucks,2008,298780311
3,Germany,Trucks,2008,187825244
4,Türkiye,Trucks,2008,1386000


#### Merge Import dataframes for passenger vehicles, trucks and parts

In [52]:
# Stack the three cleaned import frames (passenger vehicles, trucks, parts)
# on top of each other; ignore_index renumbers the rows from 0.
merged_imports_df = pd.concat(
    objs=[
        cleaned_pv_import_df,
        cleaned_trucks_import_df,
        cleaned_parts_import_df,
    ],
    ignore_index=True,
)

merged_imports_df.head()

Unnamed: 0,standardized_country,Category,year,Import_value
0,Mexico,Passenger Vehicles,2008,19795290562
1,Japan,Passenger Vehicles,2008,40975127629
2,South Korea,Passenger Vehicles,2008,7413089740
3,Canada,Passenger Vehicles,2008,33575452102
4,Germany,Passenger Vehicles,2008,18282476127


In [53]:
# Join imports and exports on country, year and category. The inner join keeps
# only key combinations that appear in both frames.
# NOTE(review): this alone does not guarantee NaN-free value columns, because
# clean_primary_data only drops rows that are empty for every year.
# NOTE(review): the suffixes only affect overlapping non-key column names;
# Import_value and Export_value do not overlap, so they look like a no-op here.
primary_df = merged_imports_df.merge(
    merged_exports_df,
    how="inner",
    on=["standardized_country", "year", "Category"],
    suffixes=("_import", "_export"),
)

primary_df.head()

Unnamed: 0,standardized_country,Category,year,Import_value,Export_value
0,Mexico,Passenger Vehicles,2008,19795290562,4053831183
1,Japan,Passenger Vehicles,2008,40975127629,388126905
2,South Korea,Passenger Vehicles,2008,7413089740,228776236
3,Canada,Passenger Vehicles,2008,33575452102,17859000199
4,Germany,Passenger Vehicles,2008,18282476127,7271930736


In [54]:
# Persist the combined import/export table (without the row index) so it can
# be merged with the secondary data.
primary_df.to_csv(
    path_or_buf="./../../data/processed/cleaned_primary_trade_data.csv",
    index=False,
)