## Data Pre-Processing

In [2]:
import pandas as pd

### Checking and Handling Missing Values

In [4]:
# reads Agrofood_co2_emission.csv into a DataFrame
df = pd.read_csv("Agrofood_co2_emission.csv")

# flags every column that holds at least one null, then keeps the flagged names
has_missing = df.isna().any()
columns_to_fill = has_missing[has_missing].index.tolist()

# reports how many values are missing in each flagged column
print("missing values before handling:\n")
missing_before = df[columns_to_fill].isna().sum()
print(missing_before)

missing values before handling:

Savanna fires                        31
Forest fires                         93
Crop Residues                      1389
Forestland                          493
Net Forest conversion               493
Food Household Consumption          473
IPPU                                743
Manure applied to Soils             928
Manure Management                   928
Fires in humid tropical forests     155
On-farm energy use                  956
dtype: int64


In [5]:
# per-column medians of the incomplete columns (non-numeric columns are skipped)
median_values = df.loc[:, columns_to_fill].median(numeric_only=True)

# replaces each null with the median of its own column
imputed = df[columns_to_fill].fillna(value=median_values)
df[columns_to_fill] = imputed

# reports how many nulls remain in the same columns
print("\nmissing values after handling:\n")
missing_after = df[columns_to_fill].isna().sum()
print(missing_after)


missing values after handling:

Savanna fires                      0
Forest fires                       0
Crop Residues                      0
Forestland                         0
Net Forest conversion              0
Food Household Consumption         0
IPPU                               0
Manure applied to Soils            0
Manure Management                  0
Fires in humid tropical forests    0
On-farm energy use                 0
dtype: int64


### Verifying Formatting of Continuous and Categorical Features

In [7]:
# casts 'Area' and 'Year' to the pandas category dtype in a single call
df = df.astype({'Area': 'category', 'Year': 'category'})

# lists the resulting dtype of every column
print("\ndata types after conversion:\n")
column_dtypes = df.dtypes
print(column_dtypes)


data types after conversion:

Area                               category
Year                               category
Savanna fires                       float64
Forest fires                        float64
Crop Residues                       float64
Rice Cultivation                    float64
Drained organic soils (CO2)         float64
Pesticides Manufacturing            float64
Food Transport                      float64
Forestland                          float64
Net Forest conversion               float64
Food Household Consumption          float64
Food Retail                         float64
On-farm Electricity Use             float64
Food Packaging                      float64
Agrifood Systems Waste Disposal     float64
Food Processing                     float64
Fertilizers Manufacturing           float64
IPPU                                float64
Manure applied to Soils             float64
Manure left on Pasture              float64
Manure Management                   float64
F

In [8]:
# counts distinct values per column, smallest first, to spot categorical candidates
unique_counts = df.nunique().sort_values(ascending=True)
print("\nnumber of unique values per column:\n")
print(unique_counts)


number of unique values per column:

Year                                 31
Fires in organic soils              126
Area                                236
Net Forest conversion               442
Forestland                          605
Pesticides Manufacturing           1310
Fires in humid tropical forests    1736
Drained organic soils (CO2)        2147
Forest fires                       2962
Fertilizers Manufacturing          2973
Food Processing                    3542
Rice Cultivation                   3635
Food Packaging                     3704
Savanna fires                      3747
On-farm Electricity Use            5322
Crop Residues                      5359
On-farm energy use                 5558
Manure left on Pasture             5922
Manure applied to Soils            5927
Manure Management                  5931
IPPU                               6027
Average Temperature °C             6139
Food Household Consumption         6359
Agrifood Systems Waste Disposal    6419
Fo

### Checking and Handling Duplicates and Outliers

In [10]:
# counts rows that exactly repeat an earlier row
is_duplicate = df.duplicated()
duplicate_count = is_duplicate.sum()
print(f"\nnumber of duplicate rows: {duplicate_count}")

# keeps the first occurrence of each row and discards any repeats
df = df.drop_duplicates(keep='first')



number of duplicate rows: 0


In [11]:
# restricts the scan to float64 / int64 columns
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# maps each numeric column to the number of rows outside its IQR fences
outlier_counts = {}

# applies the 1.5 * IQR rule column by column
for col in numeric_cols:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # a boolean mask is enough for counting; no filtered frame is needed
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outlier_counts[col] = int(is_outlier.sum())

# prints the counts, largest first (ties keep column order)
print("\nnumber of outliers per numeric column (using IQR method):\n")
ranked_counts = sorted(outlier_counts.items(), key=lambda item: item[1], reverse=True)
for col, count in ranked_counts:
    print(f"{col}: {count}")


number of outliers per numeric column (using IQR method):

Fires in humid tropical forests: 1607
Savanna fires: 1439
Forest fires: 1412
Forestland: 1364
Drained organic soils (CO2): 1361
Food Packaging: 1358
Net Forest conversion: 1271
total_emission: 1142
Food Household Consumption: 1129
IPPU: 1077
Manure Management: 1051
Crop Residues: 1048
On-farm energy use: 1037
Agrifood Systems Waste Disposal: 1028
Manure applied to Soils: 1010
Urban population: 1001
Pesticides Manufacturing: 946
Rice Cultivation: 923
On-farm Electricity Use: 917
Manure left on Pasture: 911
Food Transport: 890
Food Retail: 871
Total Population - Female: 858
Food Processing: 851
Total Population - Male: 826
Fertilizers Manufacturing: 778
Rural population: 758
Fires in organic soils: 309
Average Temperature °C: 155


### Creating Additional Categorical Features

#### Temperature Category
Based on the Average Temperature °C column, we divided observations into three categories using the 33rd and 66th percentiles:

- Cold: Temperature at or below the 33rd percentile

- Moderate: Temperature above the 33rd percentile and at or below the 66th percentile

- Hot: Temperature above the 66th percentile

#### Population Size
We calculated total population as the sum of Total Population - Male and Total Population - Female, then categorized each observation (row) as:

- Small: Population at or below the 33rd percentile

- Medium: Population above the 33rd percentile and at or below the 66th percentile

- Large: Population above the 66th percentile

In [13]:
# total population = male + female head counts on the same row
male_pop = df['Total Population - Male']
female_pop = df['Total Population - Female']
df['Total Population'] = male_pop.add(female_pop)

# temperature: cut points at the 0.33 and 0.66 quantiles, open-ended outer bins
temp_quantiles = df['Average Temperature °C'].quantile([0.33, 0.66])
temp_edges = [float('-inf'), temp_quantiles[0.33], temp_quantiles[0.66], float('inf')]
df['Temperature Category'] = pd.cut(
    df['Average Temperature °C'], bins=temp_edges, labels=['Cold', 'Moderate', 'Hot']
)

# population size: same scheme applied to the total population column
pop_quantiles = df['Total Population'].quantile([0.33, 0.66])
pop_edges = [float('-inf'), pop_quantiles[0.33], pop_quantiles[0.66], float('inf')]
df['Population Size'] = pd.cut(
    df['Total Population'], bins=pop_edges, labels=['Small', 'Medium', 'Large']
)

# shows how many rows landed in each temperature category
print("\ntemperature category distribution:\n")
temperature_counts = df['Temperature Category'].value_counts()
print(temperature_counts)


temperature category distribution:

Temperature Category
Hot         2368
Cold        2299
Moderate    2298
Name: count, dtype: int64


In [14]:
# shows how many rows landed in each population size category
population_size_counts = df['Population Size'].value_counts()
print("\npopulation size category distribution:\n")
print(population_size_counts)


population size category distribution:

Population Size
Large     2368
Small     2299
Medium    2298
Name: count, dtype: int64


In [26]:
# saves the cleaned dataset to a CSV file
# NOTE: the export below is commented out, so running this cell writes nothing;
# uncomment it to write agrofood_co2_emission_cleaned.csv (without the index column)
#df.to_csv("agrofood_co2_emission_cleaned.csv", index=False)