### CLEANING DATA

In [8]:
import pandas as pd

# Read the prepared dataset; the file uses ';' as its field separator
data = pd.read_csv(
    'dataset/prepared_data.csv',
    sep=";",
)


In [9]:
# Detecting outliers using IQR
def detect_outliers(
    df: pd.DataFrame, column: str, whisker: float = 1.5
) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below
    ``Q1 - whisker * IQR`` or strictly above ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Args:
        df: Frame to inspect; it is not modified.
        column: Name of the column to test.
        whisker: Multiplier applied to the IQR to build the fences.
            Defaults to 1.5.

    Returns:
        The subset of ``df`` (all columns, original index) flagged as
        outliers. Rows whose value is NaN are never flagged, because both
        comparisons evaluate to False for NaN.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    # Two explicit comparisons (rather than ~between) keep NaN rows out.
    return df[(values < lower_bound) | (values > upper_bound)]

# Collect the outlier rows for each indicator, then stack them in one frame
outliers_gdp, outliers_co2 = (
    detect_outliers(data, indicator) for indicator in ('gdp', 'co2')
)
appended_data = pd.concat([outliers_gdp, outliers_co2])

In [10]:
# REMOVING OUTLIERS
# Drop every row belonging to a country that appears in the outlier frame,
# i.e. a single outlying observation removes that country entirely.
# .copy() makes the result an independent frame, so the column assignments
# performed on `data` afterwards do not raise SettingWithCopyWarning.
data = data[~data['country'].isin(appended_data['country'])].copy()

# CLEANING 0 VALUES
# Only columns that contain at least one zero need processing
columns_with_zeros = data.columns[data.eq(0).any()]

for column in columns_with_zeros:
    # Mean of the column computed over its non-zero entries only
    mean_value = data.loc[data[column].ne(0), column].mean()
    # Every zero in the column is overwritten with that mean
    data[column] = data[column].replace(0, mean_value)


# CLEANING MISSING VALUES
# These columns are skipped: their NaNs are not filled
exclude_columns = ['country', 'year', 'iso_code']

# In every other column, NaNs are replaced by that column's own mean
columns_to_fill = [name for name in data.columns if name not in exclude_columns]
data[columns_to_fill] = data[columns_to_fill].fillna(data[columns_to_fill].mean())

# MERGE IN WORLD BANK CLASSIFICATION
# Keep only the join keys and the classification label
world_bank = pd.read_csv('dataset/world-bank-income-classification.csv')
world_bank = world_bank.loc[:, ['country', 'year', 'classification']]

# Inner join: rows without a matching (country, year) classification are dropped
data = data.merge(world_bank, on=['country', 'year'], how='inner')

# Write the cleaned result as a ';'-separated CSV without the index column
data.to_csv(
    'dataset/cleaned_data.csv',
    sep=";",
    index=False,
)