In [21]:
import pandas as pd

In [22]:
# Load the outlier-cleaned training split and set aside the row identifier
# and the regression target so that only feature columns remain in df_train.
df_train = pd.read_csv(filepath_or_buffer="../dataset/train_cleaned_outliers.csv")
df_train_id, df_train_target = (
    df_train.pop(name) for name in ("Id", "CO2 Emissions(g/km)")
)

# The test split has no target column here; only its identifier is set aside.
df_test = pd.read_csv(filepath_or_buffer="../dataset/test_cleaned_outliers.csv")
df_test_id = df_test.pop("Id")

In [23]:
# Cast every object-typed column of both splits to pandas' category dtype
# before the frames are handed to the imputation kernels.
# NOTE(review): categories are inferred separately for each frame, so train and
# test may end up with different category sets — confirm the kernels accept that.
for frame in (df_train, df_test):
    for col in frame.columns:
        if frame[col].dtype == "object":
            frame[col] = frame[col].astype("category")

In [24]:
# Fuel-consumption group. The target "CO2 Emissions(g/km)" is intentionally not
# listed; some pipeline cells append it explicitly for the training split.
fuel_consumption_features = [
    "Fuel Consumption City",
    "Fuel Consumption Hwy",
    "Fuel Consumption Comb",
]

# Engine/cylinders group: the engine columns plus "Make" and the three
# fuel-consumption columns. "Vehicle Class", "Vehicle Type" and the target
# are currently left out.
engine_cylinders_features = [
    "Engine Size(L)",
    "Cylinders",
    "Make",
    *fuel_consumption_features,
]

# Earlier variant with category features, kept for reference:
# ["Engine Size(L)", "Cylinders", "CO2 Emissions(g/km)", "Make", "Vehicle Class"]

In [25]:
import miceforest as mf


def train_imputation_kernel(df, column_features, n_iterations=10, random_state=42):
    """Fit a miceforest ``ImputationKernel`` on a subset of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; only ``column_features`` are passed to the kernel.
    column_features : list of str
        Columns the kernel is built on.
    n_iterations : int, default 10
        Number of iterations passed to ``mice``.
    random_state : int, default 42
        Seed forwarded to the kernel.

    Returns
    -------
    The kernel object after ``mice`` has been run on it.
    """
    kernel = mf.ImputationKernel(
        df[column_features],
        train_nonmissing=True,
        random_state=random_state,
        save_all_iterations=True,
    )
    kernel.mice(n_iterations)
    return kernel


def impute_data(df, column_features, kds=None, n_iterations=10, random_state=42):
    """Fill missing values in ``column_features`` of ``df`` with a miceforest kernel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``column_features``.
    column_features : list of str
        Columns handed to the kernel and overwritten with its completed values.
    kds : optional
        Already-fitted kernel exposing ``impute_new_data``. When ``None``, a
        new kernel is trained on ``df`` via ``train_imputation_kernel``.
    n_iterations : int, default 10
        Forwarded to ``train_imputation_kernel``; only used when ``kds`` is ``None``.
    random_state : int, default 42
        Forwarded to ``train_imputation_kernel``; only used when ``kds`` is ``None``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``column_features`` hold the completed data.
        The frame passed in is left unmodified.
    """
    # Select the features to impute
    df_to_impute = df[column_features]

    if kds is None:
        # If no existing ImputationKernel is provided, train a new one
        kds = train_imputation_kernel(
            df, column_features, n_iterations=n_iterations, random_state=random_state
        )

    # Impute the data
    df_imputed = kds.impute_new_data(new_data=df_to_impute).complete_data()

    # Write into a copy so the caller's frame is not silently modified; this
    # keeps notebook cells that call the function re-runnable.
    result = df.copy()
    result.loc[:, column_features] = df_imputed

    return result


def impute_train_test_data(
    df_train, df_test, column_features, n_iterations=10, random_state=42
):
    """Impute ``column_features`` in both splits with one kernel fit on train.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Splits that both contain every column named in ``column_features``.
    column_features : list of str
        Columns the kernel is built on and that get overwritten.
    n_iterations : int, default 10
        Number of iterations passed to ``mice``.
    random_state : int, default 42
        Seed forwarded to the kernel.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` copies with the selected columns replaced by the
        completed data. The frames passed in are left unmodified.
    """
    # Create and fit the ImputationKernel on the training data
    kernel = mf.ImputationKernel(
        df_train[column_features],
        save_all_iterations=True,
        random_state=random_state,
        # Same setting as train_imputation_kernel: also fit models for columns
        # that are complete in train, so gaps that only occur in the test
        # split can still be imputed.
        train_nonmissing=True,
    )
    kernel.mice(n_iterations)

    # Training rows come straight from the kernel; test rows are new data
    df_train_imputed = kernel.complete_data()
    df_test_imputed = kernel.impute_new_data(
        new_data=df_test[column_features]
    ).complete_data()

    # Write into copies so the caller's frames are not silently modified
    train_out = df_train.copy()
    test_out = df_test.copy()
    train_out.loc[:, column_features] = df_train_imputed
    test_out.loc[:, column_features] = df_test_imputed

    return train_out, test_out

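# Pipeline 3: per feature group, impute train with the target as an extra predictor, then impute test with a kernel refit on the imputed train features (writes *_imputed_3.csv)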

In [26]:
# Fuel-consumption group. The target "CO2 Emissions(g/km)" is intentionally not
# listed; the pipeline 3 cells append it for the training split only.
fuel_consumption_features = [
    "Fuel Consumption City",
    "Fuel Consumption Hwy",
    "Fuel Consumption Comb",
]

# Engine/cylinders group: the engine columns plus "Make" and the three
# fuel-consumption columns. "Vehicle Class", "Vehicle Type" and the target
# are currently left out.
engine_cylinders_features = [
    "Engine Size(L)",
    "Cylinders",
    "Make",
    *fuel_consumption_features,
]

# Earlier variant with category features, kept for reference:
# ["Engine Size(L)", "Cylinders", "CO2 Emissions(g/km)", "Make", "Vehicle Class"]

# Group used for the categorical imputation step.
# NOTE(review): "Engine Size(L)" and "Cylinders" are listed here although the
# name suggests categorical columns only — presumably extra predictors; confirm.
categorical_features = [
    "Vehicle Class",
    "Transmission",
    "Fuel Type",
    "Make",
    "Engine Size(L)",
    "Cylinders",
]

In [27]:
# Train split: re-attach the target so it is part of the imputed column set,
# impute the fuel-consumption group together with it, then detach it again.
# The target column itself is overwritten with the kernel's completed values.
df_train = impute_data(
    pd.concat([df_train, df_train_target], axis=1),
    [*fuel_consumption_features, "CO2 Emissions(g/km)"],
)
df_train_target = df_train.pop("CO2 Emissions(g/km)")

# Test split: no target is passed, so a second kernel is fit on the imputed
# train features alone and applied to the test rows.
kds = train_imputation_kernel(df_train, fuel_consumption_features)
df_test = impute_data(df_test, column_features=fuel_consumption_features, kds=kds)

In [28]:
# Train split: re-attach the target so it is part of the imputed column set,
# impute the engine/cylinders group together with it, then detach it again.
df_train = impute_data(
    pd.concat([df_train, df_train_target], axis=1),
    [*engine_cylinders_features, "CO2 Emissions(g/km)"],
)
df_train_target = df_train.pop("CO2 Emissions(g/km)")

# Test split: fit a kernel on the imputed train features (without the target)
# and use it to fill the test rows.
kds = train_imputation_kernel(df_train, engine_cylinders_features)
df_test = impute_data(df_test, column_features=engine_cylinders_features, kds=kds)

In [29]:
# Train split: re-attach the target so it is part of the imputed column set,
# impute the categorical group together with it, then detach it again.
df_train = impute_data(
    pd.concat([df_train, df_train_target], axis=1),
    [*categorical_features, "CO2 Emissions(g/km)"],
)
df_train_target = df_train.pop("CO2 Emissions(g/km)")

# Test split: fit a kernel on the imputed train features (without the target)
# and use it to fill the test rows.
kds = train_imputation_kernel(df_train, categorical_features)
df_test = impute_data(df_test, column_features=categorical_features, kds=kds)

  warn(
  warn(


In [None]:
# Turn category columns back into plain object columns in both splits before
# the feature-engineering helpers and the fillna(-1) step that follow.
for frame in (df_train, df_test):
    for col in frame.columns:
        if frame[col].dtype == "category":
            frame[col] = frame[col].astype("object")

import functions as func

# Apply each project helper to the train split and then the test split, in the
# order transmission -> vehicle classes -> vehicle types.
for step in (
    func.process_transmission,
    func.group_vehicle_classes,
    func.group_vehicle_types,
):
    df_train = step(df_train)
    df_test = step(df_test)

# Anything still missing after imputation and feature engineering is marked
# with the sentinel value -1.
df_train, df_test = df_train.fillna(value=-1), df_test.fillna(value=-1)

# Restore the identifier as the leading column of both splits.
# Note: insert() fails if "Id" is already present, so this cell is not re-runnable.
df_test.insert(0, "Id", df_test_id)
df_train.insert(0, "Id", df_train_id)

# Only the training split gets its target back, appended as the last column.
df_train = pd.concat([df_train, df_train_target], axis=1)

# Persist the pipeline 3 result.
df_train.to_csv(path_or_buf="../dataset/train_cleaned_outliers_imputed_3.csv", index=False)
df_test.to_csv(path_or_buf="../dataset/test_cleaned_outliers_imputed_3.csv", index=False)

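# Pipeline 2: one kernel fit on the fully observed rows of train + test (all columns), applied to both splits (writes *_imputed_2.csv)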

In [15]:
# Pipeline 2 must start from the freshly loaded splits. Without this reload a
# top-to-bottom run would inherit the frames left behind by pipeline 3 ("Id"
# and the target already re-attached, NaN replaced by -1, category columns cast
# to object), and the later insert of "Id" would fail.
df_train = pd.read_csv("../dataset/train_cleaned_outliers.csv")
df_train_id = df_train.pop("Id")
df_train_target = df_train.pop("CO2 Emissions(g/km)")

df_test = pd.read_csv("../dataset/test_cleaned_outliers.csv")
df_test_id = df_test.pop("Id")

for frame in (df_train, df_test):
    for col in frame.columns:
        if frame[col].dtype == "object":
            frame[col] = frame[col].astype("category")

# Fit a single kernel on the rows of both splits that have no missing value.
# ignore_index avoids duplicate row labels after stacking the two frames.
df_for_training = pd.concat([df_train, df_test], axis=0, ignore_index=True).dropna()

# Drop category levels that only occurred in the discarded rows.
df_for_training = df_for_training.apply(
    lambda x: x.cat.remove_unused_categories() if x.dtype.name == "category" else x
)
kds = train_imputation_kernel(df_for_training, df_for_training.columns.to_list())

  warn(


In [21]:
# Give both splits exactly the column dtypes the kernel was trained on
# (this also carries over the pruned category sets of df_for_training).
df_train, df_test = (
    split.astype(df_for_training.dtypes) for split in (df_train, df_test)
)

In [22]:
# Apply the shared kernel to every column of both splits.
df_train, df_test = (
    impute_data(split, df_for_training.columns.to_list(), kds)
    for split in (df_train, df_test)
)

In [27]:
# Turn category columns back into plain object columns in both splits before
# the feature-engineering helpers and the fillna(-1) step that follow.
for frame in (df_train, df_test):
    for col in frame.columns:
        if frame[col].dtype == "category":
            frame[col] = frame[col].astype("object")

In [28]:
import functions as func

# Apply each project helper to the train split and then the test split, in the
# order transmission -> vehicle classes -> vehicle types.
for step in (
    func.process_transmission,
    func.group_vehicle_classes,
    func.group_vehicle_types,
):
    df_train = step(df_train)
    df_test = step(df_test)

In [None]:
# Anything still missing after imputation and feature engineering is marked
# with the sentinel value -1.
df_train, df_test = df_train.fillna(value=-1), df_test.fillna(value=-1)

In [None]:
# Restore the identifier as the leading column of both splits.
# Note: insert() fails if "Id" is already present, so this cell is not re-runnable.
df_test.insert(0, "Id", df_test_id)
df_train.insert(0, "Id", df_train_id)

# Only the training split gets its target back, appended as the last column.
df_train = pd.concat([df_train, df_train_target], axis=1)

# Persist the pipeline 2 result.
df_train.to_csv(path_or_buf="../dataset/train_cleaned_outliers_imputed_2.csv", index=False)
df_test.to_csv(path_or_buf="../dataset/test_cleaned_outliers_imputed_2.csv", index=False)

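# Pipeline 1: fuel-consumption and categorical groups use a kernel fit on train only; the engine/cylinders group additionally uses the target for the train split (writes *_imputed.csv)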

In [69]:
# Pipeline 1 must start from the freshly loaded splits. Without this reload a
# top-to-bottom run would inherit the frames already finished by the pipelines
# above ("Id" and the target re-attached, NaN replaced by -1, category columns
# cast to object), and the later insert of "Id" would fail.
df_train = pd.read_csv("../dataset/train_cleaned_outliers.csv")
df_train_id = df_train.pop("Id")
df_train_target = df_train.pop("CO2 Emissions(g/km)")

df_test = pd.read_csv("../dataset/test_cleaned_outliers.csv")
df_test_id = df_test.pop("Id")

for frame in (df_train, df_test):
    for col in frame.columns:
        if frame[col].dtype == "object":
            frame[col] = frame[col].astype("category")

# Fuel-consumption group: kernel fit on train (target not included), then
# applied to both splits.
df_train, df_test = impute_train_test_data(
    df_train, df_test, fuel_consumption_features
)

In [70]:
# Train split: re-attach the target so it is part of the imputed column set,
# impute the engine/cylinders group together with it, then detach it again.
df_train = impute_data(
    pd.concat([df_train, df_train_target], axis=1),
    [*engine_cylinders_features, "CO2 Emissions(g/km)"],
)
df_train_target = df_train.pop("CO2 Emissions(g/km)")

# Test split: fit a kernel on the imputed train features (without the target)
# and use it to fill the test rows.
kds = train_imputation_kernel(df_train, engine_cylinders_features)
df_test = impute_data(df_test, column_features=engine_cylinders_features, kds=kds)

In [71]:
# Column group for the categorical imputation step of pipeline 1
# (same members as the pipeline 3 list, with "Make" moved to the front).
categorical_features = [
    "Make", "Vehicle Class", "Transmission", "Fuel Type",
    "Engine Size(L)", "Cylinders",
]

In [72]:
# Categorical group: kernel fit on train (target not included), applied to both splits.
df_train, df_test = impute_train_test_data(
    df_train=df_train,
    df_test=df_test,
    column_features=categorical_features,
)

  warn(


In [73]:
# Per-column count of values still missing in the training split; columns
# outside the imputed feature groups can still show gaps at this point.
df_train.isna().sum(axis=0)

Make                        0
Vehicle Class               0
Engine Size(L)              0
Cylinders                   0
Transmission                0
Fuel Type                   0
Fuel Consumption City       0
Fuel Consumption Hwy        0
Fuel Consumption Comb       0
Transmission_Type        1483
Gears                    4646
Vehicle Class General    2051
Vehicle Type                0
is_outlier                  0
dtype: int64

In [74]:
# Per-column count of values still missing in the test split; columns
# outside the imputed feature groups can still show gaps at this point.
df_test.isna().sum(axis=0)

Make                        0
Vehicle Class               0
Engine Size(L)              0
Cylinders                   0
Transmission                0
Fuel Type                   0
Fuel Consumption City       0
Fuel Consumption Hwy        0
Fuel Consumption Comb       0
Transmission_Type         613
Gears                    1987
Vehicle Class General     863
Vehicle Type                0
is_outlier                  0
dtype: int64

In [78]:
# Turn category columns back into plain object columns in both splits before
# the feature-engineering helpers and the fillna(-1) step that follow.
for frame in (df_train, df_test):
    for col in frame.columns:
        if frame[col].dtype == "category":
            frame[col] = frame[col].astype("object")

In [82]:
import functions as func

# Apply each project helper to the train split and then the test split, in the
# order transmission -> vehicle classes -> vehicle types.
for step in (
    func.process_transmission,
    func.group_vehicle_classes,
    func.group_vehicle_types,
):
    df_train = step(df_train)
    df_test = step(df_test)

In [85]:
# Anything still missing after imputation and feature engineering is marked
# with the sentinel value -1.
df_train, df_test = df_train.fillna(value=-1), df_test.fillna(value=-1)

In [94]:
# Restore the identifier as the leading column of both splits.
# Note: insert() fails if "Id" is already present, so this cell is not re-runnable.
df_test.insert(0, "Id", df_test_id)
df_train.insert(0, "Id", df_train_id)

# Only the training split gets its target back, appended as the last column.
df_train = pd.concat([df_train, df_train_target], axis=1)

# Persist the pipeline 1 result.
df_train.to_csv(path_or_buf="../dataset/train_cleaned_outliers_imputed.csv", index=False)
df_test.to_csv(path_or_buf="../dataset/test_cleaned_outliers_imputed.csv", index=False)