In [46]:
import pandas as pd
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer
import miceforest as mf
import numpy as np

In [47]:
df_train = pd.read_csv("./dataset/train_preprocessed_nooutliers.csv")
df_train_id = df_train.pop("Id")
df_target = df_train.pop("CO2 Emissions(g/km)")
df_train

Unnamed: 0,Fuel Consumption City (L/100Km),Fuel Consumption Hwy (L/100Km),Fuel Consumption Comb (L/100Km),Make,Vehicle Class,Transmission,Fuel Type,Engine Size(L),Cylinders
0,8.640000,6.140000,7.514791,FOLD,PICKUP TRUCK - STANDARD,A6,X,3.5,6.0
1,27.270000,30.760000,28.840000,CHEVO,PICKUP TRUCK - STANDARD,A6,E,5.3,
2,1.700000,2.030000,,BMV,SUBCOMPACT,M6,Z,4.4,
3,15.337423,15.900000,,KIO,SUV - SMALL,AS6,X,,4.0
4,20.150000,6.000000,13.800000,BARUSU,MINICOMPACT,,Z,3.0,6.0
...,...,...,...,...,...,...,...,...,...
137136,33.710000,7.698229,22.026432,BMV,SUBCOMPACT,AS8,Z,,8.0
137137,6.489293,0.870000,3.961175,GONDA,STATION WAGON - SMALL,M6,X,1.5,4.0
137138,2.990000,8.380000,5.412550,NIRRAN,MID-SIZE,AV,X,1.8,4.0
137139,11.480000,0.221408,6.400000,TOYOTI,COMPACT,,X,1.8,4.0


In [48]:
df_test = pd.read_csv("./dataset/test_preprocessed.csv")
df_test_id = df_test.pop("Id")
df_test.rename(
    columns={
        "Fuel Consumption City": "Fuel Consumption City (L/100Km)",
        "Fuel Consumption Hwy": "Fuel Consumption Hwy (L/100Km)",
        "Fuel Consumption Comb": "Fuel Consumption Comb (L/100Km)",
    },
    inplace=True,
)
# Assuming df is your DataFrame
cols_to_move = [
    "Fuel Consumption City (L/100Km)",
    "Fuel Consumption Hwy (L/100Km)",
    "Fuel Consumption Comb (L/100Km)",
]

for col in reversed(cols_to_move):  # We reverse the list to keep the original order
    first_column = df_test.pop(col)  # Remove the column
    df_test.insert(0, col, first_column)  # Insert it in the first position

temp_df = df_test.pop("Transmission")
df_test.insert(5, "Transmission", temp_df)
temp_df = df_test.pop("Fuel Type")
df_test.insert(6, "Fuel Type", temp_df)

df_test

Unnamed: 0,Fuel Consumption City (L/100Km),Fuel Consumption Hwy (L/100Km),Fuel Consumption Comb (L/100Km),Make,Vehicle Class,Transmission,Fuel Type,Engine Size(L),Cylinders
0,15.625000,1.770000,9.393554,CADILUXE,MID-SIZE,AS8,X,3.6,
1,21.110000,14.380000,18.083183,FOLD,VAN - PASSENGER,AS6,X,3.7,6.0
2,24.501519,10.290747,18.107358,MITSU,SUV - SMALL,AV6,X,6.0,
3,16.800000,15.898251,16.420000,LECUS,MID-SIZE,AS8,Z,4.6,
4,41.630900,19.151254,31.520000,BMV,SUV - STANDARD,AS8,Z,4.4,8.0
...,...,...,...,...,...,...,...,...,...
58771,18.354841,20.900000,19.500000,KIO,SUV - SMALL,A6,X,3.3,
58772,6.189857,6.599736,6.373486,FOLD,,AS6,X,1.6,4.0
58773,14.492754,11.001618,12.923878,CHEVO,SUV - STANDARD,A6,X,3.5,
58774,8.421575,4.594035,6.700213,FOLD,,AS6,X,5.0,8.0


In [49]:
df_train.isna().sum()

Fuel Consumption City (L/100Km)     7210
Fuel Consumption Hwy (L/100Km)      7756
Fuel Consumption Comb (L/100Km)     7662
Make                                   0
Vehicle Class                       5132
Transmission                       10440
Fuel Type                           7419
Engine Size(L)                     26740
Cylinders                          21271
dtype: int64

In [50]:
df_test.isna().sum()

Fuel Consumption City (L/100Km)     2513
Fuel Consumption Hwy (L/100Km)      2598
Fuel Consumption Comb (L/100Km)     2765
Make                                   0
Vehicle Class                       2187
Transmission                        4505
Fuel Type                           3078
Engine Size(L)                     10618
Cylinders                           8886
dtype: int64

# Numerical Data Imputation

In [51]:
fuel_consumption_features = [
    "Fuel Consumption City (L/100Km)",
    "Fuel Consumption Hwy (L/100Km)",
    "Fuel Consumption Comb (L/100Km)",
    # "CO2 Emissions(g/km)",
]


engine_cylinders_features = [
    "Engine Size(L)",
    "Cylinders",
    "Fuel Consumption City (L/100Km)",
    "Fuel Consumption Hwy (L/100Km)",
    "Fuel Consumption Comb (L/100Km)",
]

# ["Engine Size(L)", "Cylinders", "CO2 Emissions(g/km)", "Make", "Vehicle Class"] # with category features

# Mice Forest

In [52]:
def impute_data(df_train, df_test, column_features, n_iterations=10, random_state=42):
    # Select the features to impute
    df_train_to_impute = df_train[column_features]
    df_test_to_impute = df_test[column_features]

    # Create and fit the ImputationKernel on the training data
    kds = mf.ImputationKernel(
        df_train_to_impute, save_all_iterations=True, random_state=random_state
    )
    kds.mice(n_iterations)

    # Impute the training data
    df_train_imputed = kds.complete_data()

    # Impute the test data
    kds.impute_new_data(new_data=df_test_to_impute)

    # Replace the original columns in df_train and df_test with the imputed ones
    df_train.loc[:, column_features] = df_train_imputed
    df_test.loc[:, column_features] = df_test_to_impute

    return df_train, df_test

In [53]:
df_train_imputed, df_test_imputed = impute_data(
    df_train, df_test, fuel_consumption_features
)

df_train_imputed, df_test_imputed = impute_data(
    df_train_imputed, df_test_imputed, engine_cylinders_features
)

In [54]:
df_train_imputed

Unnamed: 0,Fuel Consumption City (L/100Km),Fuel Consumption Hwy (L/100Km),Fuel Consumption Comb (L/100Km),Make,Vehicle Class,Transmission,Fuel Type,Engine Size(L),Cylinders
0,8.640000,6.140000,7.514791,FOLD,PICKUP TRUCK - STANDARD,A6,X,3.5,6.0
1,27.270000,30.760000,28.840000,CHEVO,PICKUP TRUCK - STANDARD,A6,E,5.3,8.0
2,1.700000,2.030000,1.825151,BMV,SUBCOMPACT,M6,Z,4.4,8.0
3,15.337423,15.900000,15.546238,KIO,SUV - SMALL,AS6,X,2.5,4.0
4,20.150000,6.000000,13.800000,BARUSU,MINICOMPACT,,Z,3.0,6.0
...,...,...,...,...,...,...,...,...,...
137136,33.710000,7.698229,22.026432,BMV,SUBCOMPACT,AS8,Z,6.2,8.0
137137,6.489293,0.870000,3.961175,GONDA,STATION WAGON - SMALL,M6,X,1.5,4.0
137138,2.990000,8.380000,5.412550,NIRRAN,MID-SIZE,AV,X,1.8,4.0
137139,11.480000,0.221408,6.400000,TOYOTI,COMPACT,,X,1.8,4.0


In [55]:
df_test_imputed

Unnamed: 0,Fuel Consumption City (L/100Km),Fuel Consumption Hwy (L/100Km),Fuel Consumption Comb (L/100Km),Make,Vehicle Class,Transmission,Fuel Type,Engine Size(L),Cylinders
0,15.625000,1.770000,9.393554,CADILUXE,MID-SIZE,AS8,X,3.6,
1,21.110000,14.380000,18.083183,FOLD,VAN - PASSENGER,AS6,X,3.7,6.0
2,24.501519,10.290747,18.107358,MITSU,SUV - SMALL,AV6,X,6.0,
3,16.800000,15.898251,16.420000,LECUS,MID-SIZE,AS8,Z,4.6,
4,41.630900,19.151254,31.520000,BMV,SUV - STANDARD,AS8,Z,4.4,8.0
...,...,...,...,...,...,...,...,...,...
58771,18.354841,20.900000,19.500000,KIO,SUV - SMALL,A6,X,3.3,
58772,6.189857,6.599736,6.373486,FOLD,,AS6,X,1.6,4.0
58773,14.492754,11.001618,12.923878,CHEVO,SUV - STANDARD,A6,X,3.5,
58774,8.421575,4.594035,6.700213,FOLD,,AS6,X,5.0,8.0


# Categorical Imputation

In [56]:
# take features to be used for imputation

categorical_features = [
        "Make",
        "Vehicle Class",
        "Transmission",
        "Fuel Type",
        "Engine Size(L)",
        "Cylinders",
    ]

In [63]:
for col in df_train_imputed.columns:
    if df_train_imputed[col].dtype == "object":
        df_train_imputed[col] = df_train_imputed[col].astype("category")

for col in df_test_imputed.columns:
    if df_test_imputed[col].dtype == "object":
        df_test_imputed[col] = df_test_imputed[col].astype("category")

In [64]:
df_test_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58776 entries, 0 to 58775
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   Fuel Consumption City (L/100Km)  56263 non-null  float64 
 1   Fuel Consumption Hwy (L/100Km)   56178 non-null  float64 
 2   Fuel Consumption Comb (L/100Km)  56011 non-null  float64 
 3   Make                             58776 non-null  category
 4   Vehicle Class                    56589 non-null  category
 5   Transmission                     54271 non-null  category
 6   Fuel Type                        55698 non-null  category
 7   Engine Size(L)                   48158 non-null  float64 
 8   Cylinders                        49890 non-null  float64 
dtypes: category(4), float64(5)
memory usage: 2.5 MB


In [65]:
df_test_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58776 entries, 0 to 58775
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   Fuel Consumption City (L/100Km)  56263 non-null  float64 
 1   Fuel Consumption Hwy (L/100Km)   56178 non-null  float64 
 2   Fuel Consumption Comb (L/100Km)  56011 non-null  float64 
 3   Make                             58776 non-null  category
 4   Vehicle Class                    56589 non-null  category
 5   Transmission                     54271 non-null  category
 6   Fuel Type                        55698 non-null  category
 7   Engine Size(L)                   48158 non-null  float64 
 8   Cylinders                        49890 non-null  float64 
dtypes: category(4), float64(5)
memory usage: 2.5 MB


In [66]:
# UserWarning: [Transmission,Fuel Type] have very rare categories, it is a good idea to group these, or set the min_data_in_leaf parameter to prevent lightgbm from outputting 0.0 probabilities.
# Might try the suggestion above

df_train_imputed, df_test_imputed = impute_data(
    df_train_imputed, df_test_imputed, categorical_features
)

  warn(


In [159]:
df_train_imputed.insert(0, "Id", df_train_id)
df_test_imputed.insert(0, 'Id', df_test_id)

df_train_imputed.to_csv(
    "./dataset/train_preprocessed_imputed_unpreprocessed_nooutliers.csv", index=False
)

df_test_imputed.to_csv(
    "./dataset/test_preprocessed_imputed_unpreprocessed_nooutliers.csv", index=False
)