In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the train and test tables from disk.
df_train = pd.read_csv("../dataset/train_cleaned_outliers_imputed.csv")
df_test = pd.read_csv("../dataset/test_cleaned_outliers_imputed.csv")

# Detach the row identifiers from both tables so they are not used as features.
df_train_id, df_test_id = df_train.pop("Id"), df_test.pop("Id")

# Detach the regression target; only the training table carries it.
df_train_target = df_train.pop("CO2 Emissions(g/km)")

In [3]:
# Preview the training features; "Id" and the target column were popped off in the loading cell.
df_train

Unnamed: 0,Make,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City,Fuel Consumption Hwy,Fuel Consumption Comb,Transmission_Type,Gears,Vehicle Class General,Vehicle Type,is_outlier
0,MITSU,SUV - SMALL,1.5,4.0,AV8,X,11.904762,7.200000,9.800000,Automatic,8,SUV,SMALL,0
1,TOYOTI,PICKUP TRUCK - SMALL,4.0,6.0,A5,X,13.793103,9.700000,11.960000,Automatic,5,PICKUP TRUCK,SMALL,0
2,MATSUDA,COMPACT,2.0,4.0,AS6,X,10.204082,7.299251,8.894238,Automatic,6,COMPACT,COMPACT,0
3,CHEVO,VAN - PASSENGER,6.0,8.0,A6,X,17.300000,11.700000,14.780000,Automatic,6,VAN,PASSENGER,0
4,TOYOTI,COMPACT,1.8,4.0,M6,X,8.100000,7.899357,8.010000,Manual,6,COMPACT,COMPACT,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54932,CHEVO,SUBCOMPACT,6.2,8.0,AS10,Z,11.900000,8.802817,10.505341,Automatic,10,SUBCOMPACT,SUBCOMPACT,1
54933,CHEVO,SUBCOMPACT,3.6,6.0,M6,X,21.000000,10.600000,16.323011,Manual,6,SUBCOMPACT,SUBCOMPACT,0
54934,FOLD,TWO-SEATER,3.5,6.0,AM7,Z,18.200000,12.500000,15.630000,Automatic,7,TWO-SEATER,TWO-SEATER,0
54935,CHEVO,PICKUP TRUCK - STANDARD,6.2,8.0,A8,Z,18.300000,9.900000,14.520000,Automatic,8,PICKUP TRUCK,STANDARD,1


In [4]:
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder, OrdinalEncoder


def encode_categorical_features(
    df_train,
    df_test,
    categorical_features_onehot,
    categorical_features_binary,
    categorical_features_ordinal,
):
    """Encode three groups of categorical columns, fitting on train only.

    Every encoder is fitted on ``df_train`` and then applied unchanged to
    ``df_test``, so no information from ``df_test`` influences the encoding.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing every column named in the three feature groups.
        Neither frame is modified.
    categorical_features_onehot : sequence of str
        Columns encoded with scikit-learn's ``OneHotEncoder``.
    categorical_features_binary : sequence of str
        Columns encoded with category_encoders' ``BinaryEncoder``.
    categorical_features_ordinal : sequence of str
        Columns encoded with category_encoders' ``OrdinalEncoder``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames. The untouched columns come first, followed
        by the one-hot, binary and ordinal encoded columns, in that order.

    Notes
    -----
    A category that appears in ``df_test`` but not in ``df_train`` yields an
    all-zero group of one-hot columns instead of raising, which matters when
    a rare category lands only in the held-out fold. A feature group may be
    empty; it is then skipped.
    """
    # Accept any sequence (list, tuple, pandas Index) for the column groups.
    onehot_cols = list(categorical_features_onehot)
    binary_cols = list(categorical_features_binary)
    ordinal_cols = list(categorical_features_ordinal)
    encoded_cols = onehot_cols + binary_cols + ordinal_cols

    # Start from the columns that are passed through unchanged.
    train_parts = [df_train.drop(columns=encoded_cols)]
    test_parts = [df_test.drop(columns=encoded_cols)]

    if onehot_cols:
        encoder_onehot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        train_onehot = encoder_onehot.fit_transform(df_train[onehot_cols])
        test_onehot = encoder_onehot.transform(df_test[onehot_cols])
        onehot_names = encoder_onehot.get_feature_names_out(onehot_cols)

        # The encoder returns bare arrays; restore column names and the
        # original row index so the concat below lines up row-for-row.
        train_parts.append(
            pd.DataFrame(train_onehot, columns=onehot_names, index=df_train.index)
        )
        test_parts.append(
            pd.DataFrame(test_onehot, columns=onehot_names, index=df_test.index)
        )

    if binary_cols:
        encoder_binary = BinaryEncoder(cols=binary_cols)
        train_parts.append(encoder_binary.fit_transform(df_train[binary_cols]))
        test_parts.append(encoder_binary.transform(df_test[binary_cols]))

    if ordinal_cols:
        encoder_ordinal = OrdinalEncoder(cols=ordinal_cols)
        train_parts.append(encoder_ordinal.fit_transform(df_train[ordinal_cols]))
        test_parts.append(encoder_ordinal.transform(df_test[ordinal_cols]))

    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)

In [5]:
# Column dtypes and non-null counts of the raw (not yet encoded) feature table.
df_train.info()

# Which encoder each categorical column is routed to.
onehot_columns = [
    "Make",
    "Fuel Type",
    "Transmission_Type",
    "Vehicle Class General",
    "Gears",
    "is_outlier",
]
binary_columns = ["Vehicle Class", "Transmission"]
ordinal_columns = ["Vehicle Type"]

# Every float64 column is treated as a numerical feature (these are the columns that get scaled).
numerical_columns = list(df_train.select_dtypes(include="float64").columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54937 entries, 0 to 54936
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Make                   54937 non-null  object 
 1   Vehicle Class          54937 non-null  object 
 2   Engine Size(L)         54937 non-null  float64
 3   Cylinders              54937 non-null  float64
 4   Transmission           54937 non-null  object 
 5   Fuel Type              54937 non-null  object 
 6   Fuel Consumption City  54937 non-null  float64
 7   Fuel Consumption Hwy   54937 non-null  float64
 8   Fuel Consumption Comb  54937 non-null  float64
 9   Transmission_Type      54937 non-null  object 
 10  Gears                  54937 non-null  int64  
 11  Vehicle Class General  54937 non-null  object 
 12  Vehicle Type           54937 non-null  object 
 13  is_outlier             54937 non-null  int64  
dtypes: float64(5), int64(2), object(7)
memory usage: 5.9+ 

In [6]:
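# NOTE: encoding is intentionally not applied globally here; it is performed inside the training loops below.
# df_train, df_test = encode_categorical_features(df_train, df_test, onehot_columns, binary_columns, ordinal_columns)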

In [7]:
from sklearn.preprocessing import MinMaxScaler


def minmax_transform_dataframe(df_train, df_test, columns_to_transform):
    """Min-max scale the given columns, fitting the scaler on train only.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that both contain every column in ``columns_to_transform``.
        Neither input frame is modified.
    columns_to_transform : sequence of str
        Columns to scale. May be empty, in which case nothing is scaled.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``df_train`` and ``df_test`` with the selected columns
        replaced by their scaled values. Because the scaler is fitted on
        ``df_train`` only, scaled ``df_test`` values can fall outside [0, 1].
    """
    columns = list(columns_to_transform)

    # Work on copies so the caller's frames are left untouched and re-running
    # the calling cell does not scale already-scaled data.
    df_train = df_train.copy()
    df_test = df_test.copy()

    # The scaler cannot be fitted on zero columns; there is nothing to do.
    if not columns:
        return df_train, df_test

    scaler = MinMaxScaler()
    df_train[columns] = scaler.fit_transform(df_train[columns])
    df_test[columns] = scaler.transform(df_test[columns])

    return df_train, df_test

In [8]:
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
import lightgbm as lgb

# Single seed shared by every estimator below (and reused by the K-fold splitter later on).
random_state = 42
seeded = {"random_state": random_state}

# Base learners: library defaults apart from the seed.
model_lgbm = lgb.LGBMRegressor(**seeded)
model_xgb = XGBRegressor(**seeded)
model_adab = AdaBoostRegressor(**seeded)

# Stack the three base learners; no final_estimator is passed, so the library default is used.
base_estimators = [
    ("xgb", model_xgb),
    ("lgbm", model_lgbm),
    ("adab", model_adab),
]
model_ensemble = StackingRegressor(estimators=base_estimators)

In [9]:
import time

# from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

# Models to compare; each one is evaluated on the same K-fold splits.
models = [model_lgbm, model_xgb, model_adab, model_ensemble]

# Feature tables to evaluate (currently just one).
datasets = [
    df_train,
]

y = df_train_target

# results[model_name][dataset_key] -> {"rmse": mean RMSE, "training_time": mean fit seconds}
results = {}

# Shuffled 5-fold split; the fixed seed makes every model see identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)

for model in models:
    model_name = type(model).__name__
    results[model_name] = {}

    for i, dataset in enumerate(datasets):
        dataset_key = f"dataset_{i+1}"
        fold_rmse = []
        fold_training_time = []

        for train_index, test_index in kf.split(dataset):
            X_train, X_test = dataset.iloc[train_index], dataset.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Encoders and scaler are fitted on the training fold only and
            # then applied to the held-out fold.
            X_train, X_test = encode_categorical_features(
                X_train,
                X_test,
                onehot_columns,
                binary_columns,
                ordinal_columns,
            )
            X_train, X_test = minmax_transform_dataframe(
                X_train, X_test, numerical_columns
            )

            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by system clock adjustments the way time.time() can.
            start_time = time.perf_counter()
            model.fit(X_train, y_train)
            training_time = time.perf_counter() - start_time

            predictions = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, predictions))

            fold_rmse.append(rmse)
            fold_training_time.append(training_time)

        # Keep only the per-fold averages; key order matters because a later
        # cell labels the resulting columns positionally.
        results[model_name][dataset_key] = {
            "rmse": np.mean(fold_rmse),
            "training_time": np.mean(fold_training_time),
        }

        print(
            f"Model: {model_name}, Dataset: {i+1}, Mean RMSE: {results[model_name][dataset_key]['rmse']:.4f}, Mean Training time: {results[model_name][dataset_key]['training_time']:.4f} seconds"
        )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 43949, number of used features: 65
[LightGBM] [Info] Start training from score 246.828438
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 43949, number of used features: 65
[LightGBM] [Info] Start training from score 246.482036
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [10]:
# One small frame per model (rows = datasets, columns = metrics), stacked under the model name;
# the resulting (model, dataset) index is then flattened into ordinary columns.
per_model_frames = {name: pd.DataFrame(scores).T for name, scores in results.items()}
df_results = pd.concat(per_model_frames, axis=0).reset_index()

# Give the four columns readable headers.
df_results.columns = ["Model", "Dataset", "RMSE", "Training Time"]

df_results.to_csv("results_to_submit.csv")

# Best (lowest RMSE) model first.
df_sorted = df_results.sort_values(by="RMSE", ascending=True)
df_sorted

Unnamed: 0,Model,Dataset,RMSE,Training Time
3,StackingRegressor,dataset_1,19.722697,47.311763
1,XGBRegressor,dataset_1,19.812993,0.474668
0,LGBMRegressor,dataset_1,20.339575,0.738387
2,AdaBoostRegressor,dataset_1,32.541271,8.165611


# Train on the full training set and predict on the test set

In [11]:
import time
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Models to refit on the full training set.
models = [model_lgbm, model_xgb, model_adab, model_ensemble]

# Paired train/test feature tables (currently a single pair).
train_datasets = [df_train]
test_datasets = [df_test]

y = df_train_target

# results[model_name][dataset_key] -> {"predictions": array, "training_time": fit seconds}
results = {}

for model in models:
    model_name = type(model).__name__
    results[model_name] = {}

    for i, (train_dataset, test_dataset) in enumerate(
        zip(train_datasets, test_datasets)
    ):
        dataset_key = f"dataset_{i+1}"

        # Encoders and scaler are fitted on the training table only and then
        # applied to the test table.
        X_train, X_test = encode_categorical_features(
            train_dataset,
            test_dataset,
            onehot_columns,
            binary_columns,
            ordinal_columns,
        )
        X_train, X_test = minmax_transform_dataframe(X_train, X_test, numerical_columns)
        y_train = y

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        predictions = model.predict(X_test)

        results[model_name][dataset_key] = {
            "predictions": predictions,
            "training_time": training_time,
        }

        print(
            f"Model: {model_name}, Dataset: {i+1}, Training time: {results[model_name][dataset_key]['training_time']:.4f} seconds"
        )

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 54937, number of used features: 65
[LightGBM] [Info] Start training from score 246.688680
Model: LGBMRegressor, Dataset: 1, Training time: 1.0980 seconds
Model: XGBRegressor, Dataset: 1, Training time: 0.9959 seconds
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008421 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 54937, number of used features: 65
[LightGBM] [Info] Start training from score 246.688680
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003048 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enou

In [12]:
# Flatten results[model][dataset] into one record per (model, dataset) pair.
# A comprehension keeps the loop variables local, so the `datasets` list
# defined in the cross-validation cell is no longer overwritten by this cell.
flat_results = [
    {
        "model_name": model_name,
        "dataset_name": dataset_name,
        "predictions": metrics["predictions"],
        "training_time": metrics["training_time"],
    }
    for model_name, per_dataset in results.items()
    for dataset_name, metrics in per_dataset.items()
]

# One row per record; the "predictions" cell holds the whole prediction array.
df_results_submit = pd.DataFrame(flat_results)
df_results_submit

Unnamed: 0,model_name,dataset_name,predictions,training_time
0,LGBMRegressor,dataset_1,"[342.9385418176665, 196.41416644463698, 206.71...",1.098003
1,XGBRegressor,dataset_1,"[340.5571, 198.10645, 210.54256, 226.43877, 22...",0.995932
2,StackingRegressor,dataset_1,"[341.21726802417356, 198.22696093600297, 209.6...",58.529468


In [13]:
# Pull the stacked ensemble's test-set predictions out of the results dictionary.
stacking_run = results["StackingRegressor"]["dataset_1"]
stacking_regressor_predictions = stacking_run["predictions"]
stacking_regressor_predictions

array([341.21726802, 198.22696094, 209.6497149 , ..., 235.4348188 ,
       228.40777772, 333.49953058])

In [14]:
# Use the provided template so the submission has the expected columns.
df_sample_submission = pd.read_csv("../dataset/sample_submission.csv")

# The predictions are assigned by position, so the template rows must be in
# the same order as the test rows they were predicted from. Fail loudly
# rather than write a silently misaligned submission.
assert np.array_equal(
    df_sample_submission["Id"].to_numpy(), df_test_id.to_numpy()
), "sample_submission.csv Ids do not line up with the test set Ids"

df_sample_submission["CO2 Emissions(g/km)"] = stacking_regressor_predictions
df_sample_submission

Unnamed: 0,Id,CO2 Emissions(g/km)
0,54938,341.217268
1,54939,198.226961
2,54940,209.649715
3,54941,230.567111
4,54942,225.848619
...,...,...
23540,78478,213.472628
23541,78479,170.312235
23542,78480,235.434819
23543,78481,228.407778


In [None]:
# Persist the submission without the pandas row index.
df_sample_submission.to_csv(
    path_or_buf="../submit/submission_stacking_regressor.csv",
    index=False,
)