In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import pprint

import gc

from scipy.stats.mstats import winsorize

import matplotlib.pyplot as plt
import missingno as msno

import seaborn as sns
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from mpl_toolkits.mplot3d import Axes3D

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.metrics import mean_squared_error, make_scorer, classification_report

from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
pp = pprint.PrettyPrinter(indent=2)

# Importing Data

In [None]:
df = pd.read_csv('/kaggle/input/playground-series-s3e20/train.csv')
df

In [None]:
df = df.drop('ID_LAT_LON_YEAR_WEEK', axis=1)

# Exploratory Data Analysis

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
# Create a scatter mapbox plot
fig = px.scatter_mapbox(
    df,
    lat="latitude",
    lon="longitude",
    size="emission",
    color="emission",
    color_continuous_scale='Viridis',
    size_max=20,
    zoom=6
)

# Set mapbox style
fig.update_layout(mapbox_style="open-street-map")

# Update the layout
fig.update_layout(
    title="Emission Data Map Visualization",
    margin={"r": 0, "t": 30, "l": 0, "b": 0},
)

# Show the plot
fig.show()


In [None]:
fig = px.scatter_3d(df, x='latitude', y='longitude', z='emission',
              color='emission')
fig.show()

In [None]:
plt.figure(figsize=(10, 6))


sns.lineplot(
    data = df[['year',  'week_no', 'emission']].groupby(['year', 'week_no'])['emission'].mean().reset_index(), 
    x = 'week_no', 
    y = 'emission',
    hue='year',
    palette=["red", "cyan", "purple"], 
    errorbar = None
)
    
plt.title('Yearly Emission Over Week', fontsize = 20, fontweight = 'bold')
plt.show()

In [None]:
# Top 20 correlated features to the target
top20_corrs = abs(df.corr()['emission']).sort_values(ascending = False).head(20)

corr = df[list(top20_corrs.index)].corr()
plt.figure(figsize = (15, 10))
sns.heatmap(corr, cmap='seismic', annot = True, center = 0)
plt.title('Top 20 features Correlation', fontsize = 20, color = 'black')
plt.show()

# Eliminating Columns with High Null Values

In [None]:
msno.matrix(df)

**Some of the columns are almost entirely empty, so let's drop every column in which 40% or more of the values are missing.**

In [None]:
def remove_high_missing_vals_cols(df, threshold=40):
    '''Keep only the columns whose share of missing values is below ``threshold``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to filter. It is not modified.
    threshold : float, default 40
        Upper bound, in percent, on the share of missing values a column may
        have to be kept. Columns at or above this share are dropped.

    Returns
    -------
    tuple of (pd.DataFrame, list)
        A copy of ``df`` restricted to the retained columns, and the list of
        retained column labels in their original order.
    '''
    # Share of missing values per column, expressed in percent.
    missing_pct = (df.isnull().sum() / df.shape[0]) * 100

    kept_cols = [col for col, pct in missing_pct.items() if pct < threshold]

    # Explicit copy so later column assignments on the result never act on
    # (or warn about) a slice of the caller's frame.
    return df[kept_cols].copy(), kept_cols

In [None]:
df, filtered_columns = remove_high_missing_vals_cols(df)
df

**7 columns were removed.**

# Independent Features Correlation

In [None]:
corr = df.loc[:, 'SulphurDioxide_SO2_column_number_density':'Cloud_solar_zenith_angle'].corr()

# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(18, 12))
sns.heatmap(corr, mask=mask, cmap='seismic', annot=False, center=0)
plt.title('Features Correlation matrix', fontsize=20, color='black')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()

# Removing Highly Correlated Independent Features

<span style='color:red'><b>Reducing dimensionality is crucial in tackling the curse of dimensionality, a phenomenon where datasets with a high number of features or dimensions suffer from increased computational complexity and decreased predictive power. As the number of features grows, data becomes sparse, leading to inefficient storage, processing, and analysis. Additionally, the curse of dimensionality can cause overfitting in machine learning models due to limited data points in high-dimensional spaces. By reducing dimensionality through techniques like feature selection or extraction, we retain the most relevant information while discarding noise and redundant features. This not only enhances computational efficiency but also improves model generalization by mitigating the risk of overfitting. Ultimately, dimensionality reduction aids in maintaining a balance between the richness of data representation and the challenges posed by the curse of dimensionality.</b></span>

In [None]:
def filter_corr_independent_features(dataframe: pd.DataFrame, exclude_features: list, threshold: float, dependent_feature=None, how='corr_with_target'):
    '''
    Drop one feature out of every pair whose mutual correlation exceeds ``threshold``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame. It is not modified.
    exclude_features : list
        Columns that take no part in the filtering and are always kept.
    threshold : float
        A pair is treated as redundant when its correlation is strictly
        greater than this value (only positive correlation is considered).
    dependent_feature : label, optional
        Name of the target column. ``None`` means ``dataframe`` holds
        features only (it is X).
    how : {'corr_with_target', 'max_non_null'}
        Rule deciding which feature of a redundant pair is dropped:
        'corr_with_target' drops the one with the smaller absolute
        correlation with the target; 'max_non_null' drops the one with fewer
        non-null values. Without a target, 'max_non_null' is always used.
        On a tie the second feature of the pair is dropped.

    Returns
    -------
    pd.DataFrame
        Excluded columns, then the surviving features, then the target
        column (when one was given).

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported strategies.
    '''
    if how not in ('corr_with_target', 'max_non_null'):
        raise ValueError(f"Unknown how={how!r}; expected 'corr_with_target' or 'max_non_null'.")

    has_target = dependent_feature is not None
    excluded_df = dataframe[exclude_features]
    target = dataframe[dependent_feature] if has_target else None

    labels_to_drop = list(exclude_features)
    if has_target:
        labels_to_drop.append(dependent_feature)
    features_df = dataframe.drop(columns=labels_to_drop)

    corr_matrix = features_df.corr()
    # Use the matrix's own labels: corr() may leave out non-numeric columns,
    # so positions in features_df.columns are not guaranteed to line up.
    corr_labels = list(corr_matrix.columns)

    rank_by_target = has_target and how == 'corr_with_target'
    score_cache = {}

    def _score(name):
        # Higher score == more worth keeping; each feature is scored once.
        if name not in score_cache:
            if rank_by_target:
                score_cache[name] = abs(features_df[name].corr(target))
            else:
                score_cache[name] = features_df[name].notna().sum()
        return score_cache[name]

    to_remove_features = []
    removed = set()
    for row_name, row_series in corr_matrix.iterrows():
        for col_name, corr in zip(corr_labels, row_series):
            if row_name == col_name or row_name in removed or col_name in removed:
                continue
            if corr > threshold:
                # Anything that is not strictly worse (ties, NaN scores)
                # costs the second feature of the pair.
                loser = row_name if _score(row_name) < _score(col_name) else col_name
                to_remove_features.append(loser)
                removed.add(loser)

    print(f"{len(to_remove_features)} features removed.")
    filtered_df = features_df.drop(columns=to_remove_features)

    parts = [excluded_df, filtered_df]
    if has_target:
        parts.append(target)
    return pd.concat(parts, axis=1)

In [None]:
df = filter_corr_independent_features(
        dataframe = df, 
        exclude_features = ['latitude', 'longitude', 'year', 'week_no'], 
        threshold = 0.8, 
        dependent_feature = 'emission'
)

In [None]:
df

In [None]:
print('Current Columns:\n')
pp.pprint(list(df.columns))

# Handling Outliers

In [None]:
print("Before Handling Outliers:\n")
pp.pprint(dict(df.skew()))

In [None]:
def winsorize_with_pandas(s, limits):
    """
    Clip a Series to the values found at two of its own quantiles.

    s : pd.Series
        Series to winsorize
    limits : tuple of float
        Quantile levels between 0. and 1: ``limits[0]`` is the level of the
        lower bound and ``limits[1]`` the level of the upper bound,
        e.g. (0.05, 0.95). Both bounds are actual observations, because the
        lower one uses 'lower' and the upper one 'higher' interpolation.
    """
    floor = s.quantile(limits[0], interpolation='lower')
    ceiling = s.quantile(limits[1], interpolation='higher')
    return s.clip(lower=floor, upper=ceiling)

# Columns that must NOT be winsorized: the location/time columns and the
# target. The submission transformer clips only the sensor feature columns of
# the test set, so training has to apply the same preprocessing; clipping the
# target would additionally cap what the model can learn and is scored against.
columns_not_to_winsorize = ['latitude', 'longitude', 'year', 'week_no', 'emission']

# List of columns to Winsorize
columns_to_winsorize = [col for col in df.columns if col not in columns_not_to_winsorize]

# Apply Winsorization to each specified column separately
for col in columns_to_winsorize:
    df[col] = winsorize_with_pandas(df[col], (0.05, 0.95))

print("After Handling Outliers:\n")
pp.pprint(dict(df.skew()))

**All features seem alright except emission.**

In [None]:
plt.hist(df['emission'])
plt.title('Emission histogram')
plt.show()

In [None]:
# df['emission'][ df['emission'] > 3000 ]

# Handling Missing Values

In [None]:
msno.matrix(df)

<span style='color:red'><b>If the skewness is between -0.5 and 0.5, the data are nearly symmetrical. If the skewness is between -1 and -0.5 (negatively skewed) or between 0.5 and 1 (positively skewed), the data are moderately skewed; beyond ±1 they are highly skewed. Based on this, if a column is skewed (left or right), I will replace its missing values with the median, and for columns that are nearly symmetrical I'll replace the missing values with the mean.</b></span>

In [None]:
def missing_values_imputer(dataframe):
    """
    Fill missing values column by column and report what was done.

    Numeric columns whose skewness lies in [-0.5, 0.5] are filled with their
    mean, all other numeric columns with their median, and non-numeric
    columns with their most frequent value. A non-numeric column that holds
    no values at all has no mode and is left untouched.

    dataframe : pd.DataFrame
        Frame to impute. It is not modified.

    Returns a new pd.DataFrame and prints one summary line per strategy.
    """
    df = dataframe.copy()

    symmetrical_cols_counter = 0
    skewed_cols_counter = 0
    categorical_cols_counter = 0

    for col in df.columns:

        if not df[col].isnull().any():
            continue

        if pd.api.types.is_numeric_dtype(df[col]):
            skewness = df[col].skew()

            if -0.5 <= skewness <= 0.5:  # Symmetrical case
                df[col] = df[col].fillna(df[col].mean())
                symmetrical_cols_counter += 1
            else:  # Skewed case (a NaN skewness also ends up here)
                df[col] = df[col].fillna(df[col].median())
                skewed_cols_counter += 1

        else:  # Categorical column
            modes = df[col].mode()
            if modes.empty:
                # Entirely-null column: there is nothing to impute from.
                continue
            df[col] = df[col].fillna(modes.iloc[0])
            categorical_cols_counter += 1

    print(f'{symmetrical_cols_counter} symmetrical cols were replaced with mean.')
    print(f'{skewed_cols_counter} skewed cols were replaced with median.')
    print(f'{categorical_cols_counter} categorical cols were replaced with mode.')

    return df

In [None]:
df = missing_values_imputer(df)

# X, y Split

In [None]:
y = df['emission']
X = df.drop('emission', axis=1)

In [None]:
del df
gc.collect()

# Train-Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

# Standard Scaling

In [None]:
# StandardScaler = StandardScaler()
# StandardScaler.fit(X_train)
# X_train_scaled = StandardScaler.transform(X_train)

# Principal Component Analysis

In [None]:
# n_components = 2
# pca = PCA(n_components=n_components)
# pca.fit(X_train_scaled)
# X_pca = pca.transform(X_train_scaled)

In [None]:
# pca.explained_variance_ratio_

In [None]:
# X_pca.shape

In [None]:
# plt.scatter(X_pca[:, 0], X_pca[:, 1])
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.title('PCA Scatter Plot')
# plt.show()

# Creating base models & selecting

In [None]:
def calculate_rmse(y_true, y_pred):
    """
    Root Mean Squared Error between actual and predicted targets.

    Parameters:
    y_true : array-like or list
        The actual target values.
    y_pred : array-like or list
        The predicted target values.

    Returns:
    float
        Square root of scikit-learn's mean_squared_error for the two inputs.
    """
    # Both inputs are turned into NumPy arrays before being scored.
    actual, predicted = np.array(y_true), np.array(y_pred)
    return np.sqrt(mean_squared_error(actual, predicted))

In [None]:
'''Linear Regression'''

# QuantileTransformer estimates its quantiles from a random subsample when the
# training set is larger than its `subsample` size, so the seed is pinned to
# make this baseline reproducible across runs.
LinearRegression_pipeline = Pipeline([
    ('quantile_transformer', QuantileTransformer(random_state=26)),
    ('min_max_scaler', MinMaxScaler()),
    ('linear_regression', LinearRegression())
])

# training the pipeline on the training set
LinearRegression_pipeline.fit(X_train, np.array(y_train))

# predicting on the testing set
y_pred = LinearRegression_pipeline.predict(X_test)

# evaluating the performance
rmse_result = calculate_rmse(y_test, y_pred)

print("Root Mean Squared Error (RMSE):", rmse_result)

In [None]:
'''XGBRegressor'''

# Baseline XGBoost regressor with default hyperparameters; only the
# objective, metric and GPU settings are specified.
# NOTE(review): 'gpu_hist' and 'gpu_id' are reported as deprecated from
# XGBoost 2.0 in favour of tree_method='hist' with device='cuda' — confirm
# against the installed version before upgrading.
params = {
    'objective': 'reg:squarederror',  # specify the objective function
    'eval_metric': 'rmse',  # specify the evaluation metric
    'tree_method': 'gpu_hist',  # use GPU to build trees
    'gpu_id': 0  # specify the GPU device to use
}

# creating an XGBoost regressor
xgb_model = xgb.XGBRegressor(**params)

# fitting the model on the training data
xgb_model.fit(X_train, np.array(y_train))

# predicting on the test data (the 20% hold-out from train_test_split)
y_pred = xgb_model.predict(X_test)

# evaluating the performance
rmse_result = calculate_rmse(y_test, y_pred)

print("Root Mean Squared Error (RMSE):", rmse_result)

In [None]:
'''Decision Tree'''

# Creating a decision tree regressor
tree = DecisionTreeRegressor(random_state=42)

# Fitting the model on the training data
tree.fit(X_train, np.array(y_train))

# Making predictions on the test data
y_pred = tree.predict(X_test)

# evaluating the performance
rmse_result = calculate_rmse(y_test, y_pred)

print("Root Mean Squared Error (RMSE):", rmse_result)

In [None]:
'''Neural network'''

# MLPRegressor initialises its weights randomly and QuantileTransformer may
# subsample rows at random; both seeds are pinned so the reported RMSE is
# reproducible across runs.
NN_pipeline = Pipeline([
    ('transformer', QuantileTransformer(random_state=26)),
    ('min_max_scaler', MinMaxScaler()),
    ('estimator', MLPRegressor(random_state=26))
])

# Fitting the pipeline on the training data
NN_pipeline.fit(X_train, np.array(y_train))

# Predict on the testing data
y_pred = NN_pipeline.predict(X_test)

# evaluating the performance
rmse_result = calculate_rmse(y_test, y_pred)

print("Root Mean Squared Error (RMSE):", rmse_result)

# XGBRegressor - Hyperparameter Tuning

In [None]:
# # Define the XGBoost Regressor
# xgb_model = xgb.XGBRegressor(
#     objective ='reg:squarederror',
#     eval_metric = "rmse",
#     tree_method = 'gpu_hist',  # use GPU to build trees
#     gpu_id = 0  # specify the GPU device to use
# )

# param_grid = {
#     'max_depth': [10, 15, 16, 20],
#     'min_child_weight': [6,7,8,10],
    
#     'learning_rate': [0.01, 0.03, 0.1],
#     'gamma': [0, 0.2],
#     'subsample': [0.2, 0.4, 0.7, 1.0],
#     'colsample_bytree': [0.2, 0.4, 0.7, 1.0],
# }

# grid_search = GridSearchCV(
#     estimator=xgb_model,
#     param_grid=param_grid, 
#     cv=3, 
#     scoring='neg_mean_squared_error', 
#     verbose=4)

# grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_
# best_params

In [None]:
# best_params['objective'] = 'reg:squarederror'
# best_params['eval_metric'] = 'rmse'
# best_params['tree_method'] = 'gpu_hist'
# best_params['gpu_id'] = 0
# best_params

In [None]:
# best_estimator = xgb.XGBRegressor(
#     **best_params
# )
# best_estimator.fit(X_train, y_train)

# # Make predictions using the best estimator
# y_pred = best_estimator.predict(X_test)

# # evaluating the performance
# rmse_result = calculate_rmse(y_test, y_pred)

# print("Root Mean Squared Error (RMSE):", rmse_result)

In [None]:
'''Previous best model'''

# Fixed hyperparameters for the final model. Every tuned value lies inside the
# grid of the (commented-out) GridSearchCV cell above — presumably they are
# the best_params_ of an earlier run; verify before relying on them.
# NOTE(review): 'gpu_hist' / 'gpu_id' are reported as deprecated from
# XGBoost 2.0 (tree_method='hist', device='cuda') — confirm installed version.
params = {
    'colsample_bytree': 1.0,
    'gamma': 0,
    'learning_rate': 0.1,
    'max_depth': 15,
    'min_child_weight': 10,
    'subsample': 1.0,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
#     'alpha': 0.2,  # L1 regularization term (Lasso)
#     'lambda': 0.1  # L2 regularization term (Ridge)
}

# This estimator is reused below for the train/test evaluation and for both
# submission files.
best_estimator = xgb.XGBRegressor(**params)

best_estimator.fit(X_train, y_train)

In [None]:
# Make predictions using the best estimator
y_pred_ = best_estimator.predict(X_train)

# evaluating the performance
rmse_result_ = calculate_rmse(y_train, y_pred_)

print("Train set Root Mean Squared Error (RMSE):", rmse_result_)

In [None]:
# Make predictions using the best estimator
y_pred = best_estimator.predict(X_test)

# evaluating the performance
rmse_result = calculate_rmse(y_test, y_pred)

print("Test set Root Mean Squared Error (RMSE):", rmse_result)

In [None]:
# Train set: true target values against the model's predictions.
plt.figure(figsize=(8, 6))
plt.scatter(y_train.values.reshape(-1), y_pred_.reshape(-1), c='blue', alpha=0.5)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Train set - Scatter Plot of True vs. Predicted Values')
plt.grid(True)
plt.show()

In [None]:
# Hold-out set: true target values against the model's predictions.
plt.figure(figsize=(8, 6))
plt.scatter(y_test.values.reshape(-1), y_pred.reshape(-1), c='blue', alpha=0.5)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Test set - Scatter Plot of True vs. Predicted Values')
plt.grid(True)
plt.show()

# Submission

In [None]:
test_df = pd.read_csv('/kaggle/input/playground-series-s3e20/test.csv')
test_df

In [None]:
def df_transformer_WithWinsorize(dataframe, columns=None, limits=(0.05, 0.95)):
    """
    Prepare a raw frame for prediction: select the model's feature columns,
    winsorize the measured features and impute missing values.

    dataframe : pd.DataFrame
        Raw frame (e.g. the competition test set). It is not modified.
    columns : list, optional
        Feature columns to keep, in the order the model expects. Defaults to
        the feature list hard-coded below.
    limits : tuple of float
        Lower and upper quantile levels used for clipping, between 0. and 1.

    Returns a new pd.DataFrame holding exactly ``columns``, without nulls
    (except for columns that contain no values at all).
    """
    default_columns = [ 'latitude',
                        'longitude',
                        'year',
                        'week_no',
                        'SulphurDioxide_SO2_column_number_density_amf',
                        'SulphurDioxide_cloud_fraction',
                        'SulphurDioxide_SO2_column_number_density_15km',
                        'CarbonMonoxide_CO_column_number_density',
                        'CarbonMonoxide_H2O_column_number_density',
                        'CarbonMonoxide_cloud_height',
                        'CarbonMonoxide_sensor_altitude',
                        'CarbonMonoxide_sensor_azimuth_angle',
                        'CarbonMonoxide_sensor_zenith_angle',
                        'NitrogenDioxide_NO2_column_number_density',
                        'NitrogenDioxide_stratospheric_NO2_column_number_density',
                        'NitrogenDioxide_NO2_slant_column_number_density',
                        'NitrogenDioxide_tropopause_pressure',
                        'NitrogenDioxide_absorbing_aerosol_index',
                        'NitrogenDioxide_cloud_fraction',
                        'NitrogenDioxide_sensor_altitude',
                        'NitrogenDioxide_sensor_azimuth_angle',
                        'NitrogenDioxide_sensor_zenith_angle',
                        'NitrogenDioxide_solar_azimuth_angle',
                        'NitrogenDioxide_solar_zenith_angle',
                        'Formaldehyde_tropospheric_HCHO_column_number_density',
                        'Formaldehyde_tropospheric_HCHO_column_number_density_amf',
                        'Formaldehyde_cloud_fraction',
                        'Formaldehyde_sensor_zenith_angle',
                        'Formaldehyde_sensor_azimuth_angle',
                        'UvAerosolIndex_absorbing_aerosol_index',
                        'UvAerosolIndex_sensor_altitude',
                        'UvAerosolIndex_sensor_azimuth_angle',
                        'Ozone_O3_column_number_density',
                        'Ozone_O3_column_number_density_amf',
                        'Ozone_O3_effective_temperature',
                        'Ozone_solar_zenith_angle',
                        'Cloud_cloud_fraction',
                        'Cloud_cloud_top_pressure',
                        'Cloud_cloud_top_height',
                        'Cloud_cloud_optical_depth',
                        'Cloud_surface_albedo',
                        'Cloud_sensor_azimuth_angle',
                        'Cloud_sensor_zenith_angle']

    # Location/time columns pass through unclipped. For the default list this
    # leaves exactly the sensor columns, from
    # 'SulphurDioxide_SO2_column_number_density_amf' to 'Cloud_sensor_zenith_angle'.
    passthrough_columns = {'latitude', 'longitude', 'year', 'week_no'}

    def winsorize_with_pandas(s, limits):
        """Clip ``s`` to its own quantiles at levels limits[0] and limits[1]."""
        return s.clip(lower=s.quantile(limits[0], interpolation='lower'),
                      upper=s.quantile(limits[1], interpolation='higher'))

    def missing_values_imputer(dataframe):
        """Fill nulls: mean for near-symmetric numeric columns, median for
        skewed ones, most frequent value for non-numeric columns."""
        df = dataframe.copy()

        for col in df.columns:
            if not df[col].isnull().any():
                continue

            if pd.api.types.is_numeric_dtype(df[col]):
                if -0.5 <= df[col].skew() <= 0.5:  # Symmetrical case
                    fill_value = df[col].mean()
                else:  # Negative or positive skewed case
                    fill_value = df[col].median()
            else:  # Categorical column
                modes = df[col].mode()
                if modes.empty:
                    # Entirely-null column: nothing to impute from.
                    continue
                fill_value = modes.iloc[0]

            df[col] = df[col].fillna(fill_value)

        return df

    selected_columns = default_columns if columns is None else list(columns)

    # Selecting the feature columns already leaves the ID column behind; the
    # explicit copy keeps the assignments below off a slice of the caller's
    # frame (no SettingWithCopyWarning).
    df = dataframe[selected_columns].copy()

    # Apply Winsorization to each measured feature separately
    for col in selected_columns:
        if col not in passthrough_columns:
            df[col] = winsorize_with_pandas(df[col], limits)

    return missing_values_imputer(df)
    
#     global StandardScaler
#     df_scaled = StandardScaler.transform(df)
    
#     global pca
#     df_pca = pca.transform(df_scaled)
                
#     return df_pca

In [None]:
def df_transformer_WithoutWinsorize(dataframe, columns=None):
    """
    Prepare a raw frame for prediction without outlier clipping: select the
    model's feature columns and impute missing values.

    dataframe : pd.DataFrame
        Raw frame (e.g. the competition test set). It is not modified.
    columns : list, optional
        Feature columns to keep, in the order the model expects. Defaults to
        the feature list hard-coded below.

    Returns a new pd.DataFrame holding exactly ``columns``, without nulls
    (except for columns that contain no values at all).
    """
    default_columns = [ 'latitude',
                        'longitude',
                        'year',
                        'week_no',
                        'SulphurDioxide_SO2_column_number_density_amf',
                        'SulphurDioxide_cloud_fraction',
                        'SulphurDioxide_SO2_column_number_density_15km',
                        'CarbonMonoxide_CO_column_number_density',
                        'CarbonMonoxide_H2O_column_number_density',
                        'CarbonMonoxide_cloud_height',
                        'CarbonMonoxide_sensor_altitude',
                        'CarbonMonoxide_sensor_azimuth_angle',
                        'CarbonMonoxide_sensor_zenith_angle',
                        'NitrogenDioxide_NO2_column_number_density',
                        'NitrogenDioxide_stratospheric_NO2_column_number_density',
                        'NitrogenDioxide_NO2_slant_column_number_density',
                        'NitrogenDioxide_tropopause_pressure',
                        'NitrogenDioxide_absorbing_aerosol_index',
                        'NitrogenDioxide_cloud_fraction',
                        'NitrogenDioxide_sensor_altitude',
                        'NitrogenDioxide_sensor_azimuth_angle',
                        'NitrogenDioxide_sensor_zenith_angle',
                        'NitrogenDioxide_solar_azimuth_angle',
                        'NitrogenDioxide_solar_zenith_angle',
                        'Formaldehyde_tropospheric_HCHO_column_number_density',
                        'Formaldehyde_tropospheric_HCHO_column_number_density_amf',
                        'Formaldehyde_cloud_fraction',
                        'Formaldehyde_sensor_zenith_angle',
                        'Formaldehyde_sensor_azimuth_angle',
                        'UvAerosolIndex_absorbing_aerosol_index',
                        'UvAerosolIndex_sensor_altitude',
                        'UvAerosolIndex_sensor_azimuth_angle',
                        'Ozone_O3_column_number_density',
                        'Ozone_O3_column_number_density_amf',
                        'Ozone_O3_effective_temperature',
                        'Ozone_solar_zenith_angle',
                        'Cloud_cloud_fraction',
                        'Cloud_cloud_top_pressure',
                        'Cloud_cloud_top_height',
                        'Cloud_cloud_optical_depth',
                        'Cloud_surface_albedo',
                        'Cloud_sensor_azimuth_angle',
                        'Cloud_sensor_zenith_angle']

    def missing_values_imputer(dataframe):
        """Fill nulls: mean for near-symmetric numeric columns, median for
        skewed ones, most frequent value for non-numeric columns."""
        df = dataframe.copy()

        for col in df.columns:
            if not df[col].isnull().any():
                continue

            if pd.api.types.is_numeric_dtype(df[col]):
                if -0.5 <= df[col].skew() <= 0.5:  # Symmetrical case
                    fill_value = df[col].mean()
                else:  # Negative or positive skewed case
                    fill_value = df[col].median()
            else:  # Categorical column
                modes = df[col].mode()
                if modes.empty:
                    # Entirely-null column: nothing to impute from.
                    continue
                fill_value = modes.iloc[0]

            df[col] = df[col].fillna(fill_value)

        return df

    selected_columns = default_columns if columns is None else list(columns)

    # Selecting the feature columns already leaves the ID column behind, so
    # no separate drop is needed; the imputer works on its own copy.
    return missing_values_imputer(dataframe[selected_columns])
    
#     global StandardScaler
#     df_scaled = StandardScaler.transform(df)
    
#     global pca
#     df_pca = pca.transform(df_scaled)
                
#     return df_pca

In [None]:
test_df1 = df_transformer_WithWinsorize(test_df)
test_df2 = df_transformer_WithoutWinsorize(test_df)

In [None]:
test_df1.shape

In [None]:
test_df2.shape

In [None]:
# predicting on the test data
submission_y = best_estimator.predict(test_df1)
submission = pd.read_csv('/kaggle/input/playground-series-s3e20/sample_submission.csv')
submission['emission'] = submission_y
submission.to_csv('submission_with_winsorize.csv', index=False)
submission

In [None]:
# predicting on the test data
submission_y = best_estimator.predict(test_df2)
submission = pd.read_csv('/kaggle/input/playground-series-s3e20/sample_submission.csv')
submission['emission'] = submission_y
submission.to_csv('submission_without_winsorize.csv', index=False)
submission