In [1]:
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn import model_selection
from sklearn import metrics

%load_ext watermark
%watermark -v -p pandas,numpy,matplotlib,seaborn,plotly,sklearn

Python implementation: CPython
Python version       : 3.9.13
IPython version      : 7.28.0

pandas    : 1.5.3
numpy     : 1.24.1
matplotlib: 3.6.3
seaborn   : 0.12.2
plotly    : 5.9.0
sklearn   : 1.1.1



In [2]:
# Read the three competition CSVs (train, test, sample submission) in one pass.
df_train, df_test, sample_sub = map(
    pd.read_csv,
    (
        './kaggle/input/ml-olympiad-waterqualityprediction/train.csv',
        './kaggle/input/ml-olympiad-waterqualityprediction/test.csv',
        './kaggle/input/ml-olympiad-waterqualityprediction/sample_submission.csv',
    ),
)

In [3]:
# Drop the 'id' column from both frames. Rebinding the names (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run once the column is gone;
# the original raised KeyError on a second execution.
df_train = df_train.drop(columns='id', errors='ignore')
df_test = df_test.drop(columns='id', errors='ignore')

In [4]:
# Summary statistics of the training frame, transposed so each column becomes a row.
df_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
featureA,12936.0,116559.2,146502.0,-83749.75,56744.75,75808.38,75808.38,582211.0
featureB,12936.0,4.526278,3.180817,1.064468,4.363288,4.45784,4.45784,25.98219
featureC,12936.0,0.1329723,0.2641483,0.0,0.005717646,0.005717646,0.03240702,0.7291935
featureD,12936.0,150.3411,75.09367,18.14855,122.2994,122.2994,137.9766,511.4759
featureE,12936.0,26.44618,6.665619,3.875411,24.6255,30.83191,30.83191,38.76723
featureF,13000.0,0.2175385,2.380893,0.0,0.0,0.0,0.0,100.0
featureG,12936.0,2335365000000.0,893729800000.0,28871500000.0,1464509000000.0,2806036000000.0,2806036000000.0,3781699000000.0
featureH,12936.0,15.04719,32.00037,0.0,0.0,0.0,0.0,87.15892
featureI,12936.0,50202100.0,24123600.0,530038.0,25867960.0,50941690.0,71176350.0,71176350.0
compositionA,12936.0,0.1649227,0.7432285,0.0,0.0,0.0,0.0,4.0


In [5]:
def plot_distributions_numerical(df, cols=None):
    """Draw one histogram per numerical column on a two-column grid of axes.

    Each subplot title reports the column's number of unique values and its
    skewness.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to plot.
    cols : sequence of str, optional
        Columns to plot. When omitted, the notebook-level ``numerical_cols``
        list is used, which matches the original behaviour.

    Raises
    ------
    KeyError
        If a requested column is missing from ``df``. Errors are no longer
        swallowed by a bare ``except``.
    """
    if cols is None:
        cols = numerical_cols
    cols = list(cols)
    if not cols:
        # Nothing to draw; avoid creating an empty figure.
        return

    # Ceil division: exactly enough rows for a two-column layout.
    n_rows = (len(cols) + 1) // 2
    fig, axs = plt.subplots(n_rows, 2, figsize=(12, 18), squeeze=False)
    flat_axes = axs.flatten()

    for col, ax in zip(cols, flat_axes):
        ax.set_title(f'# of unique: {df[col].nunique()}, skew: {df[col].skew():.3f}')
        sns.histplot(data=df, x=col, ax=ax)

    # Hide the spare axis left over when the number of columns is odd.
    for ax in flat_axes[len(cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    
def plot_distributions_categoric(df, cols=None):
    """Draw one count plot per categorical column on a two-column grid of axes.

    Each subplot title reports the column's number of unique values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to plot.
    cols : sequence of str, optional
        Columns to plot. When omitted, the notebook-level ``categoric_cols``
        list is used, which matches the original behaviour.

    Raises
    ------
    KeyError
        If a requested column is missing from ``df``. Errors are no longer
        swallowed by a bare ``except``.
    """
    if cols is None:
        cols = categoric_cols
    cols = list(cols)
    if not cols:
        # Nothing to draw; avoid creating an empty figure.
        return

    # Ceil division: exactly enough rows for a two-column layout.
    n_rows = (len(cols) + 1) // 2
    fig, axs = plt.subplots(n_rows, 2, figsize=(12, 18), squeeze=False)
    flat_axes = axs.flatten()

    for col, ax in zip(cols, flat_axes):
        ax.set_title(f'# of unique: {df[col].nunique()}')
        sns.countplot(data=df, x=col, ax=ax)

    # Hide the spare axis left over when the number of columns is odd.
    for ax in flat_axes[len(cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [6]:
from sklearn.preprocessing import RobustScaler

In [7]:
# Split the predictors (every column except 'result') into numeric and object-typed names.
numerical_cols = df_train.drop(columns='result').select_dtypes(include=np.number).columns.tolist()
categoric_cols = df_train.drop(columns='result').select_dtypes(include='object').columns.tolist()

In [8]:
# Fit a RobustScaler on the numerical columns and overwrite them with the scaled values.
# The fitted scaler stays available as `scalar`.
scalar = RobustScaler()
df_train[numerical_cols] = scalar.fit_transform(df_train[numerical_cols])

In [9]:
# For every categorical column, remember its (up to) 50 most frequent values.
mapping_dict = {
    col: df_train[col].value_counts().head(50).index.to_list()
    for col in categoric_cols
}

In [10]:
# Keep only the values recorded in mapping_dict; everything else becomes 'unk'.
for col in categoric_cols:
    is_known = df_train[col].isin(mapping_dict[col])
    df_train[col] = df_train[col].where(is_known, 'unk')

In [11]:
# Record each numerical column's mean, then use that stored value to fill its missing entries.
mean_dict = {}

for col in numerical_cols:
    mean_dict[col] = df_train[col].mean()
    df_train[col] = df_train[col].fillna(mean_dict[col])

mean_dict

{'featureA': 2.137624038777999,
 'featureB': 0.723802786075369,
 'featureC': 4.767990467791176,
 'featureD': 1.7886991614253678,
 'featureE': -0.7066451323952747,
 'featureF': 0.21753846153846154,
 'featureG': -0.35084770054678754,
 'featureH': 15.047185912375056,
 'featureI': -0.01632358483729607,
 'compositionA': 0.16492269635126777,
 'compositionB': -0.5495855634601781,
 'compositionC': -0.28584067252311657,
 'compositionD': 0.6684984443636874,
 'compositionE': 0.019061147186147206,
 'compositionF': 0.17869820655534943,
 'compositionG': 1.01215793135436,
 'compositionH': 0.6650750936465222,
 'compositionI': 0.22566905610850635,
 'compositionJ': -0.9612786023500299}

In [12]:
# Recompute the list of object-typed predictor columns (every column except 'result').
categoric_cols = df_train.drop(columns='result').select_dtypes(include='object').columns.tolist()

In [13]:
# Preview one column as an (n, 1) array.
# NOTE(review): `col` is whatever name the last executed `for col in ...` loop left behind.
np.reshape(np.array(df_train[col].values), (-1, 1))

array([[ 0.        ],
       [ 0.        ],
       [ 0.        ],
       ...,
       [ 0.        ],
       [16.20909091],
       [16.20909091]])

In [14]:
from sklearn.preprocessing import OrdinalEncoder

# Remove three categorical columns, then refresh the list of remaining object-typed predictors.
df_train.drop(columns=['categoryB', 'categoryD', 'categoryF'], inplace=True)
categoric_cols = df_train.drop(columns='result').select_dtypes(include='object').columns.tolist()

# One OrdinalEncoder per column; values unseen at fit time will be encoded as -1.
# Each fitted encoder is kept in encoder_dict under its column name.
encoder_dict = {}

for col in categoric_cols:
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df_train[col] = enc.fit_transform(df_train[col].to_numpy().reshape(-1, 1))
    encoder_dict[col] = enc

In [15]:
# Hold out 20% of the rows for validation. random_state pins the shuffle so the split
# (and every score computed from it) is reproducible across kernel restarts; the original
# call produced a different split on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_train.drop('result', axis=1),
    df_train['result'],
    test_size=0.2,
    random_state=42,
)

In [16]:
def RMLSE(pred, true):
    """Root mean squared logarithmic error between two sets of values.

    Computes ``sqrt(mean((log(1 + pred) - log(1 + true)) ** 2))``. The metric is
    symmetric in its two arguments, so the argument order does not affect the
    result.

    Parameters
    ----------
    pred, true : array-like
        Values to compare, element by element. Any array-like is accepted
        (lists, NumPy arrays, pandas Series). Inputs are converted to plain
        float arrays first, so pandas objects are compared by position and not
        by index label.

    Returns
    -------
    numpy.float64
        The error value. It is NaN when any input is below -1, where the
        logarithm is undefined.
    """
    pred = np.asarray(pred, dtype=float)
    true = np.asarray(true, dtype=float)
    # log1p(x) is the numerically accurate form of log(x + 1) for small x.
    log_diff = np.log1p(pred) - np.log1p(true)
    return np.sqrt(np.mean(log_diff ** 2))

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
from sklearn import linear_model
from sklearn import ensemble
from xgboost import XGBRegressor, XGBRFRegressor

In [19]:
# Wrap RMLSE as a scikit-learn scorer. greater_is_better=False marks it as a loss,
# so the scorer reports the negated value.
rmlse_metrics = metrics.make_scorer(
    score_func=RMLSE,
    greater_is_better=False,
)

In [22]:
# Hyper-parameter grid: 5 x 3 x 4 = 60 candidate combinations.
# NOTE(review): per the scikit-learn docs, "absolute_error" trains significantly slower
# than "squared_error", and "poisson" requires non-negative targets — confirm both are wanted.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'criterion': ["squared_error", "absolute_error", "poisson"],
    'max_depth': [5, 10, 15, 20],
#     'min_samples_split': [2, 3, 4, 5],
}

# Seed the forest so repeated searches on the same data yield the same scores.
model = ensemble.RandomForestRegressor(random_state=42)

# 5-fold cross-validated search over the grid, scored with the RMLSE scorer, using all cores.
reg = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5,
                   scoring=rmlse_metrics, verbose=10)

In [23]:
# Run the grid search on the training split.
reg.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


KeyboardInterrupt: 

In [24]:
# Same grid-search fit as the previous cell, executed again on the training split.
reg.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


KeyboardInterrupt: 