# 30 days of ML @ Kaggle

## About the data

The dataset used for this competition is synthetic (generated using a CTGAN) but based on a real dataset. The original dataset deals with predicting the amount of an insurance claim. Although the features are anonymized, they have properties relating to real-world features.

For this competition, you will be predicting a continuous target based on a number of feature columns given in the data. The feature columns cat0 - cat9 are categorical, and the feature columns cont0 - cont13 are continuous.

Files:
- train.csv - the training data with the target column
- test.csv - the test set; you will be predicting the target for each row in this file
- sample_submission.csv - a sample submission file in the correct format


## Initial setup

In [1]:
import pandas as pd

# Locations of the competition files inside the Kaggle input directory
train_path = '../input/30daysofml/train.csv/train.csv'
test_path = '../input/30daysofml/test.csv/test.csv'

# raw_train: all features + target; raw_test: all features, no target
raw_train, raw_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

## Preprocessing 

In [2]:
# Keep only the rows that have a target value. The target column becomes y,
# and the predictors are every remaining column except the row id and the
# target itself. Each step builds a new frame, so raw_train is left untouched.
X_full = raw_train.dropna(axis=0, subset=['target'])
y = X_full['target']
X_full = X_full.drop(columns=['id', 'target'])

In [3]:
from sklearn.model_selection import train_test_split

# Split the labelled data: 75% of the rows go to training and the rest to
# validation. The fixed random_state keeps the split identical between runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.75,
    random_state=0,
)

In [4]:
# Categorical features: string ("object") columns whose cardinality (number of
# distinct values in the training split) is below 16. The cut-off of 16 is
# convenient but arbitrary.
categorical_cols = [
    col_name
    for col_name, col_values in X_train_full.items()
    if col_values.dtype == "object" and col_values.nunique() < 16
]

In [5]:
# Numerical features: columns stored as 64-bit integers or 64-bit floats
numerical_cols = [
    col_name
    for col_name, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

In [6]:
# Restrict the training, validation and test frames to the selected columns
# (categorical first, then numerical); each result is an independent copy.
my_cols = categorical_cols + numerical_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train_full, X_valid_full, raw_test)
)

## EDA for Feature Engineering

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Exploration works on a copy of the full training frame so raw_train stays
# untouched. The cell displays the distinct levels of each categorical column.
df = raw_train.copy()
[levels.unique() for _, levels in df[categorical_cols].items()]

[array(['B', 'A'], dtype=object),
 array(['B', 'A'], dtype=object),
 array(['B', 'A'], dtype=object),
 array(['C', 'A', 'D', 'B'], dtype=object),
 array(['B', 'C', 'A', 'D'], dtype=object),
 array(['B', 'D', 'C', 'A'], dtype=object),
 array(['A', 'B', 'C', 'H', 'D', 'I', 'G', 'E'], dtype=object),
 array(['E', 'F', 'D', 'B', 'G', 'C', 'A', 'I'], dtype=object),
 array(['C', 'A', 'G', 'E', 'F', 'D', 'B'], dtype=object),
 array(['N', 'O', 'F', 'K', 'M', 'I', 'G', 'H', 'L', 'B', 'A', 'J', 'D',
        'C', 'E'], dtype=object)]

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression

def is_ordinal(col, data=None, target_col='target'):
    """Score how well an ordinal encoding of one categorical column explains the target.

    The column is integer-encoded with ``OrdinalEncoder``, a single-feature
    ``LinearRegression`` is fitted on those codes against the target, and the
    r2 of that fit (measured on the same rows it was fitted on) is printed and
    returned. A higher r2 means a linear trend over the encoded levels explains
    more of the target, which hints that treating the column as ordinal is
    meaningful; an r2 close to zero gives no such evidence.

    NOTE(review): with default settings the integer codes presumably follow the
    sorted category labels, so the score only tests that particular ordering -
    confirm against the OrdinalEncoder documentation.

    Parameters
    ----------
    col : str
        Name of the categorical column to evaluate.
    data : pandas.DataFrame, optional
        Frame holding both ``col`` and ``target_col``. When omitted, the
        notebook-level ``df`` is used, which matches the original behaviour.
    target_col : str, default 'target'
        Name of the target column.

    Returns
    -------
    float
        The r2 score of the linear fit of the target on the ordinal codes.
    """
    # Fall back to the notebook-level exploration frame when no frame is given
    frame = df if data is None else data

    # Double brackets keep both inputs two-dimensional, as the encoder expects
    encoded = OrdinalEncoder().fit_transform(frame[[col]])
    target = frame[[target_col]]

    # Regress the target on the integer codes and read off the r2 of the fit
    reg_OD = LinearRegression().fit(encoded, target)
    score_OD = reg_OD.score(encoded, target)
    print(f'{col} Ordinal r2 score: {score_OD}')

    # Optional visual check (disabled): scatter of the codes against the target
    #plt.scatter(x=encoded, y=target, color='blue', alpha=0.05)
    #plt.title(f'{col}')
    #plt.show()

    return score_OD

The cell below only needs to be run once, for EDA purposes; it is left commented out afterwards.

In [10]:
# EDA only (left disabled): uncomment to print the ordinal-encoding r2 score
# of every selected categorical column.
#for col in categorical_cols:
    #is_ordinal(col)

Guided by intuition, and after inspecting the results of the exploration above, I'll try splitting the categorical columns into ordinal and nominal columns.

In [11]:
# Manual split of the ten categorical features: cat3..cat8 are treated as
# ordinal, and cat0, cat1, cat2 and cat9 as nominal.
ordinal_cols = [f'cat{idx}' for idx in range(3, 9)]
nominal_cols = [f'cat{idx}' for idx in (0, 1, 2, 9)]

In [12]:
# Re-select the feature columns, now ordered as ordinal, nominal, numerical,
# and rebuild the three frames as independent copies.
my_cols = ordinal_cols + nominal_cols + numerical_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train_full, X_valid_full, raw_test)
)

## Hyperparameter tuning of the model

In [13]:
#Importing Packages
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error

In [14]:
#XGBoost hyper-parameter tuning
def hyperParameterTuning(X_train, y_train, param_grid=None, cv=5, scoring=None,
                         n_jobs=-1, verbose=1):
    """Grid-search XGBRegressor hyper-parameters and return the best combination.

    Parameters
    ----------
    X_train : array-like
        Already preprocessed training features (the search is run on a bare
        ``XGBRegressor``, not on a preprocessing pipeline).
    y_train : array-like
        Training target.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. When omitted, the notebook's default
        grid is used: 432 combinations, with ``tree_method`` fixed to
        ``'gpu_hist'``. Pass a custom grid to search a smaller space or to use
        a different tree method.
    cv : int, default 5
        Cross-validation setting passed to ``GridSearchCV``.
    scoring : str or callable, optional
        Metric used to rank the candidates. ``None`` keeps the original
        behaviour (the estimator's own ``score`` method); pass e.g.
        ``'neg_mean_squared_error'`` to rank by MSE instead.
    n_jobs : int, default -1
        Parallelism setting passed to ``GridSearchCV``.
    verbose : int, default 1
        Verbosity passed to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search.
    """
    if param_grid is None:
        # Default search space of the notebook
        param_grid = {
            'learning_rate': [0.02, 0.05, 0.1],
            'max_depth': [3, 5, 7, 10],
            'min_child_weight': [1, 3, 5],
            'subsample': [0.5, 0.7],
            'colsample_bytree': [0.5, 0.7],
            'n_estimators': [100, 200, 500],
            'objective': ['reg:squarederror'],
            'tree_method': ['gpu_hist']
        }

    gsearch = GridSearchCV(estimator=XGBRegressor(),
                           param_grid=param_grid,
                           scoring=scoring,
                           cv=cv,
                           n_jobs=n_jobs,
                           verbose=verbose)

    gsearch.fit(X_train, y_train)

    return gsearch.best_params_

In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [16]:
def _impute_then_encode(encoder):
    """Return a pipeline that imputes with a constant and then applies `encoder`.

    The encoder step is named 'onehot' whichever encoder is passed in, which
    keeps the step names of both categorical pipelines unchanged.
    """
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('onehot', encoder),
    ])


# Numerical data: missing values are replaced by the column median
numerical_transformer = SimpleImputer(strategy='median')

# Ordinal data: constant imputation followed by integer (ordinal) encoding
ordinal_transformer = _impute_then_encode(OrdinalEncoder())

# Nominal data: constant imputation followed by one-hot encoding; categories
# not seen during fit are ignored instead of raising
nominal_transformer = _impute_then_encode(OneHotEncoder(handle_unknown='ignore'))

# Route each group of columns to its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('ord', ordinal_transformer, ordinal_cols),
    ('nom', nominal_transformer, nominal_cols),
])

In [17]:
# Disabled on purpose: run only the first time the kernel is used, to obtain
# the best parameters recorded in the next section. The grid defined in
# hyperParameterTuning has 432 combinations, each cross-validated on 5 folds.
#X_train_trans = preprocessor.fit_transform(X_train)
#hyperParameterTuning(X_train_trans, y_train)

## Best params output

{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 3,
 'n_estimators': 500,
 'objective': 'reg:squarederror',
 'subsample': 0.7,
 'tree_method': 'gpu_hist'}

## Pipeline building

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Numerical data: fill missing values with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Ordinal data: constant imputation, then integer (ordinal) encoding.
# The encoder step keeps the name 'onehot' used throughout the notebook.
ordinal_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OrdinalEncoder()),
]
ordinal_transformer = Pipeline(steps=ordinal_steps)

# Nominal data: constant imputation, then one-hot encoding; categories not
# seen during fit are ignored instead of raising
nominal_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
nominal_transformer = Pipeline(steps=nominal_steps)

# Bundle the three transformers, each applied to its own group of columns
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('ord', ordinal_transformer, ordinal_cols),
    ('nom', nominal_transformer, nominal_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Model definition. These start from the grid-search best parameters but were
# adjusted by hand: n_estimators is 1000 (the search found 500) and
# learning_rate is 0.05 (the search found 0.1).
xgb_params = dict(
    colsample_bytree=0.5,
    max_depth=5,
    min_child_weight=3,
    subsample=0.7,
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=-1,
    tree_method='gpu_hist',
)
model = XGBRegressor(**xgb_params)

# Auxiliary step: the eval_set given to the model is not passed through the
# pipeline's preprocessor, so the validation features are transformed up front.
# The preprocessor is fitted on the training split and applied to X_valid.
X_valid_trans = preprocessor.fit(X_train).transform(X_valid)

# Bundle preprocessing and modeling code in a single pipeline
full_pl = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Keyword arguments routed to the fit() of the 'model' step: monitor the
# transformed validation split and stop after 10 rounds without improvement
model_fit_kwargs = {
    'model__early_stopping_rounds': 10,
    'model__eval_set': [(X_valid_trans, y_valid)],
    'model__verbose': False,
}

# Preprocess the training data and fit the model
full_pl.fit(X_train, y_train, **model_fit_kwargs)

# Preprocess the validation data and get predictions
preds = full_pl.predict(X_valid)

# squared=False asks for the root of the mean squared error.
# NOTE(review): this RMSE is measured on the same validation split that drives
# early stopping, so it may be somewhat optimistic.
print('RMSE:', mean_squared_error(y_valid, preds, squared=False))

RMSE: 0.7223566533464848


## Submission generation

In [19]:
# Predict the competition test set with the fitted pipeline
preds_test = full_pl.predict(X_test)

# Build the submission table: one row per test id with its predicted target.
# NOTE(review): the header is written as 'Id' while the input column is 'id' -
# confirm against sample_submission.csv.
submission_columns = {'Id': raw_test.id, 'target': preds_test}
output = pd.DataFrame(submission_columns)

# Save test predictions to file (without the frame index)
output.to_csv('submission.csv', index=False)



# EDA: one violin plot per categorical column, showing the distribution of the
# target for each level of that column, with quartile lines drawn inside
for col in categorical_cols:
    ax = sns.violinplot(
        x=df[col],
        y=df['target'],
        inner='quartile',
        color='white',
    )
    ax.set(xlabel=col, ylabel='target')
    plt.show()

## Plot results

In [20]:
# Disabled: export of the model's feature importances to a CSV file.
# NOTE(review): the column names are taken from pd.get_dummies(X_valid), whereas
# the model is trained on the ColumnTransformer output (numerical columns, then
# ordinal-encoded columns, then one-hot nominal columns). The names and their
# count may not line up with model.feature_importances_ - confirm before
# re-enabling.
#results=pd.DataFrame()
#results['columns']= pd.get_dummies(X_valid).columns
#results['importances'] = model.feature_importances_
#results.sort_values(by='importances',ascending=False,inplace=True)
#results.to_csv('feature_importances.csv', index=False)

In [21]:
#import numpy as np

In [22]:
# Disabled: for each numerical column, scatter of the real (blue) and predicted
# (red) validation targets on log-transformed axes. Re-enabling it also
# requires the numpy import (np), which is commented out in the previous cell.
#Plot Real vs Predict
#for col in numerical_cols:
    #plt.scatter(np.log(X_valid[col]**4), np.log(y_valid**2), color='blue', label='Real', alpha=0.5)
    #plt.scatter(np.log(X_valid[col]**4), np.log(preds**2), color='red', label='Predict', alpha=0.5)
    #plt.title(col)
    #plt.legend(loc='best')
    #plt.show()