In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
np.set_printoptions(legacy="1.25")
import seaborn as sns
import matplotlib.pyplot as plt
import math
import scipy

from sklearn import set_config
set_config(transform_output='pandas')

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score, 
                             mean_absolute_percentage_error, root_mean_squared_error)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, PolynomialFeatures, FunctionTransformer
from sklearn.tree import DecisionTreeRegressor, plot_tree

import category_encoders as ce
from category_encoders.hashing import HashingEncoder
from category_encoders.ordinal import OrdinalEncoder

from xgboost import XGBRegressor

import joblib

In [None]:
def _clipped_log_rmse(y_true, y_pred):
    """Return the RMSE between ``log1p`` of the targets and of the predictions.

    Both inputs are clipped below at 0 before the transform, so ``log1p`` is
    always defined. No upper bound is applied.
    """
    return root_mean_squared_error(
        np.log1p(np.clip(y_true, a_min=0, a_max=None)),
        np.log1p(np.clip(y_pred, a_min=0, a_max=None)),
    )


# Fit one unconstrained tree purely to learn the deepest depth it reaches;
# that depth bounds the sweep below.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_proc, y_train)
depths = range(1, dt.get_depth() + 1)

# Refit at every max_depth and record train/test error. Rows are collected in
# a list and turned into a DataFrame once, so the columns come out as float64
# rather than the object dtype produced by filling an empty frame via .loc.
depth_rows = []
for depth in depths:
    dt_temp = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dt_temp.fit(X_train_proc, y_train)

    train_pred = dt_temp.predict(X_train_proc)
    test_pred = dt_temp.predict(X_test_proc)
    trainRMSE = _clipped_log_rmse(y_train, train_pred)
    testRMSE = _clipped_log_rmse(y_test, test_pred)

    depth_rows.append({
        "Training log RMSE": trainRMSE,
        "Test log RMSE": testRMSE,
        # Absolute train/test gap at this depth.
        "Distance from training": np.abs(trainRMSE - testRMSE),
    })

depth_df = pd.DataFrame(
    depth_rows,
    index=depths,
    columns=["Training log RMSE", "Test log RMSE", "Distance from training"],
    dtype=float,
)

# Lowest test error first.
depth_df.sort_values(by=["Test log RMSE"])

In [3]:
# Display the parameters of a Lasso estimator constructed with no arguments.
lasso_defaults = Lasso().get_params()
lasso_defaults

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [4]:
# Display the parameters of a Ridge estimator constructed with no arguments.
ridge_defaults = Ridge().get_params()
ridge_defaults


{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.0001}

In [5]:
# Display the parameters of an ElasticNet estimator constructed with no arguments.
elasticnet_defaults = ElasticNet().get_params()
elasticnet_defaults


{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'l1_ratio': 0.5,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}