In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
import math
from scipy import stats
# Global matplotlib defaults applied to every figure created after this cell.
# NOTE(review): matplotlib's figsize is given in inches, so (160,80) is extremely
# large - confirm this was not meant to be (16,8).
plt.rc("figure", figsize=(160,80))
plt.rc("font", size=14)

In [2]:
from interpretableai import iai
import pandas as pd
import numpy as np

In [3]:
# Load the three input files.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not
# run elsewhere without editing them.

# Data used to fit the model; its 'loss' column is the response.
training_df=pd.read_csv('/Users/Ben/Desktop/optimal_decision_trees/data/modelling.csv')
# Data scored near the end of the notebook; only its 'id' column and features are read.
kaggle_df=pd.read_csv('/Users/Ben/Desktop/optimal_decision_trees/data/test.csv')
# Data scored and evaluated against its own 'loss' column near the end of the notebook.
holdout_df=pd.read_csv('/Users/Ben/Desktop/optimal_decision_trees/data/holdout.csv')

In [4]:
def return_categoric_columns(df):

    ''' Return the names of the non-numeric ("categoric") columns of df.

    A column counts as numeric when its dtype is one of the dtypes listed in
    numeric_types below; every other column is reported as categoric.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Categoric column names, without repeats, in the order in which they
        appear in df. (A set difference was used previously, which returned
        the names in an arbitrary order that could change between sessions.)
    '''

    numeric_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'uint8']
    numeric_columns = set(df.select_dtypes(include=numeric_types).columns)

    # dict.fromkeys removes repeated names while keeping first-seen order
    return list(dict.fromkeys(
        column for column in df.columns if column not in numeric_columns
    ))

In [5]:
def frequency_bin_cols(df, threshold, columns=None, min_levels=8):

    ''' Group low-frequency levels of high-cardinality columns into 'Other'.

    For every candidate column with at least min_levels distinct values, the
    levels are ranked from most to least frequent. A level is kept while the
    cumulative share of rows (up to and including that level) is <= threshold;
    every remaining level is replaced by the string 'Other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    threshold : float
        Cumulative row share (between 0 and 1) covered by the kept levels.
    columns : iterable of str, optional
        Columns to consider. Defaults to every column of df whose dtype is
        not one of the numeric dtypes listed below, so the function no longer
        depends on a module-level categoric_columns variable.
    min_levels : int, default 8
        Columns with fewer distinct values are left untouched.

    Returns
    -------
    None
    '''

    if columns is None:
        numeric_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'uint8']
        columns = list(df.select_dtypes(exclude=numeric_types).columns)

    n_rows = len(df)

    for name in columns:

        if len(df[name].unique()) < min_levels:
            continue

        # True for the levels that fall inside the cumulative share threshold
        keep_level = (
            df.groupby(name)[name]
            .size()
            .sort_values(ascending=False)
            .div(n_rows)
            .cumsum()
            .le(threshold)
        )

        # rows whose level is not kept are relabelled; missing values map to
        # NaN, which np.where treats as truthy, so they are left as they are
        df[name] = np.where(df[name].map(keep_level), df[name], 'Other')

In [6]:
def frequency_transformer(df1, df2, col):

    ''' Make the levels of df2[col] a subset of the levels of df1[col].

    Every value of df2[col] that does not occur in df1[col] is replaced by
    the string 'Other'. Missing values in df2[col] are also replaced by
    'Other'.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Reference data; not modified.
    df2 : pandas.DataFrame
        Data to align; df2[col] is overwritten in place.
    col : str
        Column present in both frames.

    Returns
    -------
    None
    '''

    # No sorting and no explicit intersection: membership in df1's levels is
    # all that matters, and sorting raised TypeError on columns that mix
    # strings with missing values.
    known_levels = list(df1[col].dropna().unique())

    df2[col] = np.where(df2[col].isin(known_levels), df2[col], 'Other')

In [7]:
def scoring_cols(df1, df2):

    ''' Return the columns of df2 that can be used for modelling against df1.

    The result is made of three parts, in this order:
      * the numeric columns of df2, in df2's column order;
      * the non-numeric columns of df2 whose sorted unique values are
        identical in df1 and df2, sorted by name;
      * the literal column name 'loss'.
    '''

    numeric_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'uint8']
    numeric_columns = df2.select_dtypes(include=numeric_types).columns.to_list()
    non_numeric = set(df2.columns) - set(numeric_columns)

    def levels_match(name):
        # a column aligns when both frames expose exactly the same levels
        return sorted(df1[name].unique()) == sorted(df2[name].unique())

    aligned = sorted(name for name in non_numeric if levels_match(name))

    return numeric_columns + aligned + ['loss']

In [8]:
# scoring_cols=scoring_cols(training_df, kaggle_df)

In [9]:
def training_validation_subset(df, frac=0.7, random_state=None):
    ''' Split df into a random training subset and the complementary validation subset.

    Parameters
    ----------
    df : pandas.DataFrame
    frac : float, default 0.7
        Fraction of rows drawn (without replacement) for the training subset.
    random_state : int or numpy.random.RandomState, optional
        Seed passed to pandas' sampler so the split can be reproduced.

    Returns
    -------
    (training_df, validation_df)
        training_df holds the sampled rows in sampled order; validation_df
        holds every remaining row in its original order.
    '''

    # Sample row positions rather than rows. The validation set used to be
    # derived with concat + drop_duplicates(keep=False), which also removed
    # every row whose values were duplicated inside df itself.
    positions = pd.Series(np.arange(len(df)))
    train_positions = positions.sample(frac=frac, random_state=random_state).to_numpy()

    is_validation = np.ones(len(df), dtype=bool)
    is_validation[train_positions] = False

    training_df = df.iloc[train_positions]
    print('Modelling dataset rows:\t', training_df.shape[0])

    validation_df = df.iloc[is_validation]
    print('Validation dataset rows:\t', validation_df.shape[0])

    return training_df, validation_df

# train_df=training_df.loc[training_df.index<=160000]
# modelling_df, validation_df=training_validation_subset(train_df)

In [10]:
# df=training_df

In [11]:
def categorical_encoding(df):

    ''' Cast every non-numeric column of df to the pandas 'category' dtype.

    df is modified in place and nothing is returned. A column is treated as
    numeric when its dtype is one of the dtypes in numeric_types.
    '''

    numeric_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'uint8']

    # names are collected up front so the loop does not iterate over a frame
    # whose dtypes are changing underneath it
    to_encode = list(df.select_dtypes(exclude=numeric_types).columns)

    for name in to_encode:
        df[name] = df[name].astype('category')

In [12]:
# group up categoric values with many levels
# look for cols with many levels
# group up based on response 
# segment into n levels / groups

# or frequency

# Feature Engineering

In [13]:
def response_outlier_capping(df, variable, multiplier):

    ''' Winsorise df[variable] at a multiple of the inter-quartile range.

    Values at or below Q1 - multiplier * IQR are set to that lower fence and
    values at or above Q3 + multiplier * IQR are set to that upper fence.
    The column is overwritten in place and the same df is returned.
    '''

    first_quartile, third_quartile = np.percentile(df[variable], [25, 75])
    inter_quartile_range = third_quartile - first_quartile
    lower_fence = first_quartile - (inter_quartile_range * multiplier)
    upper_fence = third_quartile + (inter_quartile_range * multiplier)

    # cap the low tail first, then the high tail, re-reading the column
    # in between so the second pass sees the already-capped values
    values = df[variable]
    df[variable] = np.where(values <= lower_fence, lower_fence, values)
    values = df[variable]
    df[variable] = np.where(values >= upper_fence, upper_fence, values)

    return df

def log_response(df, response):

    ''' Replace df[response] by its natural log and report the skewness change.

    The skewness of the column is printed before and after the transform.
    The column is overwritten in place and the same df is returned. Values
    follow np.log, so non-positive inputs become -inf / NaN rather than
    raising.
    '''

    print('Skewness of untransformed response:\t' + str(df[response].skew()))

    # The former "shift to ensure +ve" locals were never used, and their
    # math.ceil(min(abs(...))) could raise depending on row order, so they
    # have been removed; the stored result is the plain log.
    df[response] = np.log(df[response])
    print('Skewness of transformed response:\t' + str(df[response].skew()))

    return df

# Cap the training response 'loss' at 2.2 inter-quartile ranges beyond the quartiles.
# log_response is defined above but not called here; the log is taken later when y is built.
training_df=response_outlier_capping(training_df, 'loss', 2.2)

In [14]:
# Names of the non-numeric columns of the training data.
categoric_columns=return_categoric_columns(training_df)
# Relabel low-frequency levels as 'Other' (cumulative share threshold 0.8); mutates training_df.
frequency_bin_cols(training_df, 0.8)
# Cast the non-numeric columns to the 'category' dtype; mutates training_df.
categorical_encoding(training_df)

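We will use a `GridSearch` to fit an `OptimalTreeRegressor`. First we separate the features from the log-transformed response and split the data into training and test sets: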

In [15]:
# Column order of the training frame (still includes 'loss' and 'id');
# reused further down to select the holdout columns.
cols=list(training_df.columns)
# Features: every training column except the response 'loss' and 'id'.
X = training_df.loc[:, ~training_df.columns.isin(['loss', 'id'])]
# Response on the log scale, computed from the capped 'loss'; predictions are
# exponentiated further down to get back to the original scale.
y = np.log(training_df['loss'])

In [16]:
# Split features and log-response into train and test parts with
# train_proportion=0.90; seed=1 is passed to make the split repeatable.
(train_X, train_y), (test_X, test_y) = iai.split_data('regression', X, y,
                                                      train_proportion=0.90,
                                                      seed=1)

In [17]:
# We need to choose a starting value for regression_lambda. You can either use the default value, or find a good starting estimate yourself.
# One approach to doing this yourself cheaply is to validate over regression_lambda with max_depth fixed to zero - this is effectively just fitting a linear regression to the data and allows you to find a good baseline level of regularization:

# grid = iai.GridSearch(
#     iai.OptimalTreeRegressor(
#         random_seed=2,
#         max_depth=0,
#         #regression_sparsity=:all,
#     ),
#     regression_lambda=[0.0001, 0.001, 0.01, 0.1],
# )
# iai.fit!(grid, X, y)
# starting_lambda = IAI.get_best_params(grid)[:regression_lambda]

In [18]:
# Using the starting estimate from Step 1 for regression_lambda, we now tune max_depth:

# grid = IAI.GridSearch(
#     IAI.OptimalTreeRegressor(
#         random_seed=1,
#         regression_sparsity=:all,
#         regression_lambda=starting_lambda,
#     ),
#     max_depth=1:3,
# )
# IAI.fit!(grid, X, y)
# best_depth = IAI.get_best_params(grid)[:max_depth]

In [19]:
# Finally, we fix max_depth to the value found in Step 2, and tune regression_lambda to get the final result:

# grid = IAI.GridSearch(
#     IAI.OptimalTreeRegressor(
#         random_seed=1,
#         max_depth=best_depth,
#         regression_sparsity=:all,
#     ),
#     regression_lambda=[0.0001, 0.001, 0.01, 0.1],
# )
# IAI.fit!(grid, X, y)
# IAI.get_best_params(grid)

In [None]:
# Tree depth handed to the grid search; also embedded in the output file names below.
maxdepth=9
# Grid search over a single max_depth value, so no other parameter is varied here.
grid = iai.GridSearch(
    iai.OptimalTreeRegressor(
        random_seed=123,
    ),
    max_depth=maxdepth,
)
# Fit on the training split only; train_y is the log of the capped loss.
grid.fit(train_X, train_y) # https://docs.interpretable.ai/stable/OptimalTrees/tuning/
# Last expression of the cell, so its value is displayed
# (presumably the fitted learner - see the IAI documentation).
grid.get_learner()

# range(9, maxdepth),

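We can make predictions on new data using `predict`. The model was fitted on the log of the response, so predictions are exponentiated before the mean absolute error is computed: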

In [None]:
# train_y / test_y are on the log scale (y = np.log(loss)), so both the
# predictions and the targets are exponentiated: the MAE is then reported in
# original loss units instead of comparing exp(prediction) with log(loss).

# Train Result
y_pred=np.exp(grid.predict(train_X))
result=mean_absolute_error(np.exp(train_y), y_pred)
print('train result: ', result)

# Test Result
y_pred=np.exp(grid.predict(test_X))
result=mean_absolute_error(np.exp(test_y), y_pred)
print('test result: ', result)

# Scoring and Save

In [None]:
# Names of the non-numeric columns of the holdout data.
holdout_cats=return_categoric_columns(holdout_df)

# Replace holdout levels that do not occur in the training data with 'Other'
# (holdout_df is modified in place).
# NOTE(review): training columns with fewer than 8 levels were never binned, so
# 'Other' may be a level the model has not seen for them - confirm this is handled.
for i in holdout_cats:
    frequency_transformer(training_df, holdout_df, i)

In [None]:
# Holdout df
X_test= holdout_df[cols].loc[:, ~holdout_df.columns.isin(['loss', 'id'])]

# Encode
categorical_encoding(X_test)

grid.predict(X_test)

y_pred=np.exp(grid.predict(X_test))
y_valid=holdout_df['loss']
result=mean_absolute_error(y_pred, y_valid)
print('result: ', result)

idx=holdout_df['id']
d = {'id':idx,'loss':y_pred}
out_df=pd.DataFrame(d)
out_df.to_csv(f'/Users/Ben/Desktop/optimal_decision_trees/outputs/holdout_odt_predictions_maxdepth{maxdepth}.csv', index=False)

In [None]:
# Names of the non-numeric columns of the kaggle data.
kaggle_cats=return_categoric_columns(kaggle_df)

# Replace kaggle levels that do not occur in the training data with 'Other'.
# Iterate over kaggle_cats: the loop previously reused holdout_cats, so the
# list computed above was never used.
for i in kaggle_cats:
    frequency_transformer(training_df, kaggle_df, i)

In [None]:
# Kaggle df
X_kaggle= kaggle_df.loc[:, ~kaggle_df.columns.isin(['id'])]

# Encode
categorical_encoding(X_kaggle)

idx=kaggle_df['id']
y_pred=np.exp(grid.predict(X_kaggle))
d = {'id':idx,'loss':y_pred}
out_df=pd.DataFrame(d)
out_df.to_csv(f'/Users/Ben/Desktop/optimal_decision_trees/outputs/kaggle_odt_predictions_maxdepth{maxdepth}.csv', index=False)

# Save model

In [None]:
# Write the fitted grid search to disk in two formats; maxdepth is embedded in the file names.
# NOTE(review): the output directory is an absolute, machine-specific path.
grid.write_html(f'/Users/Ben/Desktop/optimal_decision_trees/outputs/odt_model_maxdepth{maxdepth}.html')
grid.write_json(f'/Users/Ben/Desktop/optimal_decision_trees/outputs/odt_model_maxdepth{maxdepth}.json')
# The line below is a commented-out reminder of how a saved learner is read back
# (it uses the Julia-style IAI.read_json spelling, not the Python one).
#lnr = IAI.read_json("learner.json") https://docs.interpretable.ai/stable/IAIBase/learner/#Parameters