In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
import math
from scipy import stats
# Global matplotlib defaults for figure size and base font size.
# NOTE(review): figsize is (width, height) in inches, so (160, 80) is extremely
# large -- confirm this was not meant to be something like (16, 8).
plt.rc("figure", figsize=(160,80))
plt.rc("font", size=14)

In [None]:
from interpretableai import iai
import pandas as pd
import numpy as np

In [None]:
# Read the three input CSV files (modelling.csv, test.csv, holdout.csv) into
# training_df, kaggle_df and holdout_df respectively.
training_df, kaggle_df, holdout_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/Ben/Desktop/optimal_decision_trees/data/modelling.csv',
        '/Users/Ben/Desktop/optimal_decision_trees/data/test.csv',
        '/Users/Ben/Desktop/optimal_decision_trees/data/holdout.csv',
    )
]

In [None]:
# # frequency binning
# threshold = 0.7
# m = df['city'].map(df.groupby('city')['city'].size().sort_values(ascending=False).div(len(df)).cumsum().le(threshold))

# df['city'] = np.where(m, df['city'], 'Other')

In [None]:
def scoring_cols(df1, df2):

    ''' function to compare the levels in two dataframes and return the columns which align

        Columns of df2 with an integer/float dtype are always kept. Every other
        column of df2 is treated as categoric and is kept only when df1 has the
        same column with exactly the same set of levels (a missing value counts
        as a level of its own). The target name 'loss' is guaranteed to appear
        exactly once in the result.

        Parameters
        ----------
        df1 : pandas.DataFrame
            reference frame (e.g. the training data)
        df2 : pandas.DataFrame
            frame to be scored; its dtypes decide which columns are numeric

        Returns
        -------
        list
            numeric columns of df2 (in df2 order), then the aligned categoric
            columns (sorted), then 'loss' if it is not already listed
    '''

    # find numeric and categoric columns (df2 column order is preserved)
    numeric_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'uint8']
    numeric_columns = df2.select_dtypes(include=numeric_types).columns.to_list()
    numeric_lookup = set(numeric_columns)
    categoric_columns = [c for c in df2.columns if c not in numeric_lookup]

    def _levels(series):
        # Compare levels as a set plus a "has missing values" flag. Sorting the
        # raw unique values would raise TypeError when NaN/None is mixed with
        # strings, and NaN never compares equal to itself.
        return set(series.dropna().unique()), bool(series.isna().any())

    # keep categoric columns whose levels match; a column absent from df1 cannot align
    cat_cols = sorted(
        c for c in categoric_columns
        if c in df1.columns and _levels(df1[c]) == _levels(df2[c])
    )

    # combine with numerics and add loss (only once, even if df2 already carries it)
    modelling_cols = numeric_columns + cat_cols
    if 'loss' not in modelling_cols:
        modelling_cols.append('loss')

    return modelling_cols

In [None]:
# scoring_cols=scoring_cols(training_df, kaggle_df)

In [None]:
def training_validation_subset(df, frac=0.7, random_state=None):
    ''' function to create training and validation subsets
        chosen this methodology as a method to replicate in the future

        Parameters
        ----------
        df : pandas.DataFrame
            frame to split
        frac : float, default 0.7
            fraction of rows sampled into the training subset
        random_state : int or None, default None
            seed passed to the sampler; set it to make the split repeatable

        Returns
        -------
        (training_df, validation_df)
            training_df holds the sampled rows, validation_df every other row
    '''

    # Sample row POSITIONS rather than comparing row contents: the validation
    # set is then the exact complement even when df contains repeated rows or
    # a non-unique index.
    row_positions = pd.Series(range(len(df)), dtype='int64')
    sampled = row_positions.sample(frac=frac, random_state=random_state)

    training_df = df.iloc[sampled.to_numpy()]
    print('Modelling dataset rows:\t', training_df.shape[0])

    held_out = ~row_positions.isin(sampled)
    validation_df = df.iloc[held_out.to_numpy()]
    print('Validation dataset rows:\t', validation_df.shape[0])

    return training_df, validation_df

# train_df=training_df.loc[training_df.index<=160000]
# modelling_df, validation_df=training_validation_subset(train_df)

In [None]:
# Work on the training data under the shorter name `df`. This binds a second
# name to the same DataFrame object (no copy is made), so the in-place
# categorical encoding applied to `df` below also changes `training_df`.
df=training_df

In [None]:
def categorical_encoding(df):
    ''' function to cast every non-numeric column to the pandas 'category' dtype

        Columns whose dtype is one of the listed integer/float types are left
        untouched. All other columns are overwritten on df itself; nothing is
        returned.
    '''

    # names of the columns that already have a numeric dtype
    numeric_types = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'uint8']
    numeric_names = set(df.select_dtypes(include=numeric_types).columns)

    # everything else is re-typed in place
    for name in list(df.columns):
        if name in numeric_names:
            continue
        df[name] = df[name].astype('category')
        
# Convert every non-numeric column of df to the 'category' dtype (df is modified in place)
categorical_encoding(df)

In [None]:
# group up categoric values with many levels
# look for cols with many levels
# group up based on response 
# segment into n levels / groups

# or frequency

We will use a GridSearch to fit an OptimalTreeRegressor:

In [None]:
# Keep the full column list for later cells
cols = df.columns.tolist()

# Features: every column except the target ('loss') and the 'id' column
is_feature = ~df.columns.isin(['loss', 'id'])
X = df.loc[:, is_feature]

# Target: natural log of the loss
y = np.log(df['loss'])

In [None]:
# Split the features X and the log-scale target y into train and test parts
# for a regression task: 75% of the data goes to the training part, seed fixed at 1.
(train_X, train_y), (test_X, test_y) = iai.split_data('regression', X, y,
                                                      train_proportion=0.75,
                                                      seed=1)

In [None]:
# We need to choose a starting value for regression_lambda. You can either use the default value, or find a good starting estimate yourself.
# One approach to doing this yourself cheaply is to validate over regression_lambda with max_depth fixed to zero - this is effectively just fitting a linear regression to the data and allows you to find a good baseline level of regularization:

# grid = iai.GridSearch(
#     iai.OptimalTreeRegressor(
#         random_seed=2,
#         max_depth=0,
#         #regression_sparsity=:all,
#     ),
#     regression_lambda=[0.0001, 0.001, 0.01, 0.1],
# )
# iai.fit!(grid, X, y)
# starting_lambda = IAI.get_best_params(grid)[:regression_lambda]

In [None]:
# Using the starting estimate from Step 1 for regression_lambda, we now tune max_depth:

# grid = IAI.GridSearch(
#     IAI.OptimalTreeRegressor(
#         random_seed=1,
#         regression_sparsity=:all,
#         regression_lambda=starting_lambda,
#     ),
#     max_depth=1:3,
# )
# IAI.fit!(grid, X, y)
# best_depth = IAI.get_best_params(grid)[:max_depth]

In [None]:
# Finally, we fix max_depth to the value found in Step 2, and tune regression_lambda to get the final result:

# grid = IAI.GridSearch(
#     IAI.OptimalTreeRegressor(
#         random_seed=1,
#         max_depth=best_depth,
#         regression_sparsity=:all,
#     ),
#     regression_lambda=[0.0001, 0.001, 0.01, 0.1],
# )
# IAI.fit!(grid, X, y)
# IAI.get_best_params(grid)

In [None]:
# Base learner; its max_depth is the parameter searched over below
tree_learner = iai.OptimalTreeRegressor(random_seed=123)

# Candidate depths 1 through 9
grid = iai.GridSearch(tree_learner, max_depth=range(1, 10))

# https://docs.interpretable.ai/stable/OptimalTrees/tuning/
grid.fit(train_X, train_y)
grid.get_learner()

We can make predictions on new data using predict:

In [None]:
# Both errors are reported on the original loss scale. The model is fitted to
# log(loss), so the predictions AND the targets are mapped back with exp before
# scoring; scoring exp(prediction) against the log-scale target would mix two
# different scales. Arguments follow the (y_true, y_pred) order.

# Train Result
y_pred=np.exp(grid.predict(train_X))
result=mean_absolute_error(np.exp(train_y), y_pred)
print('train result: ', result)

# Test Result
y_pred=np.exp(grid.predict(test_X))
result=mean_absolute_error(np.exp(test_y), y_pred)
print('test result: ', result)

# Scoring and Save

In [None]:
# Holdout df
# Select the training feature columns by name (training column order, minus the
# target and id). A boolean mask built from holdout_df's own column order only
# lines up when both frames share exactly the same column layout.
feature_cols = [c for c in cols if c not in ('loss', 'id')]
# Explicit copy so the in-place encoding below works on an independent frame
X_test = holdout_df[feature_cols].copy()

# Encode
categorical_encoding(X_test)

# Predict once on the log scale and map back to the loss scale
y_pred=np.exp(grid.predict(X_test))
y_valid=holdout_df['loss']
result=mean_absolute_error(y_valid, y_pred)
print('result: ', result)

# Save id / predicted loss pairs
idx=holdout_df['id']
d = {'id':idx,'loss':y_pred}
out_df=pd.DataFrame(d)
out_df.to_csv('/Users/Ben/Desktop/optimal_decision_trees/outputs/holdout_odt_predictions.csv', index=False)

In [None]:
# Kaggle df
# Select exactly the columns the model was trained on, in training order
# (all of `cols` except the target and id). A training column missing from
# kaggle_df raises a KeyError here instead of reaching the model unnoticed.
feature_cols = [c for c in cols if c not in ('loss', 'id')]
# Explicit copy so the in-place encoding below works on an independent frame
X_kaggle = kaggle_df[feature_cols].copy()

# Encode
categorical_encoding(X_kaggle)

# Predict on the log scale, map back to the loss scale and save id / loss pairs
idx=kaggle_df['id']
y_pred=np.exp(grid.predict(X_kaggle))
d = {'id':idx,'loss':y_pred}
out_df=pd.DataFrame(d)
out_df.to_csv('/Users/Ben/Desktop/optimal_decision_trees/outputs/kaggle_odt_predictions.csv', index=False)

# Save model

In [None]:
# Write the fitted grid search to disk twice: once as an HTML file and once as a JSON file
grid.write_html('/Users/Ben/Desktop/optimal_decision_trees/outputs/odt_model.html')
grid.write_json('/Users/Ben/Desktop/optimal_decision_trees/outputs/odt_model.json')
#lnr = IAI.read_json("learner.json") https://docs.interpretable.ai/stable/IAIBase/learner/#Parameters