In [None]:
%load_ext autoreload
%autoreload 2

from sklearn.utils import shuffle
import plotly.express as px

import numpy as np
import pandas as pd
import pandera as pa
pd.options.display.max_rows = 100
pd.options.display.max_columns = 200

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("../")
import src


# Explore and Prepare Data

In [None]:
# --------------------------------
# CREATE NEW (EMPTY) MODEL
# --------------------------------
# Instantiate the project's ClassifierModel. Later cells attach the input data,
# the settings dict and the preprocessor to this object as attributes.
model = src.ClassifierModel()

In [None]:
# Load the raw training data and shuffle the rows.
# A fixed seed keeps the row order identical across kernel restarts, so every
# later step sees the same data in the same order.
RANDOM_SEED = 42
model.input_data = shuffle(pd.read_csv('../data/aug_train.csv'), random_state=RANDOM_SEED)
model.input_data.head(5)

In [None]:
# Summary statistics of the loaded data (pandas describe() with default arguments).
model.input_data.describe()

In [None]:
# Role of every input column; later cells read these lists by key.
# TODO: also record nullable columns and the expected dtype per column.
model.model_settings = {
    'numerical_columns': [
        'Age',
        'Annual_Premium',
        'Vintage',
    ],
    'categorical_columns': [
        'Gender',
        'Driving_License',
        'Region_Code',
        'Previously_Insured',
        'Vehicle_Age',
        'Policy_Sales_Channel',
        'Vehicle_Damage',
    ],
    'id_columns': ['id'],
    'label_column': ['Response'],
}

In [None]:
# Count missing values per column, then report them.
null_counts = model.input_data.isna().sum()
print(f"null values for all columns:\n----------------------- \n{null_counts}")

In [None]:
# List the distinct values found in each categorical column.
print("categories for categorical columns \n ------------------------------")
for cat_col in model.model_settings['categorical_columns']:
    distinct_values = model.input_data[cat_col].unique()
    print(f"{cat_col}: {distinct_values} \n _____")

In [None]:
# CREATE PANDERA INPUT SCHEMA
# Every column maps to pa.Column(<pandera dtype>). To support a new column,
# add it to `schema_columns` (or to `categorical_names` if it is categorical).
# NOTE(review): the type-fixing cell casts id/Age/Vintage to pandas' nullable
# Int64 -- confirm that pa.Int accepts that dtype when validating.
categorical_names = [
    'Gender',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Damage',
    'Vehicle_Age',
    'Policy_Sales_Channel',
]

# Identifier and numerical features.
schema_columns = {
    'id': pa.Column(pa.Int),
    'Age': pa.Column(pa.Int, nullable=True),
    'Annual_Premium': pa.Column(pa.Float, nullable=True),
    'Vintage': pa.Column(pa.Int, nullable=True),
}

# Categorical features all share the same nullable Category column spec.
for name in categorical_names:
    schema_columns[name] = pa.Column(pa.Category, nullable=True)

# Label column.
schema_columns['Response'] = pa.Column(pa.Int)

input_schema = pa.DataFrameSchema(schema_columns, strict=True)

In [None]:
# DROP ROWS WITH NULLS IN COLUMNS THAT CAN'T BE NULLABLE (ids and label)
required_columns = model.model_settings['id_columns'] + model.model_settings['label_column']
print(f"original input shape: {model.input_data.shape}")
# Reassign instead of using inplace=True so the cell does not mutate the
# previously displayed frame object.
model.input_data = model.input_data.dropna(subset=required_columns)
print(f"input shape after dropping non-nullable nulls: {model.input_data.shape}")

In [None]:
def fix_data_types(column: pd.Series, expected_type: str) -> "np.ndarray | pd.api.extensions.ExtensionArray | pd.Series":
    """Cast a single column to the requested logical type.

    Args:
        column (pd.Series): Column to convert.
        expected_type (str): One of 'int', 'object', 'float' or 'category'.
            Any other value leaves the column untouched.

    Returns:
        The ``.values`` of the cast column: a NumPy array for 'object' and
        'float', a nullable ``Int64`` array for 'int' and a ``pd.Categorical``
        for 'category'. For an unrecognised ``expected_type`` the input column
        itself is returned unchanged.
    """
    # TODO: skip the cast when the column already has the expected type.
    if expected_type == 'int':
        # Nullable Int64 keeps missing values instead of failing on NaN.
        return column.astype('Int64').values

    if expected_type == 'object':
        return column.astype('object').values

    if expected_type == 'float':
        return column.astype('float64').values

    if expected_type == 'category':
        try:
            # Numeric codes stored as floats (e.g. 28.0) become integer
            # categories. The cast is applied to the column that was passed in.
            return column.astype('int').astype('category').values
        except (TypeError, ValueError):
            # Non-numeric labels or missing values cannot be cast to int:
            # categorise the raw values instead.
            return column.astype('object').astype('category').values

    return column
     

In [None]:
# Logical type wanted for each column, listed in the order they are converted.
target_types = {
    "id": 'int',
    # numerical features
    "Age": 'int',
    "Annual_Premium": 'float',
    "Vintage": 'int',
    # categorical features
    "Gender": 'category',
    "Driving_License": 'category',
    "Region_Code": 'category',
    "Previously_Insured": 'category',
    "Vehicle_Damage": 'category',
    "Vehicle_Age": 'category',
    "Policy_Sales_Channel": 'category',
}

for column_name, target_type in target_types.items():
    model.input_data[column_name] = fix_data_types(model.input_data[column_name], target_type)

# The label is cast directly to a plain (non-nullable) int64.
model.input_data["Response"] = model.input_data["Response"].astype('int64')

In [None]:
# Report the dtype of every column after the type fixes.
column_dtypes = model.input_data.dtypes
print(f"data types for all columns:\n----------------------- \n{column_dtypes}")

In [None]:
# Validate with Pandera: check the type-fixed frame against input_schema.
# The schema is built with strict=True, so columns that are not declared in it
# are rejected as well.
input_schema.validate(model.input_data)

### Visualize Input Data

In [None]:
def viz_data_distribution(df: pd.DataFrame, column_name: str, color_column: str = 'Response') -> None:
    """Display a histogram of one column, with bars split by colour.

    Args:
        df (pd.DataFrame): Data to plot.
        column_name (str): Column whose distribution goes on the x axis.
        color_column (str): Column used to colour the bars. Defaults to
            'Response', the label column of this notebook.

    Returns:
        None. The figure is rendered with ``fig.show()``; it is not returned,
        so calling this as the last line of a cell does not display it twice.
    """
    fig = px.histogram(
        df,
        x=column_name,
        color=color_column,
        title=f"Distribution of {column_name} by {color_column}",
    )
    fig.show()

In [None]:
# One histogram per numerical feature.
numeric_features = model.model_settings['numerical_columns']
for feature in numeric_features:
    viz_data_distribution(model.input_data, feature)

# Set Up LGBM Model

In [None]:
# MODEL SETTINGS
# Tuning options are added to the existing settings dict in a single update.
# NOTE(review): key spellings (e.g. 'clibrate_while_tunning') presumably have to
# match what the src package reads -- verify there before renaming any of them.
model.model_settings.update({
    'loss_function': 'focal_loss',  # Choose one from [focal_loss, log_loss]

    # Iteration budgets for the tuning stages.
    'n_iters_loss': 2,
    'n_iters_hyperparams': 2,
    'n_iters_boost_rounds': 2,

    'optuna_eval_metric': 'auc',
    'clibrate_while_tunning': True,

    'id_columns': ['id'],
})

print(model.model_settings)

In [None]:
# Presumably splits the prepared data held by `model` into train / validation /
# test sets. NOTE(review): split ratios and any seeding live inside
# src.load_and_split_data -- confirm there.
train, validate, test = src.load_and_split_data(model)

In [None]:
# Build the preprocessor from the training split and the settings dict, and
# keep it on the model. NOTE(review): presumably fitted on `train` only to
# avoid leakage -- confirm in src.create_preprocessor.
model.preprocessor = src.create_preprocessor(train,model.model_settings)

# Tuning

##### Approaches to compare
+ log loss
+ focal loss, all-at-once tuning
+ focal loss, two tunings with early stopping
+ focal loss, two tunings without early stopping (i.e. mean(best_round))
+ focal loss, two tunings without early stopping (i.e. mean(best_round)), calibrating while tuning

### Tuning LogLoss

In [None]:
# Tuning run for the log-loss variant, given the train and validation splits.
# NOTE(review): `params` is reassigned by each of the tuning cells; keep a copy
# under a distinct name if the runs are meant to be compared.
params = src.model_tuning_logloss(model, train, validate)

### Tuning Focal Loss 1 step

In [None]:
# Tuning run for the focal-loss variant tuned in a single step.
# NOTE(review): this overwrites `params` from any tuning cell run before it;
# keep a copy under a distinct name if the runs are meant to be compared.
params = src.model_tunning_focal_loss_1_step(model, train, validate)

### Tuning Focal Loss 2 steps

In [None]:
# Tuning run for the focal-loss variant tuned in two steps.
# NOTE(review): this overwrites `params` from any tuning cell run before it;
# keep a copy under a distinct name if the runs are meant to be compared.
params = src.model_tunning_focal_loss_2_steps(model, train, validate)

# Training

# Evaluation