In [25]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor

# load sample dataset

In [21]:
if os.path.exists("predictions.csv"):
    
    print("existing predictions.csv loaded")
    df=pd.read_csv('predictions.csv')
    
else:
    
    print("new predictions.csv created")

    # read in prediction
    df=pd.read_csv("train.csv")

    # select the features to be used as inputs to the model
    features=['cont1','cont2', 'cont3', 'cont4']

    # select the target variable
    target='loss'

    # split train data into X and y
    X_train = df[features]
    y_train = df[target]

    # fit model to training data
    model = XGBRegressor()
    model.fit(X_train, y_train)

    # append to df
    df['pred']=model.predict(X_train)

    # rename for convenience
    df['actual']=df['loss']

    # export to df
    cols=['id', 'actual', 'pred']
    df[cols].to_csv('predictions.csv', index=False)
    df=df[cols]

new predictions.csv created


In [40]:
# create downsampling function

def downsample_func(df):

    # randomly select 75% of data with replacement
    df=df.sample(frac =.75, replace = True)
    return df

# calculate gini

In [30]:
def gini_table(df, pred, act):
    
    # returns a dataframe from which a gini coefficient can be calculated
    # also can create cumulative gains curves
    # pred = predicted values (output from emblem) argument required is the name of the column in df
    # act = actual values (number of claims) argument required is the name of the column in df
    
    # 3 useful outputs
    # Perc of Obs and Perc of Claims can be used to create Cumulative Gains Curves
    # Gini_Area can be used to calculate the gini coefficient. Each Gini_Area is the approximate area under Cumulative
    # gains curve. Feel free to change to trapezium rule in future.
    
    df = df.sort_values(by=pred, ascending=False)
    df = df.reset_index()
    df['Cumulative Claims'] = df[act].cumsum()
    df['Perc of Obs'] = (df.index + 1) / df.shape[0]
    df['Perc of Claims'] = df['Cumulative Claims'] / df.iloc[-1]['Cumulative Claims']
    df['gini_area'] = df['Perc of Claims'] / df.shape[0]
    return df

In [31]:
def calc_gini(df, model, obs):
    
    # uses output from gini_table to calculate a gini coefficient. Formula comes from R:\Pricing\Personal Lines Pricing - Motor\Technical\21. Provident\
    # 4. SAS Processes\Technical MI Tools\Gini_Coefficients_and_U_Statistics\1.Motivation - GiniCoefficientpaper.pdf
    # model = column name of modelled values you wish to calculate gini coefficient of.
    # obs = column name of actual values (number of claims)
    
    d1 = gini_table(df, model, obs)
    Gini_coef = round((d1.sum()['gini_area'] - 0.5) *2,6)
    return(Gini_coef)

In [32]:
# def rebase(df, anchor_col, col_list):
    
#     # rebases all columns in col_list to anchor_col
    
#     anchor_total = df[anchor_col].sum()
#     for col in col_list:
#         col_total = df[col].sum()
#         rebase_mult = anchor_total / col_total
#         df[col + " Rebased"] = df[col]*rebase_mult

In [44]:
def Average(lst):
    return sum(lst) / len(lst)

In [47]:
for i in range(1, 11):
    
    ginis=[]
    
    # create a downsampled dataframe
    down_df=downsample_func(df)
    
    # calculate gini on sample
    gini=calc_gini(down_df, 'pred', 'actual')
    ginis.append(gini)
        
    # print resulsts
    print('Run:', i, 'Gini:', gini)
    
# calculate average gini
average = Average(ginis)
print('Overall Average:', average)

Run: 1 Gini: 0.129879
Run: 2 Gini: 0.128126
Run: 3 Gini: 0.130869
Run: 4 Gini: 0.130702
Run: 5 Gini: 0.129732
Run: 6 Gini: 0.124549
Run: 7 Gini: 0.130455
Run: 8 Gini: 0.126693
Run: 9 Gini: 0.13148
Run: 10 Gini: 0.12933
Overall Average: 0.12933


In [None]:
# plot