In [1]:
import datetime
import os
import sys
import pandas as pd
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
import dataframe_image as dfi
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter


# Absolute path of the directory the kernel is currently running in.
notebook_dir = os.path.abspath(os.getcwd())

# Root of the market_models checkout. The MARKET_MODELS_ROOT environment
# variable overrides the location; the original hard-coded path stays as the
# default so existing setups keep working unchanged.
project_root = os.environ.get('MARKET_MODELS_ROOT', '/home/lerceg/LukaPosao/market_models/')
src_path = os.path.join(project_root, 'src')

# Guard the append so re-running this cell does not stack duplicate entries.
# NOTE(review): the imports below use the `market_models.src...` package path,
# which resolves from the parent of `market_models`, not from `src_path` —
# confirm this entry is actually needed.
if src_path not in sys.path:
    sys.path.append(src_path)

import market_models.src.models.train_model as tm
import market_models.src.models.predict_model as pm
import market_models.src.utils as utils

%load_ext autoreload
%autoreload 
import market_models.src.features.build_features as build_features


In [8]:
# Dataset identifier handed to the project path helpers below.
data_name = 'netrisk_casco_2023_11_14__2023_11_20__2023_12_12__2023_12_22__2023_12_28__2023_12_29__2024_01_02'

# Price column analysed in the rest of the notebook; swap the two assignments
# to switch to the other column.
target_variable = 'UNIQA_price'
#target_variable = 'GENERALI_price'

# Resolve the input locations through the project helpers and load the frame
# together with its feature list.
data_path = utils.get_processed_data_path(data_name)
features_path = utils.get_features_path(data_name)
data, features = utils.load_data(data_path, features_path, target_variable)

# Keep the last row seen for each postal code, order by postal code, then
# collect the postal codes that share the same target price into lists.
postal_code = (
    data.drop_duplicates(subset=['PostalCode'], keep='last')
    .loc[:, ['PostalCode', target_variable]]
    .sort_values(by='PostalCode')
    .groupby(target_variable)['PostalCode']
    .agg(list)
)

In [9]:
# Display the loaded frame (pandas abbreviates the middle rows in the output).
data

Unnamed: 0,DateCrawled,isRecent,CarMake,CarAge,ccm,kw,kg,car_value,CarMakerCategory,PostalCode,PostalCode2,PostalCode3,Category,Longitude,Latitude,Age,LicenseAge,BonusMalus,BonusMalusCode,UNIQA_price
0,2023_11_14,False,28,3,998,49,899,15337.400,1.00,1016,10,101,9,19.0404,47.4984,40,18,1.0,2,115246.0
1,2023_11_14,False,20,2,999,92,1280,24388.000,1.00,1016,10,101,9,19.0404,47.4984,49,18,0.0,1,203799.0
2,2023_11_14,False,12,3,999,54,974,6757.400,0.97,1016,10,101,9,19.0404,47.4984,27,18,0.0,1,150166.0
3,2023_11_14,False,23,3,998,49,996,8447.400,0.97,1016,10,101,9,19.0404,47.4984,57,18,7.0,8,141611.0
4,2023_11_14,False,58,3,999,59,1270,11568.336,1.00,1016,10,101,9,19.0404,47.4984,43,18,7.0,8,162120.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6005,2024_01_02,True,66,1,1197,85,1390,25532.000,0.97,4100,41,410,1,21.5500,47.2167,35,18,0.0,1,158230.0
6006,2024_01_02,True,66,1,1197,85,1390,25532.000,0.97,2800,28,280,6,18.3933,47.5849,35,18,0.0,1,135286.0
6007,2024_01_02,True,66,1,1197,85,1390,25532.000,0.97,1107,11,110,8,19.0404,47.4984,35,18,0.0,1,169958.0
6008,2024_01_02,True,66,1,1197,85,1390,25532.000,0.97,5900,59,590,1,20.6667,46.5667,35,18,0.0,1,135286.0


In [4]:
# One row per BonusMalus value: keep the last occurrence in `data`, retain only
# the BonusMalus and target price columns, and order by BonusMalus.
bonus_malus = (
    data.drop_duplicates(subset='BonusMalus', keep='last')
    .loc[:, ['BonusMalus', target_variable]]
    .sort_values(by='BonusMalus')
)

In [5]:
# Display the per-BonusMalus table (one row per BonusMalus value with its price).
bonus_malus

Unnamed: 0,BonusMalus,UNIQA_price
804,0.0,192902.0
372,1.0,192902.0
242,2.0,193608.0
353,3.0,193608.0
366,4.0,193608.0
583,5.0,223520.0
81,6.0,223520.0
171,7.0,223520.0
480,8.0,235533.0
38,9.0,235533.0


In [6]:
# Rebuild the per-BonusMalus table from `data` before rescaling so this cell is
# idempotent: rescaling the existing column in place would invert the ratios
# again every time the cell is re-run.
bonus_malus = (
    data.drop_duplicates('BonusMalus', keep='last')[['BonusMalus', target_variable]]
    .sort_values('BonusMalus')
)

# Express each price as (price of the first row) / (price of this row), where
# the first row is the lowest BonusMalus value after sorting.
# NOTE(review): the Age cell further down divides the other way round
# (price / first price) — confirm the inverse ratio is intended here.
bonus_malus[target_variable] = bonus_malus[target_variable].iloc[0] / bonus_malus[target_variable]

In [61]:
# Display bonus_malus again; by notebook position this follows the cell that
# rescales the target column relative to the first row.
# NOTE(review): the saved output under this cell is labelled GENERALI_price
# while target_variable is set to 'UNIQA_price' near the top — the output looks
# stale; re-run the notebook top to bottom to refresh it.
bonus_malus

Unnamed: 0,BonusMalus,GENERALI_price
804,0.0,1.0
372,1.0,0.793603
242,2.0,0.793603
353,3.0,0.793603
366,4.0,0.758332
583,5.0,0.758332
81,6.0,0.758332
171,7.0,0.758332
480,8.0,0.758332
38,9.0,0.758332


In [62]:
# Number of distinct (non-missing) values in the target column of bonus_malus.
len(bonus_malus[target_variable].dropna().unique())

3

In [67]:
# One row per Age value: keep the last occurrence in `data`, retain only the
# Age and target price columns, and order by Age.
age = (
    data.drop_duplicates(subset='Age', keep='last')
    .loc[:, ['Age', target_variable]]
    .sort_values(by='Age')
)

In [68]:
# Collect the ages that share the same target price into one list per price.
# The table is rebuilt from `data` instead of regrouping `age` in place, so
# re-running this cell gives the same result rather than grouping the
# already-grouped frame a second time.
age = (
    data.drop_duplicates('Age', keep='last')[['Age', target_variable]]
    .sort_values('Age')
    .groupby(target_variable)['Age']
    .agg(list)
    .reset_index()
)

In [69]:
# Rescale the target column so every value is a multiple of the value in the
# first row of `age`.
age[target_variable] = age[target_variable].div(age[target_variable].iloc[0])

In [70]:
# Display the age table (target column alongside the lists of ages).
age

Unnamed: 0,GENERALI_price,Age
0,1.0,[35]
1,1.077178,"[40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 6..."
2,1.14286,"[34, 36, 38]"
3,1.182267,"[70, 72, 74, 76]"
4,1.313631,"[30, 32]"
5,1.392448,[78]
6,1.839082,"[24, 26, 28]"
7,2.036125,"[18, 20, 22]"


In [48]:
# One row per CarAge value: keep the last occurrence in `data`, retain only the
# CarAge and target price columns, and order by CarAge.
car_age = (
    data.drop_duplicates(subset='CarAge', keep='last')
    .loc[:, ['CarAge', target_variable]]
    .sort_values(by='CarAge')
)

In [49]:
# For every distinct target price, list the CarAge values that map to it.
car_age.groupby(by=target_variable)[['CarAge']].aggregate(list)

Unnamed: 0_level_0,CarAge
GENERALI_price,Unnamed: 1_level_1
96460.0,[4]
132231.0,[6]
193357.0,[9]
221142.0,[2]
238575.0,[5]
248212.0,[7]
331262.0,[1]
393022.0,[8]
2066529.0,[3]


In [50]:
# One row per (CarAge, CarMake) combination, keeping the last occurrence in
# `data` with all columns, ordered by CarMake.
car_model = (
    data.drop_duplicates(subset=['CarAge', 'CarMake'], keep='last')
    .sort_values(by='CarMake')
)

In [51]:
# For every distinct target price, list the CarAge and CarMake values that map
# to it. The columns are selected with a list: selecting several GroupBy
# columns with a tuple is deprecated and raises in pandas 2.x.
car_model.groupby(target_variable)[['CarAge', 'CarMake']].agg(list)

  car_model.groupby(target_variable)[('CarAge', 'CarMake')].agg(list)


Unnamed: 0_level_0,CarAge,CarMake
GENERALI_price,Unnamed: 1_level_1,Unnamed: 2_level_1
85443.0,[7],[58]
86979.0,[7],[30]
91675.0,[8],[19]
96460.0,"[4, 5]","[29, 29]"
101907.0,[7],[45]
...,...,...
2066529.0,"[2, 3]","[45, 45]"
2562293.0,[8],[18]
3359839.0,[3],[18]
3497493.0,[6],[18]


In [52]:
# One row per PostalCode value: keep the last occurrence in `data`, retain only
# the PostalCode and target price columns, and order by PostalCode.
postal_code = (
    data.drop_duplicates(subset=['PostalCode'], keep='last')
    .loc[:, ['PostalCode', target_variable]]
    .sort_values(by='PostalCode')
)

In [53]:
# For every distinct target price, list the postal codes that map to it.
postal_code.groupby(by=target_variable)['PostalCode'].aggregate(list)

GENERALI_price
246081.0    [2060, 2400, 3000, 3580, 6771, 7030, 7100, 715...
258700.0    [2660, 2700, 3521, 3526, 3527, 3529, 3600, 395...
268165.0    [2440, 2800, 3300, 3700, 3980, 4200, 4244, 770...
283939.0    [2840, 3533, 3770, 4002, 5400, 5430, 5540, 600...
299714.0    [2500, 2510, 2740, 2750, 2760, 2890, 3100, 403...
315488.0           [2117, 2120, 2200, 2225, 2230, 2721, 2730]
331262.0    [2030, 2040, 2045, 2119, 2143, 2151, 2220, 233...
340727.0    [1033, 1046, 1048, 1103, 1106, 1108, 1141, 114...
347037.0    [2013, 2051, 2083, 2092, 2100, 2112, 2131, 230...
362811.0    [1025, 1031, 1037, 1044, 1045, 1087, 1097, 110...
378586.0    [1026, 1062, 1089, 1118, 1121, 1124, 1125, 114...
410134.0                             [1028, 1038, 1039, 1135]
436830.0                                               [1011]
Name: PostalCode, dtype: object