In [1]:
import datetime
import os
import sys
import pandas as pd
import numpy as np
import xgboost
from sklearn.model_selection import train_test_split
import dataframe_image as dfi
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# Working directory the notebook was started from (not used by the visible cells).
notebook_dir = os.path.abspath(os.getcwd())

# Make the project's `src` directory importable. The project root can be
# overridden with the MARKET_MODELS_ROOT environment variable; the fallback is
# the original hard-coded checkout location.
project_root = os.environ.get('MARKET_MODELS_ROOT', '/home/lerceg/LukaPosao/market_models/')
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:  # keep sys.path free of duplicates when the cell is re-run
    sys.path.append(src_path)

import market_models.src.models.train_model as tm
import market_models.src.models.predict_model as pm
import market_models.src.utils as utils

%load_ext autoreload
%autoreload 
import market_models.src.features.build_features as build_features


In [2]:
# Dataset identifier and the insurer price column used as the target.
data_name = 'netrisk_casco_v7'
target_variable = 'GROUPAMA_price'  # alternative target: 'ALFA_price'

# Paths of the processed data and of the features, resolved by the project utils.
data_path = utils.get_processed_data_path(data_name)
features_path = utils.get_features_path(data_name)



In [3]:
feature = 'PostalCode'
data, features = utils.load_data(data_path, features_path, target_variable)

# Build the bracket table in one chain:
#   1. keep the last row seen for every distinct feature value,
#   2. collect the feature values that share one target value into a list,
#   3. replace the target values by consecutive integer bracket ids,
#   4. explode the lists so that every feature value gets its own row.
brackets = (
    data.drop_duplicates([feature], keep='last')[[feature, target_variable]]
    .sort_values(by=feature)
    .groupby(target_variable)[feature]
    .agg(list)
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': f'{target_variable}_{feature}_bracket'})
    .explode(feature)
)
    

   Unnamed: 0 DateCrawled  isRecent  CarMake  CarAge   ccm   kw    kg  \
0           0  2024_01_02      True       27       7  3604  209  1828   
1           1  2024_01_02      True       28       2   998   49   899   
2           2  2024_01_02      True       66       1  1197   85  1390   
3           3  2024_01_02      True       26       2  1997  184  1611   
4           4  2024_01_02      True        5       2  1499   90  1545   

   car_value  CarMakerCategory  ...  GENERALI_price  GENERTEL_price  \
0    38922.0              1.10  ...        331409.0             NaN   
1    15337.4              1.00  ...        120684.0             NaN   
2    25532.0              0.97  ...        299714.0             NaN   
3    44005.0              1.10  ...        719447.0             NaN   
4    30498.0              1.00  ...        329262.0             NaN   

   GROUPAMA_price  K&AMP;H_price  KÖBE_price  MAGYAR_price  SIGNAL_price  \
0             NaN       532519.0         NaN           NaN

In [24]:
# Show the exploded bracket table (one row per feature value).
brackets

Unnamed: 0,GROUPAMA_price_PostalCode_bracket,PostalCode
0,0,3170
0,0,5400
1,1,5122
2,2,3599
3,3,9154
...,...,...
19,19,1212
19,19,1213
19,19,1214
19,19,1215


In [3]:


def make_brackets(data, feature, target_variable):
    """Group the values of `feature` into brackets that share one target value.

    For every distinct `feature` value only its last row in `data` is kept,
    so each value lands in the bracket of the target it appears with last.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the `feature` and `target_variable` columns.
    feature : str
        Column whose values are grouped into brackets.
    target_variable : str
        Column whose distinct values define the brackets.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``f'{target_variable}_{feature}_bracket'`` (consecutive
        integers, numbered in ascending order of the target value) and
        `feature`, one row per feature value. Rows of the same bracket share
        an index label because of the final ``explode``.
    """
    bracket_col = f'{target_variable}_{feature}_bracket'
    last_seen = data.drop_duplicates([feature], keep='last')
    pairs = last_seen[[feature, target_variable]].sort_values(feature)
    # One list of feature values per distinct target value.
    grouped = pairs.groupby(target_variable)[feature].agg(list)
    # Swap the target values for 0..n-1 ids, then expose the ids as a column.
    numbered = grouped.reset_index(drop=True).reset_index()
    numbered = numbered.rename(columns={'index': bracket_col})
    return numbered.explode(feature)

def collapse_list(lst):
    """Return the ``(smallest, largest)`` pair of the values in `lst`."""
    lowest = min(lst)
    highest = max(lst)
    return lowest, highest

def merge_overlapping_tuples(tuple1, tuple2):
    """Merge two inclusive ``(start, end)`` ranges when they overlap.

    Returns the ``(start, end)`` of the combined range if the two ranges
    share at least one point (touching endpoints count as overlap),
    otherwise ``None``.
    """
    lo_a, hi_a = tuple1
    lo_b, hi_b = tuple2

    overlapping = lo_a <= hi_b and lo_b <= hi_a
    if not overlapping:
        return None
    return min(lo_a, lo_b), max(hi_a, hi_b)

    

In [25]:
# Insurer price columns for which bracket files are written.
target_variables = ['ALFA_price', 'ALLIANZ_price', 'GENERALI_price', 'GENERTEL_price', 'GROUPAMA_price', 'K&AMP;H_price', 'KÖBE_price', 'MAGYAR_price', 'SIGNAL_price', 'UNION_price', 'UNIQA_price', 'WÁBERER_price']

for target_variable in target_variables:
    data, features = utils.load_data(data_path, features_path, target_variable)
    # Leave out the rows crawled on 2024_01_05 when deriving the brackets.
    data_old = data[data['DateCrawled'] != '2024_01_05']

    for feature in ['Age', 'PostalCode']:
        # make_brackets expects (data, feature, target_variable). Keyword
        # arguments keep the two names from being swapped, which would group
        # the target values by feature instead of the other way round.
        brackets = make_brackets(data_old, feature=feature, target_variable=target_variable)
        # NOTE(review): the return value is unused; presumably the call is kept
        # so that the output directory exists — confirm against utils.
        brackets_dir = utils.get_feature_brackets_dir(target_variable)
        bracket_path = utils.get_brackets_path(target_variable, feature)
        print(bracket_path)

        brackets.to_csv(bracket_path, index=False)

../data/external/feature_brackets/ALFA_price_brackets/ALFA_price_Age_brackets.csv
../data/external/feature_brackets/ALFA_price_brackets/ALFA_price_PostalCode_brackets.csv
../data/external/feature_brackets/ALLIANZ_price_brackets/ALLIANZ_price_Age_brackets.csv
../data/external/feature_brackets/ALLIANZ_price_brackets/ALLIANZ_price_PostalCode_brackets.csv
../data/external/feature_brackets/GENERALI_price_brackets/GENERALI_price_Age_brackets.csv
../data/external/feature_brackets/GENERALI_price_brackets/GENERALI_price_PostalCode_brackets.csv
../data/external/feature_brackets/GENERTEL_price_brackets/GENERTEL_price_Age_brackets.csv
../data/external/feature_brackets/GENERTEL_price_brackets/GENERTEL_price_PostalCode_brackets.csv
../data/external/feature_brackets/GROUPAMA_price_brackets/GROUPAMA_price_Age_brackets.csv
../data/external/feature_brackets/GROUPAMA_price_brackets/GROUPAMA_price_PostalCode_brackets.csv
../data/external/feature_brackets/K&AMP;H_price_brackets/K&AMP;H_price_Age_brackets.c

In [21]:
# NOTE(review): `data_new` is not defined in any cell visible here, so this
# relies on leftover kernel state — confirm where it comes from.
brackets = make_brackets(data_new, 'CarAge', target_variable)

In [22]:
# Show the CarAge brackets computed in the previous cell.
brackets

Unnamed: 0,GROUPAMA_price_CarAge_bracket,CarAge
0,0,9
1,1,7
1,1,8
2,2,6
3,3,5
4,4,4
5,5,2
5,5,3
6,6,1


In [9]:
# PostalCode brackets for `data_new`, displayed as the cell output.
# NOTE(review): the output shown is a Series of lists, whereas make_brackets as
# defined in this notebook returns an exploded DataFrame — the output appears
# to come from an earlier version of the function; confirm.
make_brackets(data_new, 'PostalCode', target_variable)

UNION_price
63136.0    [2016, 2021, 2023, 2038, 2045, 2060, 2065, 206...
66850.0    [3400, 3418, 3432, 3433, 3434, 3450, 3458, 350...
74277.0    [1011, 1012, 1013, 1014, 1015, 1016, 1021, 102...
Name: PostalCode, dtype: object

In [31]:
# Reduce the Age brackets of `data_new` with collapse_list, i.e. to (min, max) pairs.
# NOTE(review): the per-bracket output shown requires make_brackets to return a
# Series of lists; on the exploded DataFrame returned by the definition in this
# notebook, `.apply` runs once per column instead — confirm.
make_brackets(data_new, 'Age', target_variable).apply(collapse_list)

UNION_price
63136.0     (35, 69)
69449.0     (31, 33)
72606.0     (71, 77)
94704.0     (23, 29)
157840.0    (19, 21)
Name: Age, dtype: object

In [32]:
# Same (min, max) reduction for `data_old`; the recorded output is an empty Series.
make_brackets(data_old, 'Age', target_variable).apply(collapse_list)

Series([], Name: Age, dtype: int64)

In [33]:
def _merge_ranges(ranges):
    """Merge overlapping inclusive ``(start, end)`` tuples.

    Parameters
    ----------
    ranges : iterable of (start, end)
        Ranges in any order; may be empty.

    Returns
    -------
    list of (start, end)
        Sorted, with every group of overlapping or touching ranges replaced
        by one range. An empty input yields an empty list.
    """
    merged = []
    for start, end in sorted(ranges):
        if merged and start <= merged[-1][1]:
            # Overlaps (or touches) the range collected last: extend it.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


# (min, max) of the Age brackets found in the old and in the new data.
# NOTE(review): `.apply(collapse_list)` yields one tuple per bracket only when
# make_brackets returns a Series of lists; with the exploded DataFrame returned
# by the definition in this notebook it runs once per column — confirm.
ranges = pd.concat([make_brackets(data_old, 'Age', target_variable), make_brackets(data_new, 'Age', target_variable)]).apply(collapse_list)

# Sort the ranges by their start value.
sorted_data = ranges.sort_values()

# Merge overlapping ranges (plain iteration replaces Series.iteritems(), which
# was removed in pandas 2.0; an empty input no longer raises).
merged_ranges = _merge_ranges(sorted_data.tolist())

# Create a new Series with the merged ranges.
result_series = pd.Series(merged_ranges, name='merged_ranges', dtype=object)

print(result_series)


0    (19, 21)
1    (23, 29)
2    (31, 33)
3    (35, 69)
4    (71, 77)
Name: merged_ranges, dtype: object


  for index, value in sorted_data.iteritems():


pandas.core.series.Series

In [43]:
# Load the processed 'netrisk_casco_v1' data set. This rebinds `data`,
# replacing the frame loaded by the earlier cells.
data = pd.read_csv(utils.get_processed_data_path('netrisk_casco_v1'))

In [46]:
# Every column whose name contains '_price', and the insurer names obtained by
# stripping that marker from the column names.
price_cols = list(data.filter(like='_price').columns)
insurers = [col.replace('_price', '') for col in price_cols]

def add_rank(row, price_columns=None):
    """Rank the insurers' prices within one row, cheapest first.

    Parameters
    ----------
    row : pandas.Series
        One row of the data frame; must contain the price columns.
    price_columns : list of str, optional
        Price columns to rank. Defaults to the notebook-level `price_cols`.

    Returns
    -------
    pandas.Series
        Positionally aligned with `price_columns`: entry ``i`` is the 1-based
        rank of column ``i`` (1 = lowest price), NaN where the price is
        missing. Equal prices are ranked in column order.
    """
    if price_columns is None:
        price_columns = price_cols
    # A row taken from a mixed-dtype frame is object-typed; rank on floats.
    prices = row[price_columns].astype(float)
    # method='first' breaks ties by position; missing prices stay NaN.
    ranks = prices.rank(method='first')
    # Keep the column order: the caller assigns the result positionally to the
    # rank columns, so re-sorting by insurer name would mis-align the ranks
    # whenever the price columns are not in alphabetical order.
    return pd.Series(ranks.to_numpy())

# One '<insurer>_rank' column per insurer, filled row by row with add_rank.
rank_cols = [f'{insurer}_rank' for insurer in insurers]
data[rank_cols] = data.apply(add_rank, axis=1)


In [47]:
# Inspect the rank columns added in the previous cell.
data[rank_cols]

Unnamed: 0,ALFA_rank,ALLIANZ_rank,GENERALI_rank,GENERTEL_rank,GROUPAMA_rank,K&AMP;H_rank,KÖBE_rank,MAGYAR_rank,SIGNAL_rank,UNION_rank,UNIQA_rank,WÁBERER_rank
0,6.0,9.0,5.0,,3.0,8.0,2.0,,,1.0,4.0,7.0
1,3.0,8.0,10.0,,2.0,4.0,5.0,6.0,11.0,1.0,9.0,7.0
2,7.0,9.0,10.0,,6.0,8.0,3.0,2.0,11.0,5.0,1.0,4.0
3,6.0,9.0,4.0,,11.0,8.0,3.0,2.0,10.0,1.0,7.0,5.0
4,6.0,10.0,7.0,,3.0,9.0,1.0,2.0,8.0,5.0,11.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5190,7.0,4.0,5.0,,3.0,9.0,1.0,,,2.0,6.0,8.0
5191,3.0,8.0,10.0,,11.0,6.0,2.0,9.0,4.0,1.0,7.0,5.0
5192,3.0,9.0,5.0,,7.0,8.0,1.0,4.0,,2.0,6.0,
5193,6.0,3.0,9.0,,4.0,8.0,1.0,,5.0,2.0,7.0,


In [48]:
def rank_analysis(profiles):
    """Tabulate how often each insurer attains each rank.

    Reads the notebook-level `rank_cols` to select the rank columns of
    `profiles`.

    Returns
    -------
    (rank_value_counts, rank_percentage)
        `rank_value_counts` has one row per rank value and one column per
        rank column, holding occurrence counts. `rank_percentage` expresses
        every count as a percentage of its row total, i.e. the share of each
        insurer among the rows that received that rank.
    """
    def as_row_share(row):
        # Fraction of the row total contributed by each column.
        return row / row.sum()

    counts = profiles[rank_cols].apply(pd.Series.value_counts)
    shares = counts.apply(as_row_share, axis=1) * 100
    return counts, shares

In [49]:
# Rank statistics over the whole data set.
rank_value_counts, rank_percentage = rank_analysis(data)

In [63]:
# First row of the percentage table (rank 1.0 in the output below): each
# insurer's share of that rank.
rank_percentage.iloc[0 : 1]

Unnamed: 0,ALFA_rank,ALLIANZ_rank,GENERALI_rank,GENERTEL_rank,GROUPAMA_rank,K&AMP;H_rank,KÖBE_rank,MAGYAR_rank,SIGNAL_rank,UNION_rank,UNIQA_rank,WÁBERER_rank
1.0,5.928778,0.404235,1.732435,,11.241578,0.250241,47.661213,6.12127,0.26949,16.053898,3.291627,7.045236


In [55]:
# 8000 expressed as a percentage of the mean UNION price.
# NOTE(review): the meaning of the constant 8000 is not stated anywhere in the
# visible cells — confirm and move it to a named constant.
8000 / data['UNION_price'].mean() * 100

6.075722384836013