In [12]:
import polars as pl
import plotly.express as px
import pandas as pd
from polars_visualization import pl_config as config
from freqSevEda import FreqSevEDA
import plotly.io as pio
import json

# Data overview

- We have 1580 entries. Grouping by Loc ID and Año Póliza we saw that each policy is annual.

In [13]:
#path = r"C:\Users\abrah\OneDrive\Desktop\casoCAS\archivo\data\final-dataset.xlsx"
path2 = r"C:\Users\abrah\OneDrive\Desktop\casoCAS\archivo\data\final-dataset_V.5.5.xlsx"
df = pl.read_excel(path2, sheet_name="Sheet1")

with config():
    print(df)

shape: (1_580, 21)
| Loc ID | Año Póliza | Exposicion | Suma Asegurada | Prima   | Evento ID | Numero Siniestros | Mes  | Duración de la inundación (día… | Severidad de la inundación (es… | Precipitación (mm) | Incremento del Nivel del Río (… | Monto de siniestro | Latitud | Longitud | Ciudad        | Pais         | Codigo pais | Continente | Sub continente     | Continente intermedio |
|--------|------------|------------|----------------|---------|-----------|-------------------|------|---------------------------------|---------------------------------|--------------------|---------------------------------|--------------------|---------|----------|---------------|--------------|-------------|------------|--------------------|-----------------------|
| loc001 | 2,014      | 1          | 87,223,100     | 659,145 | null      | 0                 | null | null                            | null                            | null               | null                            | 0            

In [4]:
eda = FreqSevEDA(df, exposure="Exposicion", claimNb="Numero Siniestros", claimAmount="Monto de siniestro")
eda.interactive_graph()

Dropdown(description='Columns:', options=('-', 'Loc ID', 'Año Póliza', 'Exposicion', 'Suma Asegurada', 'Prima'…

Output()

In [10]:
eda.graphFreqSev("Ciudad").update_layout(template = "plotly_white").write_html(r"C:\Users\abrah\OneDrive\Desktop\casoCAS\docs\images\fregSevCity.html")

# Description by columns

- We have negative values for TIV (Is this a mistake?) how should we deal with those values? We have only one location with negative TIV loc068. In this case we will take absolute value of the TIV column, in that way we don't lose information.
- Information between years 2014 and 2023
- Is premium in USD dolars?
- Duration floods between 4 and 10 days
- Severity value between 2 and 5. Why we don't have value 1?
- Precipitation between 100 mm and 310 mm
- Raising of river between 1.2 M and 4.5 M

In [4]:
with config():
    print(df.describe())

#lets fix TIV values 

df = df.with_columns(pl.col("Suma Asegurada").abs())


shape: (9, 22)
| statistic  | Loc ID | Año Póliza | Exposicion | Suma Asegurada    | Prima          | Evento ID | Numero Siniestros | Mes       | Duración de la inundación (día… | Severidad de la inundación (es… | Precipitación (mm) | Incremento del Nivel del Río (… | Monto de siniestro | Latitud   | Longitud  | Ciudad | Pais      | Codigo pais | Continente | Sub continente            | Continente intermedio |
|------------|--------|------------|------------|-------------------|----------------|-----------|-------------------|-----------|---------------------------------|---------------------------------|--------------------|---------------------------------|--------------------|-----------|-----------|--------|-----------|-------------|------------|---------------------------|-----------------------|
| count      | 1580   | 1,580.000  | 1,580.000  | 1,580.000         | 1,580.000      | 188.000   | 1,580.000         | 188.000   | 188.000                         | 188.000               

# Information against Año Póliza

- Precipitación Promedio per years tends to increase

In [5]:
w = df.filter()

TypeError: at least one predicate or constraint must be provided

In [49]:
summary = (df.group_by(["Ciudad"]).agg(pl.col("Exposicion").sum().alias("Exposicion"),
                              pl.col("Suma Asegurada").sum(),
                              pl.col("Prima").sum().alias("Prima Ganada"),
                              pl.col("Evento ID").drop_nulls().n_unique().alias("Número Eventos"),
                              pl.col("Numero Siniestros").sum(),
                              pl.col("Duración de la inundación (días)").mean().alias("Duración Promedio"),
                              pl.col("Severidad de la inundación (escala 1-5)").mean().alias("Magnitud Inundaciones Promedio"),
                              pl.col("Precipitación (mm)").mean().alias("Precipitación Promedio"),
                              pl.col("Incremento del Nivel del Río (m)").mean().alias("Incremento Promedio"),
                              pl.col("Monto de siniestro").sum().alias("Incurrido"))
                         .with_columns((pl.col("Incurrido")/pl.col("Prima Ganada")).alias("Indice de Siniestralidad"),
                                       (pl.col("Prima Ganada")/pl.col("Suma Asegurada")*1000).alias("Tasa por Mil"),
                                       (pl.col("Incurrido")/pl.col("Suma Asegurada")).alias("TPR"),
                                       (pl.col("Numero Siniestros")/pl.col("Exposicion")).alias("Frecuencia"),
                                       (pl.col("Incurrido")/pl.col("Numero Siniestros")).alias("Severidad"))
                         .with_columns((pl.col("Frecuencia")*pl.col("Severidad")).alias("Prima Pura"))
                         .with_columns((pl.col("Prima Pura")/pl.col("Prima Ganada")).alias("proportion"))
                         .with_columns((pl.col("Suma Asegurada")/pl.col("Exposicion")).alias("Suma Asegurada Promedio"))
                         .with_columns((pl.col("Prima Pura")/pl.col("Suma Asegurada")*1000).alias("Tasa por mil inundación"))
                         .sort(by = "Indice de Siniestralidad", descending=False))

summary

Ciudad,Exposicion,Suma Asegurada,Prima Ganada,Número Eventos,Numero Siniestros,Duración Promedio,Magnitud Inundaciones Promedio,Precipitación Promedio,Incremento Promedio,Incurrido,Indice de Siniestralidad,Tasa por Mil,TPR,Frecuencia,Severidad,Prima Pura,proportion,Suma Asegurada Promedio,Tasa por mil inundación
str,i64,i64,i64,u32,i64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Maungakiekie-Tāmaki""",10,693707090,7754600,1,1,4.0,3.0,150.0,1.7,80000,0.010316,11.178493,0.000115,0.1,80000.0,8000.0,0.001032,6.9370709e7,0.011532
"""Auckland""",10,3909399100,41734924,1,1,4.0,3.0,150.0,1.7,470000,0.011262,10.675534,0.00012,0.1,470000.0,47000.0,0.001126,3.9093991e8,0.012022
"""Germiston""",10,1866759870,17050862,1,1,7.0,4.0,150.0,2.1,220000,0.012903,9.133934,0.000118,0.1,220000.0,22000.0,0.00129,1.86675987e8,0.011785
"""Johannesburgo""",10,1790664720,16195869,1,1,7.0,4.0,150.0,2.1,220000,0.013584,9.044613,0.000123,0.1,220000.0,22000.0,0.001358,1.79066472e8,0.012286
"""Wuhan""",20,17741220660,163686523,1,2,6.0,4.0,230.0,3.2,2480000,0.015151,9.226339,0.00014,0.1,1.24e6,124000.0,0.000758,8.87061033e8,0.006989
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""London""",20,2477804650,7901131,1,2,10.0,5.0,200.0,3.5,1470000,0.186049,3.188763,0.000593,0.1,735000.0,73500.0,0.009302,1.2389e8,0.029663
"""Miami""",10,5599894070,12335890,2,2,8.0,5.0,250.0,3.4,2300000,0.186448,2.202879,0.000411,0.2,1.15e6,230000.0,0.018645,5.59989407e8,0.041072
"""Houston""",20,3597449250,12761104,2,4,4.0,3.0,170.0,2.1,2400000,0.188072,3.547264,0.000667,0.2,600000.0,120000.0,0.009404,1.7987e8,0.033357
"""Nueva Delhi""",10,4697226150,10342047,1,1,10.0,5.0,270.0,4.0,2490000,0.240765,2.201735,0.00053,0.1,2.49e6,249000.0,0.024076,4.69722615e8,0.05301


# Distribution by country

In [41]:
json_path = r"C:\Users\abrah\OneDrive\Desktop\casoCAS\archivo\data\countries.geojson"
# Open and read the JSON file
with open(json_path) as file:
    countries = json.load(file)  # Parse the JSON data into a Python dictionary/list

In [42]:
pio.renderers
pio.renderers.default = "notebook_connected"

In [None]:
fig2 = px.choropleth_map(summary,
                        geojson=countries,
                        color="Suma Asegurada Promedio",
                        locations='Codigo pais',
                        featureidkey="properties.ISO_A3",
                        #center={"lat": 0, "lon": 0},
                        #map_style=#carto-voyager",
                        template="plotly_white",
                        color_continuous_scale = "YlOrRd", #PuRd
                        hover_name = "Pais",
                        zoom=0)


fig2.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, height = 300)
fig2.show()
fig2.write_html(r"C:\Users\abrah\OneDrive\Desktop\casoCAS\docs\images\sumaAseguradaPromedio2.html")

In [39]:
fig2 = px.choropleth_map(summary,
                        geojson=countries,
                        color="Suma Asegurada",
                        locations='Codigo pais',
                        featureidkey="properties.ISO_A3",
                        #center={"lat": 0, "lon": 0},
                        #map_style=#carto-voyager",
                        template="plotly_white",
                        color_continuous_scale = "YlOrRd", #PuRd
                        hover_name = "Pais",
                        zoom=0)

fig2.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, height = 300)
fig2.write_html(r"C:\Users\abrah\OneDrive\Desktop\casoCAS\docs\images\SumaAsegurada2023.html")

# Heat map indice de siniestralidad

In [7]:
f = summary.filter(pl.col("Numero Siniestros")!=0)

In [51]:
fig = px.density_heatmap(summary,
                         x = "Precipitación Promedio",
                         y="Incremento Promedio", text_auto = False, z = "Tasa por mil inundación",color_continuous_scale = "YlOrRd", template = "plotly_white")

fig.update_layout( coloraxis_colorbar_title_text = 'Tasa por mil inundación')

fig.write_html(r"C:\Users\abrah\OneDrive\Desktop\casoCAS\docs\images\tasapormilvsIncrementoPromedio.html")


In [11]:
#banding continuos variables

In [19]:
from sklearn.preprocessing import KBinsDiscretizer

def binding_dimension(df,dimension, n_bins):

    est = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy="quantile")
    tiv = df.get_column(dimension).to_numpy()
    
    
    binding_dimension = pl.Series(
        list(est.fit_transform(tiv.reshape(-1, 1)).reshape(1, -1)[0])
    )
    df2 = df.with_columns(**{f"binding{dimension}":binding_dimension}).with_columns(pl.col(f"binding{dimension}").cast(pl.String))

    return df2, est


# df2, est = binding_tiv(df)


# with aux.pl_config(all_rows=True):

#     bins_edge = pl.DataFrame(est.bin_edges_[0])
#     print(bins_edge)


In [25]:
bins_edge = pl.DataFrame(est.bin_edges_[0])
print(bins_edge)

shape: (6, 1)
┌──────────────┐
│ column_0     │
│ ---          │
│ f64          │
╞══════════════╡
│ 4.9991e6     │
│ 4.8801934e7  │
│ 8.362968e7   │
│ 1.72392128e8 │
│ 3.44546678e8 │
│ 1.8210e9     │
└──────────────┘


In [20]:
df2, est =binding_dimension(df, "Suma Asegurada", n_bins = 5)
df2

Loc ID,Año Póliza,Exposicion,Suma Asegurada,Prima,Evento ID,Numero Siniestros,Mes,Duración de la inundación (días),Severidad de la inundación (escala 1-5),Precipitación (mm),Incremento del Nivel del Río (m),Monto de siniestro,Latitud,Longitud,Ciudad,Pais,Codigo pais,Continente,Sub continente,Continente intermedio,bindingSuma Asegurada
str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,f64,f64,str,str,str,str,str,str,str
"""loc001""",2014,1,87223100,659145,,0,,,,,,0,-33.9069,18.4163,"""Cape Town""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""2.0"""
"""loc001""",2015,1,89011170,672330,,0,,,,,,0,-33.9069,18.4163,"""Cape Town""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""2.0"""
"""loc001""",2016,1,90444250,685780,,0,,,,,,0,-33.9069,18.4163,"""Cape Town""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""2.0"""
"""loc001""",2017,1,92596820,699500,,0,,,,,,0,-33.9069,18.4163,"""Cape Town""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""2.0"""
"""loc001""",2018,1,94439500,713490,,0,,,,,,0,-33.9069,18.4163,"""Cape Town""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""2.0"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""loc200""",2019,1,56113940,566220,,0,,,,,,0,-25.8874,28.1283,"""Rooihuiskraal""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""1.0"""
"""loc200""",2020,1,57283920,577540,,0,,,,,,0,-25.8874,28.1283,"""Rooihuiskraal""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""1.0"""
"""loc200""",2021,1,58561350,589090,,0,,,,,,0,-25.8874,28.1283,"""Rooihuiskraal""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""1.0"""
"""loc200""",2022,1,59627170,600870,,0,,,,,,0,-25.8874,28.1283,"""Rooihuiskraal""","""South Africa""","""ZAF""","""Africa""","""Sub-Saharan Africa""","""Southern Africa""","""1.0"""


In [24]:
graph = px.scatter(
    df2,
    y="Prima",
    x="Ciudad",
    color="bindingSuma Asegurada",
    template="plotly_white",
)
graph.show()