# La Mobilière Insurance Data - Final data checks pre-uploading
Last modified by AB
on the 12/06/2021

This notebook performs final checks on aggregated La Mobilière data ready for upload & sharing.

We performed the following checks:

- All datasets present the same columns
- Identical number of municipalities in all datasets
- Data type as reported in paper
- Average features always fall within corresponding confidence interval (when reported)
- Percentiles are monotonically non-decreasing (pct05 ≤ pct25 ≤ pct50 ≤ pct75 ≤ pct95)
- 0 customers implies NA for all other features (except census data) in municipality dataset
- Check the number of variables with no variation (max 5 per year: car class and building type variables)
- Check on sum of car class and type of building

In [1]:
import pandas as pd
import numpy as np
from asteval import Interpreter

### 1) All datasets present the same columns

In [2]:
#Municipality-level dataset:
# Verify that every yearly municipality file exposes exactly the same set of columns
# as the file of the preceding year.
years=range(2010, 2019, 1)

count=0
col=None  # column set of the previously loaded year (None before the first file)
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/combinedData/municipality_combinedData_%d.csv'%year)
    col_new=set(dfAggregated.columns)
    # Compare the sets in both directions: a one-sided difference (old - new) would
    # miss columns that only appear in the newer file.
    if col is not None and col != col_new:
        print('Error (mun)! Different set of columns between '+str(years[ind-1])+' and '+str(years[ind]))
        break
    count=count+1
    col=col_new

# Every year passed only if the loop never hit the break
if count==len(years):    
    print('All correct!')

All correct!


In [3]:
#ZIP-code-level dataset:
# Verify that every yearly ZIP-code file exposes exactly the same set of columns
# as the file of the preceding year.
years=range(2008, 2020, 1)

count=0
col=None  # column set of the previously loaded year (None before the first file)
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/aggregatedData/ZIP_%d.csv'%year)
    col_new=set(dfAggregated.columns)
    # Compare the sets in both directions: a one-sided difference (old - new) would
    # miss columns that only appear in the newer file.
    if col is not None and col != col_new:
        print('Error (zip)! Different set of columns between '+str(years[ind-1])+' and '+str(years[ind]))
        break
    count=count+1
    col=col_new

# Every year passed only if the loop never hit the break
if count==len(years):    
    print('All correct!')

All correct!


### 2) Identical number of municipalities

In [4]:
#Municipality-level dataset:
# Verify that every yearly municipality file has the same number of rows as the
# file of the preceding year (the row count is used here as the number of municipalities).
years=range(2010, 2019, 1)

count=0
row=None  # row count of the previously loaded year (None before the first file)
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/combinedData/municipality_combinedData_%d.csv'%year)
    row_new=len(dfAggregated)
    if row is not None and row != row_new:
        # Message corrected: this check compares row counts, not column sets
        print('Error (mun)! Different number of municipalities between '+str(years[ind-1])+' and '+str(years[ind]))
        break
    count=count+1
    row=row_new

# Every year passed only if the loop never hit the break
if count==len(years):    
    print('All correct!')

All correct!


### 3) Data type as reported in paper

In [5]:
# Expected pandas dtype for each column, keyed by column name.
# Insertion order is kept identical to the original literal because the checks
# iterate over this mapping.
_FLOAT = 'float64'
_INT = 'int'
data_type = {
    'unemp': _FLOAT,
    'age_mean': _FLOAT,
    'frac_own': _FLOAT,
    'frac_foreign': _FLOAT,
    'child_mean': _FLOAT,
    'custom': _INT,
    'frac_women': _FLOAT,
    'car1_custom_frac': _FLOAT,
    'car1_pr_mean': _FLOAT,
    'car1_y_pct50': _FLOAT,
    'car1_ccm_mean': _FLOAT,
    'car1_claim_mean': _FLOAT,
    'car1_sumcl_mean': _FLOAT,
    'car1_prem_mean': _FLOAT,
    'build_custom_frac': _FLOAT,
    'cl_furn_pct50': _FLOAT,
    'rooms_mean': _FLOAT,
    'build_ins_mean': _FLOAT,
    'build_y_pct50': _FLOAT,
    'build_claim_mean': _FLOAT,
    'build_sumcl_mean': _FLOAT,
    'build_prem_mean': _FLOAT,
    'BFS': _INT,
    'ZIP': _INT,
    'pop_census': _INT,
    'age_0_19_census': _FLOAT,
    'age_20_64_census': _FLOAT,
    'age_65+_census': _FLOAT,
    'frac_foreign_census': _FLOAT,
}

In [6]:
#Municipality-level dataset:
# Compare the dtype of every column listed in data_type (and present in the file)
# with its expected dtype, for each yearly municipality file.
years=range(2010, 2019, 1)

matched=0     # columns whose dtype equals the expected one
inspected=0   # columns that were actually compared
for year in years:
    dfAggregated = pd.read_csv('../Data/combinedData/municipality_combinedData_%d.csv'%year)
    for name, expected in data_type.items():
        # Columns absent from this file are skipped
        if name not in dfAggregated.columns:
            continue
        inspected += 1
        actual = dfAggregated[name].dtype
        if actual != expected:
            print(actual)
            print('Error (mun)! Incorrect data type for '+ name + ' in year: '+str(year))
            # Leaves the column loop only; the next year is still processed
            break
        matched += 1

if matched==inspected:
    print('All correct!')

All correct!


In [7]:
#ZIP-code-level dataset:
# Compare the dtype of every column listed in data_type (and present in the file)
# with its expected dtype, for each yearly ZIP-code file.
years=range(2008, 2020, 1)

count=0  # columns whose dtype equals the expected one
n=0      # columns that were actually compared
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/aggregatedData/ZIP_%d.csv'%year)
    for i in data_type.keys():
        if i in dfAggregated.columns:
            n=n+1
            if dfAggregated[i].dtype!=data_type[i]:
                print(dfAggregated[i].dtype)
                # Label corrected: this cell inspects the ZIP-code files, not the municipality files
                print('Error (zip)! Incorrect data type for '+ i + ' in year: '+str(year))
                # Leaves the column loop only; the next year is still processed
                break
            else:
                count=count+1


if count==n:    
    print('All correct!')

All correct!


### 4) Average features always fall within corresponding confidence interval (when reported)

In [8]:
# Base names of the variables whose '<name>_mean' value is compared against the
# '<name>_ci95' interval in the confidence-interval checks.
var_for_check = [
    'age', 'child',
    'car1_pr', 'car1_ccm', 'car1_claim', 'car1_sumcl', 'car1_prem',
    'rooms',
    'build_ins', 'build_claim', 'build_sumcl', 'build_prem',
]

In [9]:
#Municipality-level dataset:
# For every year and every variable in var_for_check, verify that '<var>_mean' lies
# inside the interval held in '<var>_ci95' (rows with zero customers are excluded).
years=range(2010, 2019, 1)

# The '_ci95' cells are evaluated with asteval and then indexed [0] (lower) and [1] (upper);
# presumably they are stored as text such as "[lo, hi]" - confirm against the data files.
aeval = Interpreter()

count=0
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/combinedData/municipality_combinedData_%d.csv'%year)
    # Filter once per year (it does not depend on the variable being checked)
    dfAggregated=dfAggregated[dfAggregated['custom']!=0]
    for i in var_for_check:
        # Count rows whose mean falls outside its interval. Each interval is parsed once,
        # and no helper column is written onto the filtered frame (avoids SettingWithCopyWarning).
        # A NaN mean fails both comparisons and is therefore counted as outside.
        outside=sum(
            0 if ci[0] <= mean <= ci[1] else 1
            for mean, ci in zip(dfAggregated[i+'_mean'], map(aeval, dfAggregated[i+'_ci95']))
        )
        if outside>0:
            print('Error (mun)! Mean doesn t fall within confidence interval for var '+ i + ' in year: '+str(year))
            # Leaves the variable loop only; the next year is still processed
            break
        else:
            count=count+1

if count==len(years)*len(var_for_check):    
    print('All correct!')

All correct!


In [10]:
#ZIP-code-level dataset:
# For every year and every variable in var_for_check, verify that '<var>_mean' lies
# inside the interval held in '<var>_ci95' (rows with zero customers are excluded).
years=range(2008, 2020, 1)
# The '_ci95' cells are evaluated with asteval and then indexed [0] (lower) and [1] (upper);
# presumably they are stored as text such as "[lo, hi]" - confirm against the data files.
aeval = Interpreter()

count=0
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/aggregatedData/ZIP_%d.csv'%year)
    # Filter once per year (it does not depend on the variable being checked)
    dfAggregated=dfAggregated[dfAggregated['custom']!=0]
    for i in var_for_check:
        # Count rows whose mean falls outside its interval. Each interval is parsed once,
        # and no helper column is written onto the filtered frame (avoids SettingWithCopyWarning).
        # A NaN mean fails both comparisons and is therefore counted as outside.
        outside=sum(
            0 if ci[0] <= mean <= ci[1] else 1
            for mean, ci in zip(dfAggregated[i+'_mean'], map(aeval, dfAggregated[i+'_ci95']))
        )
        if outside>0:
            print('Error (zip)! Mean doesn t fall within confidence interval for var '+ i + ' in year: '+str(year))
            # Leaves the variable loop only; the next year is still processed
            break
        else:
            count=count+1

if count==len(years)*len(var_for_check):    
    print('All correct!')

All correct!


### 5) Pct monotonically increasing

In [11]:
# Base names of variables that have percentile columns ('<name>_pct05' ... '<name>_pct95').
# NOTE(review): presumably meant to drive the percentile monotonicity checks in this
# section - confirm that those loops actually iterate over this list.
pct_var = [
    'age', 'child',
    'car1_pr', 'car1_ccm', 'car1_claim', 'car1_sumcl', 'car1_y', 'car1_prem',
    'rooms',
    'build_ins', 'build_claim', 'build_y', 'build_sumcl', 'build_prem',
]

In [12]:
#Municipality-level dataset:
# For every year and every variable in pct_var, verify that consecutive percentile
# columns never decrease (pct05 <= pct25 <= pct50 <= pct75 <= pct95) on rows with customers.
years=range(2010, 2019, 1)
q=['_pct05', '_pct25', '_pct50', '_pct75', '_pct95']

count=0
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/combinedData/municipality_combinedData_%d.csv'%year)
    dfAggregated=dfAggregated[dfAggregated['custom']!=0]
    # Fixed: iterate over pct_var (the list defined for this check) rather than
    # var_for_check, which omits 'car1_y' and 'build_y'.
    for i in pct_var:
        # Walk over adjacent percentile pairs: (pct05, pct25), (pct25, pct50), ...
        for lower_q, upper_q in zip(q[:-1], q[1:]):
            # Vectorised comparison; a NaN on either side compares False and is
            # therefore counted as a violation, as in the row-wise version.
            violations=(~(dfAggregated[i+upper_q]>=dfAggregated[i+lower_q])).sum()
            if violations>0:
                print('Error (mun)! Pct not monotonic for var '+ i + ' in year: '+str(year))
                # Leaves the percentile loop only; remaining variables are still processed
                break
            else:
                count=count+1

if count==len(years)*len(pct_var)*(len(q)-1):     
    print('All correct!')

All correct!


In [13]:
#ZIP-code-level dataset:
# For every year and every variable in pct_var, verify that consecutive percentile
# columns never decrease (pct05 <= pct25 <= pct50 <= pct75 <= pct95) on rows with customers.
years=range(2008, 2020, 1)
q=['_pct05', '_pct25', '_pct50', '_pct75', '_pct95']

count=0
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/aggregatedData/ZIP_%d.csv'%year)
    dfAggregated=dfAggregated[dfAggregated['custom']!=0]
    # Fixed: iterate over pct_var (the list defined for this check) rather than
    # var_for_check, which omits 'car1_y' and 'build_y'.
    for i in pct_var:
        # Walk over adjacent percentile pairs: (pct05, pct25), (pct25, pct50), ...
        for lower_q, upper_q in zip(q[:-1], q[1:]):
            # Vectorised comparison; a NaN on either side compares False and is
            # therefore counted as a violation, as in the row-wise version.
            violations=(~(dfAggregated[i+upper_q]>=dfAggregated[i+lower_q])).sum()
            if violations>0:
                print('Error (zip)! Pct not monotonic for var '+ i + ' in year: '+str(year))
                # Leaves the percentile loop only; remaining variables are still processed
                break
            else:
                count=count+1

if count==len(years)*len(pct_var)*(len(q)-1):    
    print('All correct!')

All correct!


### 6) 0 customers implies NA for all other features (except census data)

In [14]:
#Municipality-level dataset:
# For rows with zero customers, every column other than the identifier / census
# columns listed below must be missing (NaN).
years=range(2010, 2019, 1)

# Columns removed before testing for missing values
excluded_cols=['custom', 'BFS', 'municipality','pop_census','pop_d_census', 'age_0_19_census',
          'age_20_64_census', 'age_65+_census',  'frac_foreign_census']

passed=0
visited=0
for year in years:
    visited += 1
    dfAggregated = pd.read_csv('../Data/combinedData/municipality_combinedData_%d.csv'%year)
    zero_customer_rows = dfAggregated['custom']==0
    dfAggregated = dfAggregated[zero_customer_rows].drop(columns=excluded_cols)
    # All cells are missing exactly when the NaN count equals the number of cells
    n_missing = dfAggregated.isna().sum().sum()
    if n_missing != dfAggregated.shape[0]*dfAggregated.shape[1]:
        print('Error (mun)! 0 custom but non-na feature values in year: '+str(year))
        break
    passed += 1


if passed==visited:
    print('All correct!')

All correct!


### 7) Check number of variables with no variation (max 5 per year - perc variables)

In [15]:
#Municipality-level dataset:
# Per year, count the columns whose minimum equals their maximum on rows with
# customers; more than 5 such columns is reported as an error.
years=range(2010, 2019, 1)

years_ok=0
for year in years:
    dfAggregated = pd.read_csv('../Data/combinedData/municipality_combinedData_%d.csv'%year)
    dfAggregated = dfAggregated[dfAggregated['custom']!=0]
    constant_cols = sum(
        1 for name in dfAggregated.columns
        if dfAggregated[name].min()==dfAggregated[name].max()
    )
    if constant_cols>5:
        print('Error (mun)! No variation in the data for year '+str(year))
        break
    years_ok += 1

if years_ok==len(years):
    print('All correct!')

All correct!


In [16]:
#Zip-code-level dataset:
# Per year, count the columns whose minimum equals their maximum on rows with
# customers; more than 5 such columns is reported as an error.
years=range(2008, 2020, 1)

count=0

for ind,year in enumerate(years):
    check=0  # number of constant columns found in this year's file
    dfAggregated = pd.read_csv('../Data/aggregatedData/ZIP_%d.csv'%year)
    dfAggregated=dfAggregated[dfAggregated['custom']!=0]
    for i in dfAggregated.columns:
        if  dfAggregated[i].min()==dfAggregated[i].max():
            check=check+1
    if check>5:
        # Label corrected: this cell inspects the ZIP-code files, not the municipality files
        print('Error (zip)! No variation in the data for year '+str(year))
        break
    else:
        count=count+1

if count==len(years):     
    print('All correct!')

All correct!


### 8) Check on sum of car classes and types of buildings

In [17]:
# Column names of the car-class fractions: 'car1_<code>_frac'
_car_codes = ['MKL', 'KWA', 'VAN', 'OMK', 'UMK', 'MIC', 'CPE', 'SUV',
              'LKL', 'CAB', 'ATV', 'SMA', 'ROL', 'CHO', 'GMA']
cars = ['car1_%s_frac' % code for code in _car_codes]

# Column names of the building-type fractions: 'build_<code>_frac'
_build_codes = ['DH', 'RCB', 'M3less', 'M3', 'Cond', 'HHI', 'AB', 'SB',
                'CB', 'P', 'School', 'Sport', 'Manor', 'Public', 'RB']
builds = ['build_%s_frac' % code for code in _build_codes]

In [18]:
#Municipality-level dataset:
# On rows with customers, compare the grand total of the car-class fraction columns,
# and of the building-type fraction columns, with the number of rows (i.e. an average
# row sum of 1), within a tolerance of 0.001.
years=range(2010, 2019, 1)

count=0
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/combinedData/municipality_combinedData_%d.csv'%year)
    dfAggregated=dfAggregated[dfAggregated['custom']!=0]
    dfAggregated_car=dfAggregated[cars].sum(axis=1).sum()
    # Fixed: the building total previously summed the car columns again, so the
    # building-type fractions were never actually validated.
    dfAggregated_build=dfAggregated[builds].sum(axis=1).sum()
    if abs(dfAggregated_car-len(dfAggregated))>0.001:
        print('Error (mun)! cars do not sum to 100 in year '+str(year))
        break
    else:
        count=count+1
        
    if abs(dfAggregated_build-len(dfAggregated))>0.001:
        print('Error (mun)! build do not sum to 100 in year '+str(year))
        break
    else:
        count=count+1

# Two checks (cars, buildings) per year
if count==len(years)*2:     
    print('All correct!')

All correct!


In [19]:
#Zip-code-level dataset:
# On rows with customers, compare the grand total of the car-class fraction columns,
# and of the building-type fraction columns, with the number of rows (i.e. an average
# row sum of 1), within a tolerance of 0.01.
years=range(2008, 2020, 1)

count=0
for ind,year in enumerate(years):
    dfAggregated = pd.read_csv('../Data/aggregatedData/ZIP_%d.csv'%year)
    dfAggregated=dfAggregated[dfAggregated['custom']!=0]
    dfAggregated_car=dfAggregated[cars].sum(axis=1).sum()
    # Fixed: the building total previously summed the car columns again, so the
    # building-type fractions were never actually validated.
    dfAggregated_build=dfAggregated[builds].sum(axis=1).sum()
    if abs(dfAggregated_car-len(dfAggregated))>0.01:
        # Label corrected: this cell inspects the ZIP-code files, not the municipality files
        print('Error (zip)! cars do not sum to 100 in year '+str(year))
        break
    else:
        count=count+1
        
    if abs(dfAggregated_build-len(dfAggregated))>0.01:
        print('Error (zip)! build do not sum to 100 in year '+str(year))
        break
    else:
        count=count+1

# Two checks (cars, buildings) per year
if count==len(years)*2:     
    print('All correct!')

All correct!
