# Data Preprocessing

In [33]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

import os, sys
# Make the sibling 'scripts' directory importable from this notebook
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
sys.path.append(scripts_dir)

# Allow up to 200 columns and 200 rows when pandas displays a DataFrame
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

In [34]:
# Load the cleaned dataset; index_col=False keeps pandas from using any
# CSV column as the index, and low_memory=False reads the file in one pass
# so each column's dtype is inferred from all of its values.
df = pd.read_csv(
    '../data/cleaned_data.csv',
    index_col=False,
    low_memory=False,
)

In [35]:
# Preview the first five rows (five is the default for head())
df.head()

Unnamed: 0.1,Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,MaritalStatus,Gender,Country,Province,PostalCode,MainCrestaZone,SubCrestaZone,ItemType,mmcode,VehicleType,RegistrationYear,make,Model,Cylinders,cubiccapacity,kilowatts,bodytype,NumberOfDoors,VehicleIntroDate,AlarmImmobiliser,TrackingDevice,CapitalOutstanding,NewVehicle,SumInsured,TermFrequency,CalculatedPremiumPerTerm,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,0.01,Monthly,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,0.01,Monthly,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,0.01,Monthly,25.0,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,119300.0,Monthly,220.1628,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,54.824561,0.0
4,4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,Not specified,Not specified,South Africa,Gauteng,1459,Rand East,Rand East,Mobility - Motor,44069150.0,Passenger Vehicle,2004,MERCEDES-BENZ,E 240,6.0,2597.0,130.0,S/D,4.0,,Yes,No,119300,More than 6 months,119300.0,Monthly,220.1628,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [36]:
# Normalise 'CapitalOutstanding' in a single chain:
#   1. parse to numbers, turning anything unparseable into NaN (errors='coerce')
#   2. replace the resulting NaNs with 0 (the column mean would be an alternative)
#   3. force a float dtype, since to_numeric may return integers
df['CapitalOutstanding'] = (
    pd.to_numeric(df['CapitalOutstanding'], errors='coerce')
    .fillna(0)
    .astype(float)
)

# Confirm the resulting dtype
print(df['CapitalOutstanding'].dtype)

float64


In [37]:
# Count the missing values in every column
missing_counts = df.isna().sum()

# Pick out the columns whose missing-value count is greater than 1.
# NOTE(review): with "> 1" a column that has exactly one missing value is
# kept -- confirm this is intended rather than "> 0".
columns_with_missing = missing_counts.loc[missing_counts.gt(1)].index
print(f'Columns with more than 1 missing value:\n{columns_with_missing}')

# Remove those columns from the working frame
df = df.drop(labels=columns_with_missing, axis='columns')

# Show which columns are left
print(f'Columns remaining after dropping:\n{df.columns}')

Columns with more than 1 missing value:
Index(['MaritalStatus', 'Gender', 'mmcode', 'VehicleType', 'make', 'Model',
       'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate'],
      dtype='object')
Columns remaining after dropping:
Index(['Unnamed: 0', 'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'Country', 'Province', 'PostalCode',
       'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'RegistrationYear',
       'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding',
       'NewVehicle', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium',
       'TotalClaims'],
      dtype='object')


In [38]:
# Columns stored with the generic 'object' dtype are treated as categorical
categorical_columns = df.select_dtypes(include='object').columns

# One record per categorical column: its name, dtype and distinct-value count
summary_records = [
    (col, df[col].dtype, df[col].nunique())
    for col in categorical_columns
]
summary_df = pd.DataFrame(
    summary_records,
    columns=['Column', 'DataType', 'NumUniqueValues'],
)

# Display the summary table
summary_df

Unnamed: 0,Column,DataType,NumUniqueValues
0,TransactionMonth,object,23
1,Citizenship,object,4
2,LegalType,object,6
3,Title,object,5
4,Language,object,1
5,Bank,object,11
6,AccountType,object,3
7,Country,object,1
8,Province,object,9
9,MainCrestaZone,object,16


Key Insights from the Summary:

Columns like Language, Country, ItemType, StatutoryClass, and StatutoryRiskType don't provide variability.

Drop these columns from the dataset.

Model (411 unique values) could add complexity; note that it was already removed above together with the other columns containing missing values.

Consider reducing cardinality by grouping or applying target encoding.

Columns like AlarmImmobiliser, TrackingDevice, and NewVehicle have only 2 unique values.

TransactionMonth should be converted to a date format.

Extract useful features like year, month, and quarter if necessary.

CapitalOutstanding should be numeric and can be used as a continuous feature.

In [39]:
# Columns identified in the summary above as providing no variability
columns_to_drop = [
    'Language',
    'Country',
    'ItemType',
    'StatutoryClass',
    'StatutoryRiskType',
]
df = df.drop(labels=columns_to_drop, axis=1)

CapitalOutstanding was already converted to a numeric format above; check the columns that remain after the drop.

In [40]:
# Display the column labels currently present in df
df.columns

Index(['Unnamed: 0', 'UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Bank',
       'AccountType', 'Province', 'PostalCode', 'MainCrestaZone',
       'SubCrestaZone', 'RegistrationYear', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'SumInsured',
       'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected',
       'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product',
       'TotalPremium', 'TotalClaims'],
      dtype='object')

In [41]:
# Re-encode the 'IsVATRegistered' flag as integers via an integer cast
df['IsVATRegistered'] = df['IsVATRegistered'].astype('int')