In [1]:
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os, sys
# Add the 'scripts' directory to the Python path for module imports.
# The path is resolved relative to the current working directory (one level up).
scripts_dir = os.path.abspath(os.path.join('..', 'scripts'))
# Guard the append so re-running this cell does not pile up duplicate entries.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [4]:
# Project-local loader helper (found via the 'scripts' directory on sys.path)
from load_data import load_data

# Load the dataset: the zip archive path and the name of the text file
# are handed to the helper, which returns the object used as `data` below.
data = load_data(
    '../data/MachineLearningRating_v3.zip',
    filename='MachineLearningRating_v3.txt',
)

In [5]:
# Preview the first five rows of the loaded data
data.head(n=5)

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01 00:00:00,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [6]:
# Dimensions of the dataset as (rows, columns)
n_rows, n_cols = data.shape
(n_rows, n_cols)

(1000098, 52)

## Data Cleaning
Handle missing or incomplete data: Since the dataset spans several categories, some fields may have missing values.

In [7]:
# Bring in the project's data-processing helper class
from data_processing import DataProcessing

# Wrap the loaded data in a DataProcessing instance; later cells reuse it
data_processing = DataProcessing(data)

# Build the missing-value summary and show it as the cell output
missing_summary = data_processing.missing_data_summary()
missing_summary

Unnamed: 0,Missing Count,Percentage (%)
NumberOfVehiclesInFleet,1000098,100.0
CrossBorder,999400,99.930207
CustomValueEstimate,779642,77.95656
WrittenOff,641901,64.18381
Converted,641901,64.18381
Rebuilt,641901,64.18381
NewVehicle,153295,15.327998
Bank,145961,14.59467
AccountType,40232,4.022806
Gender,9536,0.953507


## Drop Columns with High Missing Data:

Columns with high missing values offer little analytical value.
Dropped Columns:

NumberOfVehiclesInFleet (100% missing)
CrossBorder (~99.93%)
CustomValueEstimate (~77.96%)
Converted, Rebuilt, WrittenOff (~64.18%)

In [8]:
# Columns with too large a share of missing values to be worth keeping
cols_to_drop = [
    'NumberOfVehiclesInFleet',
    'CrossBorder',
    'CustomValueEstimate',
    'Converted',
    'Rebuilt',
    'WrittenOff',
]

# Apply the 'high' strategy (drop) to those columns and keep the returned data
data = data_processing.handle_missing_data('high', cols_to_drop)

## Impute Moderate Missing Data:

Imputation preserves useful information, using the mode for categorical and median for numerical columns.
Imputed Columns:

NewVehicle (~15.33%)
Bank (~14.59%)
AccountType (~4.02%)

In [9]:

# Handle columns with a moderate share of missing values using the
# 'moderate' strategy (imputation, per the accompanying notes)
missing_cols = [
    'NewVehicle',
    'Bank',
    'AccountType',
]
data = data_processing.handle_missing_data('moderate', missing_cols)

## Overall Decision Summary:

High missing data: Dropped.

Moderate missing data: Imputed with mode (categorical) or median (numerical).

Low missing data: Imputed to avoid unnecessary data loss.

In [10]:
# Count rows that are exact duplicates of an earlier row
duplicate_mask = data.duplicated()
duplicate_mask.sum()

np.int64(74)

## Summarize Key Statistics

### Descriptive statistics:
Calculate descriptive statistics and examine the variability of numerical features such as TotalPremium, TotalClaims, etc.

In [None]:
# Summary statistics for the key numerical features
num_cols = ['SumInsured', 'CalculatedPremiumPerTerm', 'TotalPremium', 'TotalClaims']
numeric_summary = data[num_cols].describe()
display(numeric_summary)