In [5]:
import os
import sys
# Add the scripts folder to the Python path
sys.path.append(os.path.abspath("../scripts"))

In [6]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Import our custom modules
from statistical_model import InsuranceModel, prepare_insurance_data, evaluate_model
from insurance_model import initialize_model
from data_preparation import prepare_data, split_features_target

In [7]:
# Load the data
try:
    data = pd.read_csv('data/cleaned_data.csv')
    print("Data loaded successfully.")
    print(data.head())
except FileNotFoundError:
    print("Error: File not found. Please ensure 'cleaned_data.csv' exists in the 'data' directory.")
except Exception as e:
    print(f"Error while loading data: {e}")
    raise



Data loaded successfully.
   UnderwrittenCoverID  PolicyID TransactionMonth  IsVATRegistered  \
0               145249     12827       2015-03-01             True   
1               145249     12827       2015-05-01             True   
2               145249     12827       2015-07-01             True   
3               145255     12827       2015-05-01             True   
4               145255     12827       2015-07-01             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...                    ExcessSelected CoverCategory  \
0  Current account  ...             Mobility 

In [8]:
# Define the target variable and features
target_variable = "TotalClaims"  # Replace if needed
feature_columns = [col for col in data.columns if col != target_variable]

# Print feature and target details
print(f"Target Variable: {target_variable}")
print(f"Number of Feature Columns: {len(feature_columns)}")
print(f"Feature Columns: {feature_columns}")


Target Variable: TotalClaims
Number of Feature Columns: 51
Feature Columns: ['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth', 'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language', 'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province', 'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode', 'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders', 'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors', 'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder', 'NumberOfVehiclesInFleet', 'SumInsured', 'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium']


In [9]:
# Identify numeric and categorical features
numeric_features = data.select_dtypes(include=np.number).columns.tolist()
numeric_features.remove('TotalClaims')  # Exclude the target variable

categorical_features = [col for col in feature_columns if col not in numeric_features]

# Prepare the data
try:
    X_processed, y, transformers, feature_names = prepare_data(
        data=data,
        numeric_features=numeric_features,
        categorical_features=categorical_features,
        target_variable=target_variable
    )
    print("Data preparation completed successfully.")
    print(f"Processed Features Shape: {X_processed.shape}")
    print(f"Target Shape: {y.shape}")
    print(f"Feature Names: {feature_names}")
except Exception as e:
    print(f"Error during data preparation: {e}")
    raise

INFO:root:Starting data preparation...
INFO:root:Filling missing values in numeric features using the mean.
INFO:root:Filling missing values in categorical features using the mode.
INFO:root:Applying transformations to numeric and categorical features.
INFO:root:Data preparation complete.


Data preparation completed successfully.
Processed Features Shape: (1000098, 897)
Target Shape: (1000098,)
Feature Names: ['num__UnderwrittenCoverID', 'num__PolicyID', 'num__PostalCode', 'num__mmcode', 'num__RegistrationYear', 'num__Cylinders', 'num__cubiccapacity', 'num__kilowatts', 'num__NumberOfDoors', 'num__CustomValueEstimate', 'num__CapitalOutstanding', 'num__SumInsured', 'num__CalculatedPremiumPerTerm', 'num__TotalPremium', 'cat__TransactionMonth_2013-10-01', 'cat__TransactionMonth_2013-11-01', 'cat__TransactionMonth_2013-12-01', 'cat__TransactionMonth_2014-01-01', 'cat__TransactionMonth_2014-02-01', 'cat__TransactionMonth_2014-03-01', 'cat__TransactionMonth_2014-04-01', 'cat__TransactionMonth_2014-05-01', 'cat__TransactionMonth_2014-06-01', 'cat__TransactionMonth_2014-07-01', 'cat__TransactionMonth_2014-08-01', 'cat__TransactionMonth_2014-09-01', 'cat__TransactionMonth_2014-10-01', 'cat__TransactionMonth_2014-11-01', 'cat__TransactionMonth_2014-12-01', 'cat__TransactionMonth_20

In [10]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y, test_size=0.2, random_state=42
    )
    print("Data splitting completed successfully.")
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Training Target Shape: {y_train.shape}")
    print(f"Testing Target Shape: {y_test.shape}")
except Exception as e:
    print(f"Error during data splitting: {e}")
    raise

# Split the data into training and testing sets
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_processed, y, test_size=0.2, random_state=42
    )
    print("Data splitting completed successfully.")
    print(f"Training Features Shape: {X_train.shape}")
    print(f"Testing Features Shape: {X_test.shape}")
    print(f"Training Target Shape: {y_train.shape}")
    print(f"Testing Target Shape: {y_test.shape}")
except Exception as e:
    print(f"Error during data splitting: {e}")
    raise


Data splitting completed successfully.
Training Features Shape: (800078, 897)
Testing Features Shape: (200020, 897)
Training Target Shape: (800078,)
Testing Target Shape: (200020,)
Data splitting completed successfully.
Training Features Shape: (800078, 897)
Testing Features Shape: (200020, 897)
Training Target Shape: (800078,)
Testing Target Shape: (200020,)


In [11]:

# Initialize the model
insurance_model = initialize_model(model_type='linear_regression')

# Train the model
insurance_model.train(X_train, y_train)
print("Model training completed successfully.")

# Make predictions
predictions = insurance_model.predict(X_test)
print("Predictions:", predictions)


Model training completed successfully.
Predictions: [-30.34851925  11.44561392  12.91551298 ...  -4.4342577   74.65527305
 -51.87070316]


In [12]:
# Get predictions from the trained model
y_pred = insurance_model.model.predict(X_test)
