In [5]:
# Switch the working directory to the project folder so the relative paths used
# below ('claims_clean.csv', the `scripts` package) resolve.
# NOTE(review): hard-coded absolute path (looks like a Colab Google Drive mount);
# the notebook will not run elsewhere without editing this line.
%cd /content/drive/MyDrive/Tenx program/week-3/

/content/drive/MyDrive/Tenx program/week-3


In [6]:
# importing lib
import pandas as pd
from sklearn.model_selection import train_test_split
from scripts.data_loader import load_clean_data
from scripts.statistical_modeling import (
    preprocess_features, split_data,
    train_regression_models, evaluate_regression,
    train_classification_models, evaluate_classification,
    compute_shap_importance, compute_risk_based_premium
)

In [7]:
# Load the cleaned claims table through the project helper and preview the first
# rows (path is relative to the working directory set above).
df = load_clean_data('claims_clean.csv')
df.head()

Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,54.824561,0.0
4,145255,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


In [8]:
# Prepare Regression Dataset (only policies with claim)
# Keep only rows with a positive TotalClaims and renumber the index from 0.
df_reg = df[df.TotalClaims > 0].reset_index(drop=True)
# No classification target is requested, so the third return value is discarded.
X_reg, y_reg, _, pre_reg = preprocess_features(df_reg, target_reg="TotalClaims")

# split_data comes from scripts.statistical_modeling; its split ratio and seed are
# not visible in this notebook.
Xr_train, Xr_test, yr_train, yr_test = split_data(X_reg, y_reg)

print("Regression dataset prepared successfully.")

Regression dataset prepared successfully.


In [None]:
# Train & Evaluate Regression Models
# Fit the project's regression pipelines on the training split, score them on the
# held-out split and show the resulting table.
reg_pipes = train_regression_models(Xr_train, yr_train, pre_reg)
reg_results = evaluate_regression(reg_pipes, Xr_test, yr_test)
display(reg_results)


In [None]:
# SHAP Feature Importance for Best Reg Model
# Pick the pipeline whose row in reg_results has the lowest test RMSE.
best_reg = reg_results["RMSE"].idxmin()
best_pipe = reg_pipes[best_reg]

# Explain at most 500 untransformed test rows. The regression test split only holds
# policies with a claim and can be smaller than 500 rows, in which case
# DataFrame.sample(500) raises ValueError, so the sample size is capped.
X_sample = Xr_test.sample(min(500, len(Xr_test)), random_state=42)

# Compute SHAP importance and show the ten highest-ranked rows:
shap_importance = compute_shap_importance(best_pipe, X_sample)
display(shap_importance.head(10))

In [None]:
# Prepare Classification Dataset (all policies)
# The regression target (second return value) is not needed here and is discarded.
X_clf, _, y_clf, pre_clf = preprocess_features(df, target_reg="TotalClaims", target_clf="has_claim")
# split_data is the project helper; its split ratio and seed are not visible here.
Xc_train, Xc_test, yc_train, yc_test = split_data(X_clf, y_clf)


In [None]:
# Train & Evaluate Classification Models
# Fit the project's classification pipelines on the training split, score them on
# the held-out split and show the resulting table.
clf_pipes = train_classification_models(Xc_train, yc_train, pre_clf)
clf_results = evaluate_classification(clf_pipes, Xc_test, yc_test)
display(clf_results)


In [None]:
# SHAP Feature Importance for Best Classifier
# The "best" classifier is the one with the highest F1 in clf_results.
best_clf = clf_results["F1"].idxmax()
# Explain a fixed-seed sample of 500 test rows (requires Xc_test to hold >= 500 rows).
shap_clf = compute_shap_importance(clf_pipes[best_clf], Xc_test.sample(500, random_state=1))
display(shap_clf.head(10))


In [None]:
# Compute Risk-Based Premium & Compare to CalculatedPremiumPerTerm
# Combines the selected classifier and regressor through the project helper.
# NOTE(review): X_clf is the full feature table, so rows used for training are
# scored as well.
premium_est = compute_risk_based_premium(clf_pipes[best_clf], reg_pipes[best_reg], X_clf)
# NOTE(review): assumes premium_est lines up row-for-row with df - TODO confirm the
# helper's return type/index.
df_compare = pd.DataFrame({
    "Calculated": df.CalculatedPremiumPerTerm,
    "ModelBased": premium_est
})
# Positive diff = the model-based estimate is above the premium already calculated.
df_compare["diff"] = df_compare.ModelBased - df_compare.Calculated
df_compare.describe()


In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold # Import VarianceThreshold

print("Attempting to import from scripts...")
# Import the data loading function (assuming it's correctly located)
from scripts.data_loader import load_clean_data


# Define the corrected preprocessing function within this cell
def preprocess_features_corrected(df: pd.DataFrame, target_reg: str, target_clf: str=None, exclude_cols=None):
    """
    Split a frame into features and targets and build an (unfitted) preprocessor.

    Args:
        df: Input table; it is copied, the caller's frame is never modified.
        target_reg: Name of the numeric regression target column.
        target_clf: Optional name for a binary target. When given, the column is
            derived as ``(df[target_reg] > 0).astype(int)`` and stored under this
            name (previously the name was ignored and "has_claim"/"TotalClaims"
            were hard-coded).
        exclude_cols: Optional iterable of additional columns to leave out of X,
            e.g. identifiers or columns that leak the target. Unknown names raise
            KeyError (pandas ``drop`` default).

    Returns:
        tuple: ``(X, y_reg, y_clf, preprocessor)`` where ``y_clf`` is None when
        ``target_clf`` is not given and ``preprocessor`` is an unfitted
        ColumnTransformer.
    """
    df = df.copy()

    # Targets
    y_reg = df[target_reg]
    drop_cols = [target_reg]
    if target_clf:
        df[target_clf] = (df[target_reg] > 0).astype(int)
        y_clf = df[target_clf]
        drop_cols.append(target_clf)
    else:
        y_clf = None

    # Optional extra exclusions (kept unique so drop() never sees duplicates)
    if exclude_cols:
        drop_cols.extend(col for col in exclude_cols if col not in drop_cols)

    X = df.drop(columns=drop_cols)

    num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object","bool","category"]).columns.tolist()

    # Numeric branch: median imputation, drop zero-variance columns
    # (VarianceThreshold default), then standardise.
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("variance_threshold", VarianceThreshold()),
        ("scaler",  StandardScaler())
    ])
    # Categorical branch: missing values become the literal "Unknown"; categories
    # unseen during fit are encoded as all-zero rows.
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("onehot",  OneHotEncoder(handle_unknown="ignore"))
    ])
    # NOTE(review): columns of any other dtype (e.g. datetime64, int32) are passed
    # through untouched and reach the model unscaled/unencoded - confirm intended.
    preprocessor = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ], remainder='passthrough')

    return X, y_reg, y_clf, preprocessor


# --- Load df and Create X_clf within the same cell ---

# loading clean data
df = load_clean_data('claims_clean.csv')

# Build the classification inputs from the full table: X_clf (features), y_clf
# (binary claim indicator) and pre_clf (preprocessor). Only the regression target
# (second return value) is discarded.
X_clf, _, y_clf, pre_clf = preprocess_features_corrected(df, target_reg="TotalClaims", target_clf="has_claim")


print("df loaded and X_clf created successfully. Columns of X_clf:")
display(X_clf.columns) # Use display for better formatting

Attempting to import from scripts...
df loaded and X_clf created successfully. Columns of X_clf:


Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'WrittenOff',
       'Rebuilt', 'Converted', 'SumInsured', 'TermFrequency',
       'CalculatedPremiumPerTerm', 'ExcessSelected', 'CoverCategory',
       'CoverType', 'CoverGroup', 'Section', 'Product', 'StatutoryClass',
       'StatutoryRiskType', 'TotalPremium'],
      dtype='object')

In [9]:
# Diagnostic only: list the working directory and ./scripts to confirm that the
# data file and the modules imported from `scripts` are present.
!ls .
!ls ./scripts

 claims_clean.csv			  MachineLearningRating_v3.txt
'Exploratory Data Analysis (EDA).ipynb'   predictive_modeling.ipynb
 hypothesis_testing.ipynb		  scripts
 intreim_report.pdf
data_cleaning.py  eda_plots.py		 __init__.py  statistical_modeling.py
data_loader.py	  hypothesis_testing.py  __pycache__  utils.py


In [11]:
# Prepare Regression Dataset (only policies with claim)
# Relies on df and preprocess_features_corrected from an earlier cell;
# split_data_corrected is defined further down in this same cell.

# Keep only rows with a positive TotalClaims and renumber the index from 0.
df_reg = df[df.TotalClaims > 0].reset_index(drop=True)
X_reg, y_reg, _, pre_reg = preprocess_features_corrected(df_reg, target_reg="TotalClaims")

# Notebook-local splitter, used below and by the classification split cell.
def split_data_corrected(X, y, test_size=0.3, random_state=42, stratify=None):
    """
    Split features and target into train and test parts.

    Args:
        X: Feature table.
        y: Target aligned with X.
        test_size: Fraction (or absolute number) of rows held out for testing.
        random_state: Seed that makes the shuffle reproducible.
        stratify: Optional array-like of class labels; when given, both parts keep
            the same class proportions (useful for a rare positive class).
            The default None keeps the previous, unstratified behaviour.

    Returns:
        list: ``[X_train, X_test, y_train, y_test]`` as returned by
        sklearn's train_test_split.
    """
    # Function-scope import keeps the helper usable even if the import cell was skipped.
    from sklearn.model_selection import train_test_split
    return train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify
    )

# Split with the helper's defaults (30% test, seed 42).
Xr_train, Xr_test, yr_train, yr_test = split_data_corrected(X_reg, y_reg)

print("Regression dataset prepared successfully.")

Regression dataset prepared successfully.


In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler # Removed QuantileTransformer import as it's not used in this version
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
# Removed TransformedTargetRegressor import as it's not used in this version


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, r2_score


def train_regression_models_final(X_train, y_train, preprocessor):
    """
    Fit one preprocessing+model pipeline per regression algorithm.

    Args:
        X_train (pd.DataFrame): Training features (untransformed).
        y_train (pd.Series): Training target, used on its original scale
            (no log/quantile transform is applied).
        preprocessor (ColumnTransformer): Preprocessor template. It is cloned for
            every pipeline, so the object passed in is left unfitted and the
            pipelines do not share fitted state.

    Returns:
        dict: Fitted pipelines keyed by model name; each has the steps
        "prep" (preprocessor) and "model" (estimator).
    """
    # Function-scope import so the cell's import block does not need to change.
    from sklearn.base import clone

    models = {
        "LinearRegression": LinearRegression(),
        "RandomForest":     RandomForestRegressor(n_estimators=100, random_state=42),
        # use_label_encoder was dropped: XGBRegressor does not use it and only
        # emitted a "Parameters ... are not used" warning.
        "XGBoost":          XGBRegressor(
                                n_estimators=100,
                                random_state=42,
                                eval_metric="rmse"
                            )
    }
    pipelines = {}
    for name, model in models.items():
        # A fresh clone per pipeline: refitting one pipeline (or the template)
        # later can no longer silently change the others.
        pipe = Pipeline([("prep", clone(preprocessor)), ('model', model)])
        pipe.fit(X_train, y_train)
        pipelines[name] = pipe
    return pipelines

def evaluate_regression_final(models: dict, X_test, y_test):
    """
    Score every fitted regression pipeline on the held-out data.

    Args:
        models (dict): Fitted pipelines keyed by model name.
        X_test (pd.DataFrame): Untransformed test features; each pipeline applies
            its own preprocessing inside ``predict``.
        y_test (pd.Series): True target values for the test rows.

    Returns:
        pd.DataFrame: One row per model with the columns "RMSE" and "R2".
    """
    def _score(fitted_pipe):
        # The metric functions raise ValueError if targets or predictions hold NaNs.
        predicted = fitted_pipe.predict(X_test)
        return {
            "RMSE": np.sqrt(mean_squared_error(y_test, predicted)),
            "R2":   r2_score(y_test, predicted)
        }

    scores = {model_name: _score(fitted_pipe) for model_name, fitted_pipe in models.items()}
    # Built model-per-column, then transposed to get one row per model.
    return pd.DataFrame(scores).T

# Train & Evaluate Regression Models using the final corrected functions
# Requires Xr_train, yr_train, Xr_test, yr_test and pre_reg from preceding cells.
reg_pipes = train_regression_models_final(Xr_train, yr_train, pre_reg)
reg_results = evaluate_regression_final(reg_pipes, Xr_test, yr_test)
display(reg_results)

# Name of the model with the lowest test RMSE; used by the later cells.
best_reg = reg_results["RMSE"].idxmin()

Parameters: { "use_label_encoder" } are not used.



Unnamed: 0,RMSE,R2
LinearRegression,18065.36755,0.359393
RandomForest,16761.848971,0.448505
XGBoost,18158.804066,0.35275


In [13]:
# SHAP Feature Importance for Best Reg Model
# This cell requires reg_pipes, best_reg, and Xr_test
import shap
import numpy as np
from sklearn.ensemble import RandomForestRegressor # Import the model class
import pandas as pd # Import pandas

best_model_pipe = reg_pipes[best_reg]

# Split the fitted pipeline into its estimator and its preprocessing step
best_model = best_model_pipe.named_steps['model']
preprocessor_step = best_model_pipe.named_steps['prep']

# Explain at most 500 test rows; the cap avoids a ValueError from sample() when
# the claims-only test split holds fewer rows than that.
X_sample = Xr_test.sample(min(500, len(Xr_test)), random_state=1)
X_sample_preprocessed = preprocessor_step.transform(X_sample)

# SHAP needs a dense float matrix; the one-hot branch makes the output sparse
if hasattr(X_sample_preprocessed, 'toarray'):
    X_sample_preprocessed = X_sample_preprocessed.toarray()
X_sample_preprocessed = np.asarray(X_sample_preprocessed, dtype=float)

# best_reg is chosen by RMSE and may be the linear model, which TreeExplainer
# rejects. Fitted linear models expose coef_; tree ensembles do not.
if hasattr(best_model, 'coef_'):
    shap_explainer = shap.LinearExplainer(best_model, X_sample_preprocessed)
else:
    shap_explainer = shap.TreeExplainer(best_model)
shap_values = shap_explainer.shap_values(X_sample_preprocessed)

# Some explainers return a list with one array per output; regression has one output
if isinstance(shap_values, list):
    shap_values = shap_values[0]
shap_values = np.asarray(shap_values)

# Feature names after preprocessing. Older scikit-learn versions cannot derive
# them through the nested pipelines; fall back to positional names, which always
# match the number of SHAP columns (a hand-built list would not once
# VarianceThreshold has dropped columns).
try:
    feature_names_out = preprocessor_step.get_feature_names_out()
except AttributeError:
    feature_names_out = np.array([f"feature_{i}" for i in range(shap_values.shape[1])])


# Global importance = mean absolute SHAP value per feature
shap_importance = pd.DataFrame({
    'Feature': feature_names_out,
    'SHAP_Value': np.abs(shap_values).mean(0)
})


shap_reg = shap_importance.sort_values(by='SHAP_Value', ascending=False)

display(shap_reg.head(10))

Unnamed: 0,Feature,SHAP_Value
12,num__CalculatedPremiumPerTerm,7897.502729
11,num__SumInsured,7702.306672
2,num__PostalCode,679.202078
1,num__PolicyID,581.555958
0,num__UnderwrittenCoverID,500.0749
3,num__mmcode,419.16092
6,num__cubiccapacity,351.152794
10,num__CapitalOutstanding,325.547076
385,cat__ExcessSelected_No excess,289.691388
9,num__CustomValueEstimate,275.882023


In [14]:
# Prepare Classification Dataset (all policies)
# Requires X_clf and y_clf (built with preprocess_features_corrected) and
# split_data_corrected, all defined in preceding cells.
# NOTE(review): the split is not stratified; with a rare positive class the
# train/test claim rates can differ - confirm this is acceptable.

Xc_train, Xc_test, yc_train, yc_test = split_data_corrected(X_clf, y_clf)

print("Classification dataset prepared successfully.")

Classification dataset prepared successfully.


In [15]:
# Train & Evaluate Classification Models
# This cell requires Xc_train, yc_train, pre_clf, Xc_test, yc_test
from scripts.statistical_modeling import (
    train_classification_models, evaluate_classification
)
# Fit the project's classifiers, then score them on the held-out split.
clf_pipes = train_classification_models(Xc_train, yc_train, pre_clf)
clf_results = evaluate_classification(clf_pipes, Xc_test, yc_test)
display(clf_results)

# Name of the classifier with the highest F1; used by the later cells.
# NOTE(review): in the saved output every model has precision below 0.1, so the
# F1-best model is still weak - interpret downstream results with care.
best_clf = clf_results["F1"].idxmax()

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1,ROC-AUC,PR-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LogisticRegression,0.83852,0.015579,0.904255,0.030629,0.92198,0.02446
RandomForest,0.996999,0.078125,0.00591,0.010989,0.72222,0.014499
XGBoost,0.900758,0.021609,0.771868,0.042041,0.929796,0.030442


In [16]:
# SHAP Feature Importance for Best Classifier
# This cell requires clf_pipes, best_clf, Xc_test and pandas imported as pd
import shap
import numpy as np

best_clf_pipe = clf_pipes[best_clf]

# Split the fitted pipeline into its estimator and its preprocessing step
best_model = best_clf_pipe.named_steps['model']
preprocessor_step_clf = best_clf_pipe.named_steps['prep']

# Explain at most 500 test rows (capped so sample() cannot fail on a small split)
X_sample_clf = Xc_test.sample(min(500, len(Xc_test)), random_state=1)
X_sample_clf_preprocessed = preprocessor_step_clf.transform(X_sample_clf)

# SHAP needs a dense float matrix; the one-hot branch makes the output sparse
if hasattr(X_sample_clf_preprocessed, 'toarray'):
    X_sample_clf_preprocessed = X_sample_clf_preprocessed.toarray()
X_sample_clf_preprocessed = np.asarray(X_sample_clf_preprocessed, dtype=float)

# best_clf is chosen by F1 and may be the logistic model, which TreeExplainer
# rejects. Fitted linear models expose coef_; tree ensembles do not.
if hasattr(best_model, 'coef_'):
    shap_explainer_clf = shap.LinearExplainer(best_model, X_sample_clf_preprocessed)
else:
    shap_explainer_clf = shap.TreeExplainer(best_model)
shap_values_clf = shap_explainer_clf.shap_values(X_sample_clf_preprocessed)

# Reduce to the SHAP values of the positive class (index 1). Depending on the
# model and SHAP version the raw result is a list with one array per class, a
# 3-D array (rows, features, classes), or already a 2-D array.
if isinstance(shap_values_clf, list):
    shap_values_clf = shap_values_clf[1]
shap_values_clf = np.asarray(shap_values_clf)
if shap_values_clf.ndim == 3:
    shap_values_clf = shap_values_clf[:, :, 1]

# Feature names after preprocessing. Older scikit-learn versions cannot derive
# them through the nested pipelines; fall back to positional names, which always
# match the number of SHAP columns.
try:
    feature_names_out_clf = preprocessor_step_clf.get_feature_names_out()
except AttributeError:
    feature_names_out_clf = np.array([f"feature_{i}" for i in range(shap_values_clf.shape[1])])


# Global importance = mean absolute SHAP value per feature
shap_importance_clf = pd.DataFrame({
    'Feature': feature_names_out_clf,
    'SHAP_Value': np.abs(shap_values_clf).mean(0)
})


shap_clf = shap_importance_clf.sort_values(by='SHAP_Value', ascending=False)

display(shap_clf.head(10))

Unnamed: 0,Feature,SHAP_Value
13,num__TotalPremium,4.140072
11,num__SumInsured,2.225782
12,num__CalculatedPremiumPerTerm,2.029695
0,num__UnderwrittenCoverID,1.782732
437,cat__Model_LT35 2.8TDi S/ROOF F/C P/V,1.734831
59,cat__Bank_Investec Bank,1.378569
366,cat__Model_DUCATO MH2 C8 F/C P/V,1.214564
581,cat__Model_VERSO 180 SX,1.198946
177,cat__make_JMC,1.076835
821,cat__CoverType_Basic Excess Waiver,1.062326


In [17]:
# Compute Risk-Based Premium & Compare to CalculatedPremiumPerTerm
# This cell requires clf_pipes, best_clf, reg_pipes, best_reg, X_clf, and df
from scripts.statistical_modeling import compute_risk_based_premium # Import the function

# NOTE(review): X_clf is the full feature table, so rows the models were trained on
# are scored too; CalculatedPremiumPerTerm is also one of the columns in X_clf, i.e.
# the benchmark is itself a model input.
premium_est = compute_risk_based_premium(clf_pipes[best_clf], reg_pipes[best_reg], X_clf)
# NOTE(review): assumes premium_est lines up row-for-row with df - TODO confirm the
# helper's return type/index.
df_compare = pd.DataFrame({
    "Calculated": df.CalculatedPremiumPerTerm,
    "ModelBased": premium_est
})
# Positive diff = the model-based estimate is above the premium already calculated.
# NOTE(review): in the saved output the ModelBased mean is more than ten times the
# Calculated mean - check the classifier's probability calibration before use.
df_compare["diff"] = df_compare.ModelBased - df_compare.Calculated
display(df_compare.describe()) # Use display for better formatting

Unnamed: 0,Calculated,ModelBased,diff
count,999546.0,999546.0,999546.0
mean,116.161292,1872.331489,1756.170197
std,220.575965,6244.722195,6105.947424
min,0.0,0.000613,-2798.415342
25%,3.2257,0.577808,-5.827757
50%,8.4369,2.121605,-1.83105
75%,90.0,327.328037,172.305748
max,3051.8211,66903.102982,66457.323582
