In [None]:
import pandas as pd

# Load the policy data and derive two columns in one chained step:
#   HasClaim - 1 when TotalClaims is strictly positive, otherwise 0
#   Margin   - TotalPremium minus TotalClaims
df = pd.read_excel('data/insurance_data.xls').assign(
    HasClaim=lambda frame: frame['TotalClaims'].gt(0).astype(int),
    Margin=lambda frame: frame['TotalPremium'] - frame['TotalClaims'],
)

df.head()


In [None]:

# Predictor columns pulled from the raw frame
features = ['Province', 'PostalCode', 'Gender', 'Age', 'VehicleType', 'TotalPremium']

# Dummy-encode the predictors (drop_first=True drops the first level of each
# encoded column), then attach both targets so that features and labels live
# in the same frame: HasClaim for classification, TotalClaims for regression.
df_encoded = pd.get_dummies(df[features], drop_first=True).assign(
    HasClaim=df['HasClaim'],
    TotalClaims=df['TotalClaims'],
)


In [None]:
from sklearn.model_selection import train_test_split

# Classification data: the label is the binary HasClaim flag; both target
# columns are removed from the feature matrix so neither can leak into X.
y_class = df_encoded['HasClaim']
X = df_encoded.drop(['HasClaim', 'TotalClaims'], axis=1)

# 80/20 hold-out with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified; if HasClaim turns out to be
# heavily imbalanced, consider passing stratify=y_class - TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_class,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fit a seeded random forest on the training split (fit returns the estimator)
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = clf.predict(X_test)

# Report per-class precision/recall/F1 followed by the confusion matrix
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)


In [None]:
# Claim-severity model: predict TotalClaims using only the rows that had a claim.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Keep only clients with a claim (HasClaim == 1)
df_with_claims = df_encoded[df_encoded['HasClaim'] == 1]

# Same feature matrix as the classifier; the target is the claim amount
X_reg = df_with_claims.drop(columns=['HasClaim', 'TotalClaims'])
y_reg = df_with_claims['TotalClaims']

# 80/20 hold-out split with a fixed seed for reproducibility
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Train Random Forest Regressor
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train_reg, y_train_reg)

# Predict on the held-out claims
y_pred_reg = reg.predict(X_test_reg)

# Evaluate.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
# raises TypeError. Taking the root manually works on every version.
print("RMSE:", mean_squared_error(y_test_reg, y_pred_reg) ** 0.5)
print("R^2 Score:", r2_score(y_test_reg, y_pred_reg))
