In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

def load_data(file_path, **read_csv_kwargs):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Location of the CSV data; passed to ``pd.read_csv`` unchanged.
    **read_csv_kwargs
        Optional keyword arguments forwarded verbatim to ``pd.read_csv``
        (for example ``low_memory=False`` or ``sep=';'``).

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    # Forwarding the keyword options lets callers tune parsing without
    # bypassing this helper.
    return pd.read_csv(file_path, **read_csv_kwargs)

def handle_missing_data(df):
    """Impute numeric gaps with column means, then drop still-incomplete rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which missing values of numeric columns are replaced
        by that column's mean, and any row that still contains a missing
        value (e.g. in a non-numeric column) has been removed. The original
        index labels of the surviving rows are kept.
    """
    # numeric_only=True restricts the mean to numeric columns; without it
    # pandas >= 2.0 raises TypeError when the frame contains text columns.
    numeric_means = df.mean(numeric_only=True)
    df = df.fillna(numeric_means)
    df = df.dropna()
    return df

def encode_categorical_data(df, categorical_columns):
    """One-hot encode the given columns and append the result to the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing ``categorical_columns``.
    categorical_columns : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``categorical_columns``, followed by the dense one-hot
        columns (first category of each column dropped), row-aligned with
        the input.
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; fall back to the old name only on releases that lack the new one.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False)

    encoded_values = encoder.fit_transform(df[categorical_columns])
    encoded_cols = pd.DataFrame(
        encoded_values,
        columns=encoder.get_feature_names_out(categorical_columns),
        # Reuse the input's index: a fresh RangeIndex would misalign rows in
        # the concat below whenever df's index is not 0..n-1 (e.g. after
        # dropna), leaving NaN-filled rows.
        index=df.index,
    )
    remaining = df.drop(columns=categorical_columns)
    return pd.concat([remaining, encoded_cols], axis=1)

def feature_engineering(df):
    """Add a ``ClaimRatio`` column (``TotalClaims`` / ``TotalPremium``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``TotalClaims`` and ``TotalPremium`` columns; it is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``ClaimRatio`` column. Rows whose
        ``TotalPremium`` is zero get NaN, because the ratio is undefined
        there.
    """
    premium = df['TotalPremium']
    # Mask zero premiums to NaN before dividing so the result is NaN rather
    # than +/-inf, which dropna/fillna would not catch.
    claim_ratio = df['TotalClaims'] / premium.where(premium != 0)
    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    return df.assign(ClaimRatio=claim_ratio)

def prepare_data(df, target_column, test_size=0.3):
    """Split a frame into train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the features and the target column.
    target_column : str
        Name of the column to predict.
    test_size : float, default 0.3
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the split uses a fixed
        ``random_state=42``.
    """
    target = df[target_column]
    features = df.drop(columns=[target_column])
    split_parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=42,
    )
    return tuple(split_parts)

# Load and prepare data
# NOTE(review): machine-specific absolute path; point it at your own copy of
# the dataset before running.
DATA_PATH = '/home/yadasa/Desktop/InsuranceDataAnalysis/data/datasets/model_data.csv'
TARGET_COLUMN = 'TotalClaims'

# Call load_data with the path only.
df = load_data(DATA_PATH)
df = handle_missing_data(df)
categorical_columns = ['Gender', 'Province']
df = encode_categorical_data(df, categorical_columns)
df = feature_engineering(df)

# ClaimRatio is computed from TotalClaims, which is the prediction target.
# Keeping it as a feature leaks the target into the inputs, so it stays in
# `df` for inspection but is excluded from the modelling frame.
model_df = df.drop(columns=['ClaimRatio'])
X_train, X_test, y_train, y_test = prepare_data(model_df, target_column=TARGET_COLUMN)


# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import time

def train_linear_regression(X_train, y_train):
    """Fit a ``LinearRegression`` model and measure how long the fit takes.

    Parameters
    ----------
    X_train : array-like
        Training features, passed to ``model.fit``.
    y_train : array-like
        Training targets, passed to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, training_time)`` where ``training_time`` is the elapsed
        time in seconds.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() follows the wall clock and can jump
    # if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    model = LinearRegression()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time
    return model, training_time

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True target values for ``X_test``.

    Returns
    -------
    tuple
        ``(mse, r2)``: mean squared error and R-squared of the predictions.
    """
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Fit the linear baseline, then score it on the held-out split
lr_model, lr_time = train_linear_regression(X_train=X_train, y_train=y_train)
lr_mse, lr_r2 = evaluate_model(model=lr_model, X_test=X_test, y_test=y_test)


In [None]:
# Report the linear-regression metrics, one "label value" line per metric
print("Linear Regression Model Evaluation:")
for metric_label, metric_value in (
    ("Mean Squared Error (MSE):", lr_mse),
    ("R-squared (R2) Score:", lr_r2),
    ("Training Time:", lr_time),
):
    print(metric_label, metric_value)


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def train_random_forest(X_train, y_train):
    """Fit a 100-tree ``RandomForestRegressor`` and time the fit.

    Parameters
    ----------
    X_train : array-like
        Training features, passed to ``model.fit``.
    y_train : array-like
        Training targets, passed to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, training_time)`` where ``training_time`` is the elapsed
        time in seconds. The forest uses ``random_state=42``.
    """
    # perf_counter() is a monotonic clock meant for measuring durations,
    # unlike the adjustable wall clock behind time.time().
    start_time = time.perf_counter()
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time
    return model, training_time

def get_feature_importance(model, feature_names):
    """Tabulate a model's feature importances, largest first.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence
        Names matching the entries of ``feature_importances_`` by position.

    Returns
    -------
    pandas.DataFrame
        Columns ``Feature`` and ``Importance``, sorted by ``Importance`` in
        descending order.
    """
    importance_table = pd.DataFrame({
        'Feature': feature_names,
        'Importance': model.feature_importances_,
    })
    return importance_table.sort_values(by='Importance', ascending=False)

# Fit the baseline forest, score it on the held-out split, and rank features
rf_model, rf_time = train_random_forest(X_train=X_train, y_train=y_train)
rf_mse, rf_r2 = evaluate_model(model=rf_model, X_test=X_test, y_test=y_test)
rf_importance = get_feature_importance(model=rf_model, feature_names=X_train.columns)


In [None]:
# Report the random-forest metrics. The header previously read
# "Linear Regression", which mislabelled these numbers.
print("Random Forest Model Evaluation:")
print("Mean Squared Error (MSE):", rf_mse)
print("R-squared (R2) Score:", rf_r2)
print("Training Time:", rf_time)

In [None]:
# Show the already-fitted forest. Calling train_random_forest() with no
# arguments raises TypeError (X_train and y_train are required), and
# re-training here would only discard the result.
rf_model

# XGBoost

In [None]:
from xgboost import XGBRegressor

def train_xgboost(X_train, y_train):
    """Fit a 100-round ``XGBRegressor`` and time the fit.

    Parameters
    ----------
    X_train : array-like
        Training features, passed to ``model.fit``.
    y_train : array-like
        Training targets, passed to ``model.fit``.

    Returns
    -------
    tuple
        ``(model, training_time)`` where ``training_time`` is the elapsed
        time in seconds. The model uses ``random_state=42``.
    """
    # perf_counter() is a monotonic clock meant for measuring durations,
    # unlike the adjustable wall clock behind time.time().
    start_time = time.perf_counter()
    model = XGBRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time
    return model, training_time

# Fit the baseline XGBoost model, score it on the held-out split, and rank features
xgb_model, xgb_time = train_xgboost(X_train=X_train, y_train=y_train)
xgb_mse, xgb_r2 = evaluate_model(model=xgb_model, X_test=X_test, y_test=y_test)
xgb_importance = get_feature_importance(model=xgb_model, feature_names=X_train.columns)


In [None]:
# Report the XGBoost metrics. The header previously read
# "Linear Regression", which mislabelled these numbers.
print("XGBoost Model Evaluation:")
print("Mean Squared Error (MSE):", xgb_mse)
print("R-squared (R2) Score:", xgb_r2)
print("Training Time:", xgb_time)

# Hyperparameter Tuning with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Random Forest: exhaustive grid search with 3-fold CV, ranked by negative MSE
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
rf_grid_search.fit(X_train, y_train)

# Keep the winning estimator and its settings, then score it on the test split
best_rf_model = rf_grid_search.best_estimator_
best_rf_params = rf_grid_search.best_params_
best_rf_mse, best_rf_r2 = evaluate_model(model=best_rf_model, X_test=X_test, y_test=y_test)
best_rf_importance = get_feature_importance(model=best_rf_model, feature_names=X_train.columns)

# XGBoost: exhaustive grid search with 3-fold CV, ranked by negative MSE
xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

xgb_grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=xgb_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
)
xgb_grid_search.fit(X_train, y_train)

# Keep the winning estimator and its settings, then score it on the test split
best_xgb_model = xgb_grid_search.best_estimator_
best_xgb_params = xgb_grid_search.best_params_
best_xgb_mse, best_xgb_r2 = evaluate_model(model=best_xgb_model, X_test=X_test, y_test=y_test)
best_xgb_importance = get_feature_importance(model=best_xgb_model, feature_names=X_train.columns)



# Feature Importance

In [None]:
import matplotlib.pyplot as plt

def plot_feature_importance(feature_importance, title):
    """Show a horizontal bar chart of feature importances.

    Parameters
    ----------
    feature_importance : pandas.DataFrame
        Table indexed by the keys ``'Feature'`` (bar labels) and
        ``'Importance'`` (bar lengths).
    title : str
        Title drawn above the chart.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.set_title(title)
    ax.barh(
        feature_importance['Feature'],
        feature_importance['Importance'],
        color='b',
        align='center',
    )
    ax.set_xlabel('Feature Importance')
    ax.set_ylabel('Feature')
    plt.show()

# One chart per baseline model
plot_feature_importance(feature_importance=rf_importance, title="Random Forest Feature Importance")
plot_feature_importance(feature_importance=xgb_importance, title="XGBoost Feature Importance")


# Model Performance

In [None]:
pip install xgboost

In [None]:
def plot_model_performance(models, mse_scores, r2_scores, training_times):
    """Show three side-by-side bar charts comparing models.

    Parameters
    ----------
    models : sequence of str
        Bar labels, shared by all three panels.
    mse_scores : sequence of float
        Heights for the 'Mean Squared Error' panel.
    r2_scores : sequence of float
        Heights for the 'R^2 Score' panel.
    training_times : sequence of float
        Heights for the 'Training Time (seconds)' panel.
    """
    _, axes = plt.subplots(1, 3, figsize=(18, 5))

    # (bar heights, bar colour, panel title, y-axis label), left to right
    panels = (
        (mse_scores, 'blue', 'Mean Squared Error', 'MSE'),
        (r2_scores, 'green', 'R^2 Score', 'R^2'),
        (training_times, 'red', 'Training Time (seconds)', 'Time'),
    )
    for ax, (heights, bar_colour, panel_title, y_label) in zip(axes, panels):
        ax.bar(models, heights, color=bar_colour)
        ax.set_title(panel_title)
        ax.set_xlabel('Models')
        ax.set_ylabel(y_label)

    plt.show()

# Compare the linear baseline with the tuned forest and tuned XGBoost model.
models = ["Linear Regression", "Random Forest", "XGBoost"]
mse_scores = [lr_mse, best_rf_mse, best_xgb_mse]
r2_scores = [lr_r2, best_rf_r2, best_xgb_r2]
# The MSE/R^2 values above come from the grid-search winners, so the timing
# must describe those same models: refit_time_ is the time GridSearchCV spent
# refitting the best estimator on the full training set. Using rf_time and
# xgb_time here would pair tuned scores with the untuned models' fit times.
training_times = [lr_time, rf_grid_search.refit_time_, xgb_grid_search.refit_time_]

plot_model_performance(models, mse_scores, r2_scores, training_times)
