In [58]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import joblib
from sklearn.decomposition import PCA

These are the import statements the notebook needs after removing most of the redundant / unneeded imports.

In [59]:
from google.colab import drive

# Mount Google Drive so the dataset stored under MyDrive can be read from this runtime.
drive.mount('/content/drive')

# Load the claims dataset into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/Allstate1B/claims_data.csv')

# Only the last expression of a cell is rendered automatically, so a bare
# `df.head()` in the middle of the cell is silently discarded. Display the
# preview explicitly and keep the column listing as the cell's final output.
display(df.head())
df.columns

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Index(['id', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9',
       ...
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'cont14', 'loss'],
      dtype='object', length=132)

We mounted Google Drive and read the claims CSV file into a DataFrame.

In [60]:
# NOTE(review): these two lists contain exactly the column names that are visible
# in the truncated `df.columns` output above (the repr elides the middle of a
# 132-column index). Confirm that leaving out the elided columns is intentional.
cat_feats = ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']
cont_feats = ['cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14']

# One-hot encode the categorical columns; drop='first' omits the first level of
# each feature and sparse_output=False returns a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_values = encoder.fit_transform(df[cat_feats])

# Wrap the array with the encoder's generated feature names and the source
# frame's index. Without these the columns are labelled 0..n-1 (so selected
# features cannot be traced back to a category level) and the rows carry a
# fresh RangeIndex, which would misalign in a later axis=1 concat if `df`
# ever had a non-default index.
df_encoded = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(cat_feats),
    index=df.index,
)

We defined the categorical and continuous feature lists, then one-hot encoded the categorical features.

In [61]:
# Absolute-correlation cut-offs applied during feature selection.
threshold_continuous = 0.45  # For continuous variables
threshold_categorical = 0.30  # For categorical variables

# Continuous columns, one-hot columns and the target placed side by side.
df_numeric = df[cont_feats]
df_combined = pd.concat([df_numeric, df_encoded], axis=1).assign(loss=df['loss'])

# Correlation between every pair of columns in the combined frame.
correlation_matrix = df_combined.corr()

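Combine the continuous and one-hot-encoded categorical features with the target `loss`, set the correlation thresholds, and compute the correlation matrix.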

In [62]:
# Absolute correlation of every column with the target.
abs_loss_corr = correlation_matrix['loss'].abs()

# Keep the continuous features whose |corr| with `loss` exceeds the continuous threshold.
selected_continuous_features = (
    abs_loss_corr.loc[cont_feats]
    .loc[lambda strength: strength > threshold_continuous]
    .index.tolist()
)

# Same filter for the one-hot columns, using the categorical threshold.
selected_categorical_features = (
    abs_loss_corr.loc[df_encoded.columns]
    .loc[lambda strength: strength > threshold_categorical]
    .index.tolist()
)

# Continuous selections first, then categorical ones.
selected_features = [*selected_continuous_features, *selected_categorical_features]

Select the features whose absolute correlation with `loss` exceeds our experimentally chosen thresholds (0.45 for continuous features, 0.30 for encoded categorical features).

In [63]:
# Restrict the combined frame to the selected features plus the target.
df_selected = df_combined[[*selected_features, 'loss']]

# Feature matrix (everything except the target) and target vector.
# NOTE(review): the model cells that follow train on a split loaded from a
# joblib file rather than on X / y directly — confirm how that split was built.
X = df_selected.drop(columns='loss')
y = df_selected.loc[:, 'loss']

Create a DataFrame with selected features and target variable, 'loss'.

In [64]:
# Load a previously saved train/test split from Drive.
# NOTE(review): joblib files are pickle-based, so only load artifacts you created
# yourself. How this split was produced is not shown in the notebook — confirm it
# was derived from the X / y built above.
split_path = '/content/drive/MyDrive/Allstate1B/Code/train_test_split.joblib'
X_train, X_test, y_train, y_test = joblib.load(split_path)

In [65]:
# Baseline regressor: 125 boosting rounds at a 0.05 learning rate, with RMSE as
# the evaluation metric; every other hyperparameter keeps its default.
model = xgb.XGBRegressor(
    n_estimators=125,
    learning_rate=0.05,
    eval_metric='rmse',
)

A very basic XGBoost model.

In [66]:
# Train on the training split, then predict the held-out split
# (`fit` returns the fitted estimator, so the calls can be chained).
y_pred = model.fit(X_train, y_train).predict(X_test)

In [67]:
# Raw test-set error of the baseline model.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# Scale the error by the variance of the test targets so it is unit-free.
variance = y_test.var()
normalized_mse = mse / variance

print(f"Mean Squared Error: {mse}")
print(f"Normalized Mean Squared Error: {normalized_mse}")

Mean Squared Error: 5833675.980940211
Normalized Mean Squared Error: 0.7149273562837057


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Search space for the randomized search.
# scipy conventions: randint(low, high) draws integers from [low, high), and
# uniform(loc, scale) draws from [loc, loc + scale] — the second argument is a
# width, not an upper bound.
param_dist = {
    'n_estimators': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.2),  # [0.01, 0.21]
    'max_depth': randint(3, 10),
    # subsample / colsample_bytree are fractions and must not exceed 1.
    # uniform(0.5, 1.0) would sample from [0.5, 1.5] and produce invalid
    # candidates, so the width is 0.5 to give the intended [0.5, 1.0].
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

# Randomized hyperparameter search: 50 sampled candidates, each scored with
# 3-fold cross-validation on negative MSE, fitted on the training split.
random_search = RandomizedSearchCV(
    xgb.XGBRegressor(),
    param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,  # fixes which candidates are sampled
    n_jobs=-1,  # run the fits in parallel on all available cores
).fit(X_train, y_train)

# Report the winning hyperparameter combination.
print(f"Best Parameters: {random_search.best_params_}")

# Score the best estimator found by the search on the held-out split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error after tuning: {mse}")

Fitting 3 folds for each of 50 candidates, totalling 150 fits
