In [1]:
from google.colab import drive

# Mount Google Drive so the dataset under MyDrive is reachable from this runtime
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import joblib
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb

In [3]:
# Location of the raw claims dataset on the mounted Drive
CLAIMS_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/claims_data.csv'
df = pd.read_csv(CLAIMS_CSV_PATH)

The function `encode_and_cluster` builds a hierarchy (a coarser grouping) for the categorical columns that have 4 or more distinct values.

In [4]:
def encode_and_cluster(df, column_name, n_clusters=5):
    """Group a categorical column into clusters of its label-encoded values.

    The column is label-encoded to integer codes and KMeans is fitted on
    those one-dimensional codes. The cluster id assigned to each row is
    written to ``df`` as ``<column_name>_grouped`` (``df`` is modified in
    place) and is also returned as a single-column DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column_name``. Gains (or has overwritten) the
        ``<column_name>_grouped`` column.
    column_name : str
        Name of the categorical column to group.
    n_clusters : int, default 5
        Requested number of groups. It is capped at the number of distinct
        values in the column, since more clusters than distinct points
        cannot be formed.

    Returns
    -------
    pandas.DataFrame
        One column, ``<column_name>_grouped``, indexed like ``df``.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    grouped_name = f'{column_name}_grouped'

    # Encode into a local array rather than a scratch column on ``df``:
    # writing and then dropping ``<column_name>_encoded`` would destroy a
    # column of that name the caller may already have created.
    le = LabelEncoder()
    codes = le.fit_transform(df[column_name])

    # Never ask KMeans for more clusters than there are distinct codes.
    effective_clusters = max(1, min(n_clusters, len(le.classes_)))

    # NOTE(review): clustering operates on the encoder's integer codes, so
    # groups follow the code ordering rather than any relation to the target.
    kmeans = KMeans(n_clusters=effective_clusters, random_state=42)
    df[grouped_name] = kmeans.fit_predict(codes.reshape(-1, 1))

    return df[[grouped_name]]  # Return only the new grouped column


In [5]:
# def encode_and_cluster(df, column_name, n_clusters=5):
#     """Encodes and clusters the categorical column, returning the grouped version."""
#     le = LabelEncoder()
#     df[f'{column_name}_encoded'] = le.fit_transform(df[column_name])

#      # Use KMeans clustering to group the encoded values
#     kmeans = KMeans(n_clusters=n_clusters, random_state=42)
#     df[f'{column_name}_cluster'] = kmeans.fit_predict(df[[f'{column_name}_encoded']])

#     # Map cluster numbers to hierarchical group labels
#     df[f'{column_name}_grouped'] = df[f'{column_name}_cluster'].map(lambda x: f'Group{x}')

#     # Clean up temporary columns
#     df.drop([f'{column_name}_encoded', f'{column_name}_cluster'], axis=1, inplace=True)

#     return df

These are the categorical columns that have a lot of variance but also many distinct values. Because of this, these categorical columns have to be split into a hierarchy.

In [6]:
# Categorical columns that are passed through encode_and_cluster
columns_to_cluster = [
    'cat89',
    'cat101',
    'cat102',
    'cat105',
    'cat107',
    'cat113',
    'cat115',
    'cat116',
]


In [7]:
# Container for the grouped (clustered) columns; initialised as an empty
# DataFrame with no rows and no columns.
grouped_df = pd.DataFrame()

In [8]:
# Generate the grouped column for every requested column that exists in df,
# then concatenate once. Rebuilding grouped_df from scratch keeps this cell
# idempotent: re-running it no longer appends duplicate columns.
grouped_parts = []
for column in columns_to_cluster:
    if column in df.columns:
        grouped_parts.append(encode_and_cluster(df, column))
    else:
        # Make the skip visible instead of silently dropping the feature
        print(f"Skipping '{column}': column not found in df")

# pd.concat raises on an empty list, so fall back to an empty frame
grouped_df = pd.concat(grouped_parts, axis=1) if grouped_parts else pd.DataFrame()

In [9]:
# Step 4: Add the top continuous and categorical features that did not need
# to be grouped into hierarchies
cat_features = [
    'cat7',
    'cat57',
]
continuous_features = [
    'cont2',
    'cont12',
    'cont14',
    'cont11',
    'cont9',
]

In [10]:
# Encode each feature in cat_features (cat7, cat57) as a numerical column.
# A separate LabelEncoder is fitted per feature and kept in label_encoders:
# refitting one shared encoder would discard the mapping of the earlier
# column, leaving no way to transform new data or invert its codes.
label_encoders = {}
for feature in cat_features:
    le = LabelEncoder()
    df[f'{feature}_encoded'] = le.fit_transform(df[feature])
    label_encoders[feature] = le

# Derived from cat_features so the two lists cannot drift apart
encoded_cat_features = [f'{feature}_encoded' for feature in cat_features]

In [11]:
# # Ensure the selected features exist in the DataFrame
# all_selected_features = continuous_features + cat_features
# for feature in all_selected_features:
#     if feature not in df.columns:
#         raise ValueError(f"Feature '{feature}' not found in the dataset.")

In [12]:
# for column in columns_to_cluster:
#     if column in df.columns:
#         df = encode_and_cluster(df, column)


In [13]:
# Step 5: Place the selected continuous and encoded categorical columns
# side by side with the grouped columns
selected_columns = continuous_features + encoded_cat_features
df_final = pd.concat([df[selected_columns], grouped_df], axis=1)

In [14]:
# Inspect the assembled feature frame: column names first, then dtypes
final_columns = df_final.columns
print("Columns in the final DataFrame:", final_columns)
final_dtypes = df_final.dtypes
print(final_dtypes)

Columns in the final DataFrame: Index(['cont2', 'cont12', 'cont14', 'cont11', 'cont9', 'cat7_encoded',
       'cat57_encoded', 'cat89_grouped', 'cat101_grouped', 'cat102_grouped',
       'cat105_grouped', 'cat107_grouped', 'cat113_grouped', 'cat115_grouped',
       'cat116_grouped'],
      dtype='object')
cont2             float64
cont12            float64
cont14            float64
cont11            float64
cont9             float64
cat7_encoded        int64
cat57_encoded       int64
cat89_grouped       int32
cat101_grouped      int32
cat102_grouped      int32
cat105_grouped      int32
cat107_grouped      int32
cat113_grouped      int32
cat115_grouped      int32
cat116_grouped      int32
dtype: object


In [15]:
# Confirm that the grouped columns are present in the final frame
print("Columns in dataset after adding grouped columns:", df_final.columns, sep="\n")

Columns in dataset after adding grouped columns:
Index(['cont2', 'cont12', 'cont14', 'cont11', 'cont9', 'cat7_encoded',
       'cat57_encoded', 'cat89_grouped', 'cat101_grouped', 'cat102_grouped',
       'cat105_grouped', 'cat107_grouped', 'cat113_grouped', 'cat115_grouped',
       'cat116_grouped'],
      dtype='object')


In [16]:
# Step 6: Features are the assembled frame; the target is the 'loss' column
X, y = df_final, df['loss']

In [17]:
# Step 7: Hold out 20% of the rows as a test set (fixed seed for repeatability)
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [18]:
# Persist the four split pieces together in a single joblib file
split_bundle = (X_train, X_test, y_train, y_test)
joblib.dump(split_bundle, 'train_test_split.joblib')

['train_test_split.joblib']