<a href="https://colab.research.google.com/github/YoshiAligina/Allstate1B/blob/main/allstate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Drive Mounting & Imports**

In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset is read from there below.
from google.colab import drive
drive.mount('/content/drive')

import os
import pickle
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Mounted at /content/drive


In [None]:
# Load the claims dataset from the mounted Drive folder and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/Allstate1B/claims_data.csv')
df.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [None]:
# Target column and the continuous features plotted against it.
# NOTE(review): only cont6-cont14 are listed, which matches the columns visible in the
# truncated preview above; confirm whether the earlier cont columns were skipped on purpose.
label_column = "loss"
cont_feats = [
    'cont6', 'cont7', 'cont8', 'cont9', 'cont10',
    'cont11', 'cont12', 'cont13', 'cont14',
]

# One scatter plot per continuous feature, each shown as its own figure.
for feature in cont_feats:
    sns.scatterplot(data=df, x=feature, y=label_column)
    plt.title(f'Scatter plot of {feature} vs {label_column}')
    plt.xlabel(feature)
    plt.ylabel(label_column)
    plt.show()

In [None]:
# Count missing values per column and actually show them: a bare name in the
# middle of a cell is evaluated but never displayed, so the summary was lost.
nan_counts = df.isnull().sum()
print(nan_counts)

# Categorical features to compare against the target.
cat_feats = ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
             'cat9']

# One bar plot per categorical feature; estimator=sum makes each bar the total
# loss of that category (not the mean).
for feature in cat_feats:
    sns.barplot(x=feature, y=label_column, data=df, estimator=sum)
    plt.title(f'Bar plot of {feature} vs {label_column}')
    plt.xlabel(feature)
    plt.ylabel(label_column)
    plt.show()

In [None]:
feats = cont_feats + cat_feats

# One-hot encode the selected categoricals and place them next to the continuous
# features and the target so a single correlation matrix covers all of them.
df_numeric = df[cont_feats]
df_encoded = pd.get_dummies(df[cat_feats])
df_combined = pd.concat([df_numeric, df_encoded], axis=1)
df_combined[label_column] = df[label_column]

correlation_matrix = df_combined.corr()

# Keep only the target's column of the matrix, sorted from most positive to most negative.
loss_correlations = correlation_matrix[[label_column]].sort_values(
    by=label_column, ascending=False
)

plt.figure(figsize=(10, 8))
sns.heatmap(loss_correlations, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix for Loss Label')
plt.show()

Continuous Feature Selection

In [None]:
# Keep only the float64/int64 columns (this still includes `id` and `loss`) and preview them.
continuous_data = df.select_dtypes(include=['float64', 'int64'])

print(continuous_data.head())

In [None]:
# Numeric predictors only: drop the identifier and the target itself.
X = df.select_dtypes(include=['float64', 'int64']).drop(columns=['id', 'loss'])
y = df['loss']


def _seeded_mutual_info(features, target):
    """Mutual information with a fixed seed so the ranking is reproducible across runs."""
    return mutual_info_regression(features, target, random_state=42)


# k='all' keeps every feature; the selector is used only to obtain the scores.
selector = SelectKBest(score_func=_seeded_mutual_info, k='all')
X_new = selector.fit_transform(X, y)

feature_scores = selector.scores_

# Rank the features from highest to lowest mutual information with the target.
feature_ranking = pd.DataFrame({'Feature': X.columns, 'Score': feature_scores})
feature_ranking = feature_ranking.sort_values(by='Score', ascending=False)
print(feature_ranking.to_string(index=False))

In [None]:
# Number of best-scoring continuous features to keep.
top_n = 5
top_features = feature_ranking.head(top_n)['Feature']

top_features_list = top_features.tolist()

# Report the count from top_n so the message cannot drift from the actual selection.
print(f"Top {top_n} continuous features:")
print(top_features_list)

Categorical Feature Selection

In [None]:
# One-hot encode every object-typed column of the full frame.
categorical_columns = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Target vector, and predictor matrix with the identifier and target removed.
y = df_encoded["loss"]
X = df_encoded.drop(columns=['id', 'loss'])

In [None]:

from sklearn.feature_selection import SelectKBest, f_regression

# Keep the 10 predictors with the highest univariate F-score against the target.
selector = SelectKBest(score_func=f_regression, k=10)
X_selected = selector.fit_transform(X, y)

# Map the positions of the kept columns back to their names.
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]
print("Selected Features:", selected_features)

# Same selection as X_selected, but as a labelled DataFrame.
X_kbest = X[selected_features]

In [None]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

# One figure per categorical column: bars of the per-category mean loss with a
# line drawn from the raw rows of `df` on top.
for column in categorical_columns:
    mean_loss = df.groupby(column, as_index=False)['loss'].mean()

    plt.figure(figsize=(10, 6))
    sns.barplot(data=mean_loss, x=column, y='loss', color='lightblue', label='Mean Loss')
    sns.lineplot(data=df, x=column, y='loss', color='red', marker='o', label='Actual Loss')

    plt.title(f'Bar Graph of Mean Loss with Actual Loss Line for {column}')
    plt.legend()
    plt.xticks(rotation=45)  # Rotate x-axis labels if needed
    plt.show()

In [None]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

significant_difference_columns = []

# A column is flagged when the coefficient of variation (std / mean) of its
# per-category mean losses exceeds this value.
threshold = 0.4

for column in categorical_columns:
    mean_loss = df.groupby(column, as_index=False)['loss'].mean()

    mean = mean_loss['loss'].mean()
    std_dev = mean_loss['loss'].std()
    if mean != 0:
        cv = std_dev / mean
    else:
        cv = 0

    # Columns at or below the threshold (or with an undefined cv) are skipped.
    if not cv > threshold:
        continue

    significant_difference_columns.append(column)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=mean_loss, x=column, y='loss', color='lightblue', label='Mean Loss')
    sns.lineplot(data=df, x=column, y='loss', color='red', marker='o', label='Actual Loss')

    plt.title(f'Bar Graph of Mean Loss with Actual Loss Line for {column}')
    plt.legend()
    plt.xticks(rotation=45)
    plt.show()


print("Categorical columns with significant differences in mean loss:")
print(significant_difference_columns)

In [None]:
# Columns that will be grouped by clustering instead of being used directly.
columns_to_cluster = [
    "cat89", "cat92", "cat99", "cat101", "cat102", "cat105", "cat107", 'cat109',
    'cat110', "cat111", 'cat112', 'cat113', 'cat114', 'cat115', 'cat116',
]

# Flagged columns that are not in the clustering list, in their original order.
excluded = set(columns_to_cluster)
new_columns = [name for name in significant_difference_columns if name not in excluded]

print("Categorical columns with significant differences in mean loss to use:")
new_columns

In [None]:
# How many flagged categorical columns remain after excluding the ones to be clustered.
number_of_columns=len(new_columns)

number_of_columns

In [None]:

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

def encode_and_cluster(df, column_name, n_clusters=5, random_state=42):
    """Label-encode a categorical column and group its codes with KMeans.

    Adds three columns to `df` in place: `<column>_encoded` (integer codes),
    `<column>_cluster` (KMeans label) and `<column>_grouped` ('Group<label>').

    Args:
        df: DataFrame containing `column_name`.
        column_name: Name of the categorical column to group.
        n_clusters: Number of KMeans clusters.
        random_state: Seed for KMeans so the grouping is reproducible.

    Returns:
        The same DataFrame, with the three columns added.
    """
    le = LabelEncoder()
    df[f'{column_name}_encoded'] = le.fit_transform(df[column_name])

    # NOTE(review): the clusters are formed on the label codes, whose order comes from
    # LabelEncoder and not from the loss; confirm this grouping is what is intended.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    df[f'{column_name}_cluster'] = kmeans.fit_predict(df[[f'{column_name}_encoded']])

    df[f'{column_name}_grouped'] = df[f'{column_name}_cluster'].map(lambda x: f'Group{x}')

    return df


# "cat111" (previously mistyped as "cat11") matches the list used in the neighbouring cells.
columns_to_cluster = ["cat89", "cat92","cat99","cat101", "cat102", "cat105", "cat107",'cat109', 'cat110',"cat111",'cat112', 'cat113', 'cat114', 'cat115', 'cat116']


for column in columns_to_cluster:
    df = encode_and_cluster(df, column, n_clusters=5)


# Show how many rows fall in each group, per clustered column.
for column in columns_to_cluster:
    grouped_column = f'{column}_grouped'
    print(f"\nDistribution of {column} groups:")
    print(df[grouped_column].value_counts())

In [None]:
def plot_mean_loss_by_group(df, grouped_column):
    """Plot the mean loss of each group as bars, with a line through the same means.

    Args:
        df: DataFrame holding `grouped_column` and a `loss` column.
        grouped_column: Name of the column whose values define the groups.
    """
    group_means = df.groupby(grouped_column, as_index=False)['loss'].mean()

    plt.figure(figsize=(12, 6))

    # Both layers are drawn from the per-group means.
    shared = dict(x=grouped_column, y='loss', data=group_means)
    sns.barplot(color='lightblue', label='Mean Loss', **shared)
    sns.lineplot(color='red', marker='o', label='Actual Loss', **shared)

    plt.title(f'Bar Graph of Mean Loss with Actual Loss Line for Hierarchical Groups ({grouped_column})')
    plt.xlabel(f'Hierarchical Groups ({grouped_column})')
    plt.ylabel('Loss')
    plt.xticks(rotation=45)
    plt.legend()
    plt.show()

def encode_and_cluster(df, column_name, n_clusters=5, random_state=42):
    """Label-encode a categorical column and group its codes with KMeans.

    Adds `<column>_encoded`, `<column>_cluster` and `<column>_grouped`
    ('Group<label>') to `df` in place.

    Args:
        df: DataFrame containing `column_name`.
        column_name: Name of the categorical column to group.
        n_clusters: Number of KMeans clusters.
        random_state: Seed for KMeans so repeated runs produce the same groups.

    Returns:
        The same DataFrame, with the three columns added.
    """
    le = LabelEncoder()
    df[f'{column_name}_encoded'] = le.fit_transform(df[column_name])

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    df[f'{column_name}_cluster'] = kmeans.fit_predict(df[[f'{column_name}_encoded']])

    df[f'{column_name}_grouped'] = df[f'{column_name}_cluster'].map(lambda x: f'Group{x}')

    return df


columns_to_cluster = ["cat89", "cat92", "cat99", "cat101", "cat102", "cat105", "cat107", 'cat109', 'cat110', "cat111", 'cat112', 'cat113', 'cat114', 'cat115', 'cat116']

# Only cluster columns that have no grouped version yet, so the plots below describe
# the same groups whose distribution was printed earlier and re-running is idempotent.
for column in columns_to_cluster:
    if f'{column}_grouped' not in df.columns:
        df = encode_and_cluster(df, column, n_clusters=5)


# Plot the mean loss per group for every grouped column that exists.
for column in columns_to_cluster:
    grouped_column = f'{column}_grouped'
    if grouped_column in df.columns:
        plot_mean_loss_by_group(df, grouped_column)

In [None]:

# Hand-picked grouped columns, appended to the flagged columns that were not clustered.
grouped_cat = ["cat92_grouped","cat101_grouped","cat111_grouped","cat114_grouped"]
columns_to_include = new_columns + grouped_cat
columns_to_include

Feature Subset

In [None]:

def encode_and_cluster(df, column_name, n_clusters=5):
    """Label-encode a categorical column, cluster the codes, and return the cluster labels.

    The input frame is left untouched: no temporary columns are added to or
    dropped from `df`, so calling this repeatedly is safe.

    Args:
        df: DataFrame containing `column_name`.
        column_name: Name of the categorical column to group.
        n_clusters: Number of KMeans clusters.

    Returns:
        A one-column DataFrame named `<column_name>_grouped`, indexed like `df`,
        holding the integer KMeans label of each row.
    """
    codes = LabelEncoder().fit_transform(df[column_name])

    # Use KMeans clustering to group the encoded values (fixed seed for reproducibility).
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(codes.reshape(-1, 1))

    return pd.DataFrame({f'{column_name}_grouped': labels}, index=df.index)

In [None]:
# Categorical columns that are grouped by clustering for the final feature subset.
columns_to_cluster = [
    "cat89", "cat101", "cat102", "cat105",
    "cat107", 'cat113', 'cat115', 'cat116'
]

In [None]:

# Create an empty DataFrame to store grouped columns; it is filled by the loop in the next cell.
grouped_df = pd.DataFrame()

In [None]:

# Generate the grouped column for every clustered column present in df, then join
# them in one concat. Assigning (instead of appending to the existing grouped_df)
# keeps the cell idempotent: re-running it no longer adds duplicate columns.
grouped_parts = [
    encode_and_cluster(df, column)
    for column in columns_to_cluster
    if column in df.columns
]
grouped_df = pd.concat(grouped_parts, axis=1) if grouped_parts else pd.DataFrame()


# Step 4: Add the top continuous and categorical features that did not need to be grouped into hierarchies
continuous_features = ['cont2', 'cont12', 'cont14', 'cont11', 'cont9']
cat_features = ['cat7', 'cat57']


# Encode each ungrouped categorical feature as a numerical column using LabelEncoder
le = LabelEncoder()
for name in cat_features:
    df[f'{name}_encoded'] = le.fit_transform(df[name])
encoded_cat_features = [f'{name}_encoded' for name in cat_features]

In [None]:
# Final feature table: ungrouped continuous and label-encoded features first,
# followed by the clustered group columns.
base_feature_names = continuous_features + encoded_cat_features
df_final = pd.concat([df[base_feature_names], grouped_df], axis=1)

# Check columns and data types
print("Columns in the final DataFrame:", df_final.columns)
print(df_final.dtypes)

In [None]:

# Verify that grouped columns were added by listing the final frame's columns
print("Columns in dataset after adding grouped columns:")
print(df_final.columns)

Columns in dataset after adding grouped columns:
Index(['cont2', 'cont12', 'cont14', 'cont11', 'cont9', 'cat7_encoded',
       'cat57_encoded', 'cat89_grouped', 'cat101_grouped', 'cat102_grouped',
       'cat105_grouped', 'cat107_grouped', 'cat113_grouped', 'cat115_grouped',
       'cat116_grouped'],
      dtype='object')

# Step 6: Prepare the feature set and target variable
X = df_final
y = df['loss']


# Step 7: Split the data into training and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Save the train-test split to the Drive location that the model sections load it
# from; a bare relative filename would land in the working directory instead.
joblib.dump((X_train, X_test, y_train, y_test), '/content/drive/MyDrive/train_test_split.joblib')

Gradient Boosting

In [None]:
# Reload the saved train/test split from Drive.
# NOTE(review): joblib.load unpickles the file, so only load files you created yourself.
X_train, X_test, y_train, y_test = joblib.load('/content/drive/MyDrive/train_test_split.joblib')


In [None]:
# Hyperparameters for the gradient boosting model.
params = {
    "n_estimators": 200,     # Number of boosting stages
    "max_depth": 3,          # Depth of each tree
    "learning_rate": 0.01,   # Learning rate for model updates
    "random_state": 42,      # Fixed seed so repeated runs give the same model
}


model = GradientBoostingRegressor(**params)


model.fit(X_train, y_train)

In [None]:
# Evaluate the fitted gradient boosting model on the held-out test split.
# (A RandomizedSearchCV used to be constructed here from the fixed `params` dict; it was
# never fitted, and single values are not valid search distributions, so it was removed.)
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)


print(f"R² Score: {r2}")

In [None]:
# Create a timestamped output folder for this run's figures
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # Format: YYYYMMDD_HHMMSS
# The original Windows folder remains the default; set ALLSTATE_GRAPH_DIR to send the
# figures elsewhere (for example a Drive folder when running on Colab).
directory = os.environ.get("ALLSTATE_GRAPH_DIR", r"C:\Users\Maryl\OneDrive\Allstate AI\Graphs")
version_folder = os.path.join(directory, f'Gradient_Boosting_Machine_{timestamp}')
# exist_ok avoids a crash if the cell is re-run within the same second
os.makedirs(version_folder, exist_ok=True)
file_path= os.path.join(version_folder, '{}_Gradient_Boosting_Machine'.format(timestamp))


# Create a scatter plot of predictions against the true values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.6, label='Predicted vs Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', label='Perfect Prediction')  # Diagonal line
# Both axes are limited to the range of the true values
plt.xlim([y_test.min(), y_test.max()])
plt.ylim([y_test.min(), y_test.max()])
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title(f'Predicted vs Actual Values\nR² Score: {r2:.2f}')
plt.legend()
plt.grid()
plt.savefig(os.path.join(version_folder, '{}_R2_graph.png'.format(timestamp)), transparent = False, dpi = 650, bbox_inches = "tight")


In [None]:
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")
plt.show()

# RMSE is the square root of the MSE computed above
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

# True vs predicted values, with the identity line spanning the range of y_test
actual_range = [y_test.min(), y_test.max()]
rmse_plot_path = os.path.join(version_folder, '{}_RMSE_plot.png'.format(timestamp))

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5, color='b')
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title(f"True vs Predicted Values (RMSE = {rmse:.2f})")
plt.savefig(rmse_plot_path, transparent = False, dpi = 650, bbox_inches = "tight")

plt.show()

In [None]:
# Calculate residuals (positive means the model over-predicted)
residual = y_pred - y_test
print(f"Residual Value (predicted value - the actual value): {residual}")


# Plot residuals against the true values; the dashed line marks zero error
plt.figure(figsize=(8, 6))
plt.scatter(y_test, residual, alpha=0.5, color='purple')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("True Values")
plt.ylabel("Residuals (Predicted - Actual)")
plt.title("Residuals of Predictions")
# Explicit .png name, consistent with the other figures saved for this run
plt.savefig(os.path.join(version_folder, '{}_residuals_plot.png'.format(timestamp)), transparent = False, dpi = 650, bbox_inches = "tight")
plt.show()

XG Boost Maryln

In [None]:
# Load the saved train/test split once, from the first location where it exists.
# The Drive copy is preferred; the root-level path is kept as a fallback.
candidate_paths = [
    '/content/drive/MyDrive/train_test_split.joblib',
    '/train_test_split.joblib',
]

for file_path in candidate_paths:
    if os.path.exists(file_path):
        X_train, X_test, y_train, y_test = joblib.load(file_path)
        break
else:
    # Fail loudly: continuing would silently train on whatever split is still in memory.
    raise FileNotFoundError(
        f"File not found. Please check the file path. Tried: {candidate_paths}"
    )


In [None]:
# Define the model: XGBoost regressor with squared-error objective and a fixed seed
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

In [None]:
# Hyperparameter grid: candidate values that the randomized search samples from
param_dist = {
    'n_estimators':  [100, 200, 300],
    'max_depth': [3, 5, 7, 9, 11, 13],
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.3, 0.4, 0.5]
}

In [None]:
# Randomized search: tries 10 sampled combinations from param_dist, scoring each
# with 5-fold cross-validated R² on the training split (seeded for repeatability)
random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=10, scoring='r2', cv=5, random_state=42)


random_search.fit(X_train, y_train)

In [None]:
# Get the best model found by the randomized search
best_model = random_search.best_estimator_


# Evaluate the model on the held-out test split
y_pred = best_model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


# Create a timestamped output folder for this run's figures
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # Format: YYYYMMDD_HHMMSS
# The original Windows folder remains the default; set ALLSTATE_GRAPH_DIR to send the
# figures elsewhere (for example a Drive folder when running on Colab).
directory = os.environ.get("ALLSTATE_GRAPH_DIR", r"C:\Users\Maryl\OneDrive\Allstate AI\Graphs")
version_folder = os.path.join(directory, f'XGBoost_{timestamp}')
# exist_ok avoids a crash if the cell is re-run within the same second
os.makedirs(version_folder, exist_ok=True)
file_path= os.path.join(version_folder, '{}_XGBoost'.format(timestamp))


# Create a scatter plot of predictions against the true values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.6, label='Predicted vs Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', label='Perfect Prediction')  # Diagonal line
# Both axes are limited to the range of the true values
plt.xlim([y_test.min(), y_test.max()])
plt.ylim([y_test.min(), y_test.max()])
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title(f'Predicted vs Actual Values\nR² Score: {r2:.2f}')
plt.legend()
plt.grid()
plt.savefig(os.path.join(version_folder, '{}_R2_graph.png'.format(timestamp)), transparent = False, dpi = 650, bbox_inches = "tight")



In [None]:
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")
plt.show()

# RMSE is the square root of the MSE computed above
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

# True vs predicted values, with the identity line spanning the range of y_test
actual_range = [y_test.min(), y_test.max()]
rmse_plot_path = os.path.join(version_folder, '{}_RMSE_plot.png'.format(timestamp))

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5, color='b')
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title(f"True vs Predicted Values (RMSE = {rmse:.2f})")
plt.savefig(rmse_plot_path, transparent = False, dpi = 650, bbox_inches = "tight")


plt.show()

In [None]:

# Calculate residuals (positive means the model over-predicted)
residual = y_pred - y_test
print(f"Residual Value (predicted value - the actual value): {residual}")


# Plot residuals against the true values; the dashed line marks zero error
plt.figure(figsize=(8, 6))
plt.scatter(y_test, residual, alpha=0.5, color='purple')
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel("True Values")
plt.ylabel("Residuals (Predicted - Actual)")
plt.title("Residuals of Predictions")
# Explicit .png name, consistent with the other figures saved for this run
plt.savefig(os.path.join(version_folder, '{}_residuals_plot.png'.format(timestamp)), transparent = False, dpi = 650, bbox_inches = "tight")
plt.show()

Linear Regression

In [None]:

import pandas as pd
import category_encoders as ce
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
# Reload the raw claims data, replacing the `df` modified by the earlier sections.
# NOTE(review): this is a path relative to the working directory, unlike the Drive
# path used at the top of the notebook — confirm which copy should be used.
filename = "claims_data.csv"
df = pd.read_csv(filename, header=0)
df.head(20)
id	cat1	cat2	cat3	cat4	cat5	cat6	cat7	cat8	cat9	...	cont6	cont7	cont8	cont9	cont10	cont11	cont12	cont13	cont14	loss
0	1	A	B	A	B	A	A	A	A	B	...	0.718367	0.335060	0.30260	0.67135	0.83510	0.569745	0.594646	0.822493	0.714843	2213.18
1	2	A	B	A	A	A	A	A	A	B	...	0.438917	0.436585	0.60087	0.35127	0.43919	0.338312	0.366307	0.611431	0.304496	1283.60
2	5	A	B	A	A	B	A	A	A	B	...	0.289648	0.315545	0.27320	0.26076	0.32446	0.381398	0.373424	0.195709	0.774425	3005.09
3	10	B	B	A	B	A	A	A	A	B	...	0.440945	0.391128	0.31796	0.32128	0.44467	0.327915	0.321570	0.605077	0.602642	939.85
4	11	A	B	A	B	A	A	A	A	B	...	0.178193	0.247408	0.24564	0.22089	0.21230	0.204687	0.202213	0.246011	0.432606	2763.85
5	13	A	B	A	A	A	A	A	A	B	...	0.364464	0.401162	0.26847	0.46226	0.50556	0.366788	0.359249	0.345247	0.726792	5142.87
6	14	A	A	A	A	B	A	A	A	A	...	0.381515	0.363768	0.24564	0.40455	0.47225	0.334828	0.352251	0.342239	0.382931	1132.22
7	20	A	B	A	B	A	A	A	A	B	...	0.867021	0.583389	0.90267	0.84847	0.80218	0.644013	0.785706	0.859764	0.242416	3585.75
8	23	A	B	B	B	B	A	A	A	B	...	0.628534	0.384099	0.61229	0.38249	0.51111	0.682315	0.669033	0.756454	0.361191	10280.20
9	24	A	B	A	A	B	B	A	A	B	...	0.713343	0.469223	0.30260	0.67135	0.83510	0.863052	0.879347	0.822493	0.294523	6184.59
10	25	A	B	A	A	A	A	A	A	B	...	0.429383	0.877905	0.39455	0.53565	0.50556	0.550529	0.538473	0.336261	0.715009	6396.85
11	33	A	B	A	A	B	A	A	A	B	...	0.314683	0.370419	0.58354	0.46226	0.38016	0.644013	0.665644	0.339244	0.799124	5965.73
12	34	B	A	A	A	B	A	A	A	A	...	0.408772	0.363312	0.32843	0.32128	0.44467	0.327915	0.321570	0.605077	0.818358	1193.05
13	41	B	A	A	A	B	B	A	A	A	...	0.241574	0.255339	0.58934	0.32496	0.26029	0.257148	0.253044	0.276878	0.477578	1071.77
14	47	A	A	A	A	B	A	A	A	A	...	0.894903	0.586433	0.80058	0.93383	0.78770	0.880469	0.871011	0.822493	0.251278	585.18
15	48	A	A	A	A	B	B	A	A	A	...	0.570733	0.547756	0.80438	0.44352	0.63026	0.385085	0.377003	0.516660	0.340325	1395.45
16	49	A	B	B	A	A	A	A	A	B	...	0.411902	0.593548	0.31796	0.38846	0.48889	0.457203	0.447145	0.301535	0.205651	6609.32
17	51	A	A	A	A	A	B	A	A	A	...	0.688705	0.437192	0.67263	0.83505	0.59334	0.678924	0.665644	0.684242	0.407411	2658.70
18	52	A	A	B	A	A	B	A	A	A	...	0.443265	0.637086	0.36636	0.52938	0.39068	0.678924	0.665644	0.304350	0.310796	4167.32
19	55	A	A	A	B	A	A	A	A	A	...	0.436312	0.544355	0.48864	0.36285	0.20496	0.388786	0.406090	0.648701	0.830931	3797.89
20 rows × 132 columns

# Univariate plot of the target: histogram of `loss`
sns.histplot(data=df, x="loss")
<Axes: xlabel='loss', ylabel='Count'>

# List the column names of the loaded frame.
df.columns
Index(['id', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9',
       ...
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'cont14', 'loss'],
      dtype='object', length=132)
# Count missing values per column (original author's finding: no missing values).
df.isnull().sum()
id        0
cat1      0
cat2      0
cat3      0
cat4      0
         ..
cont11    0
cont12    0
cont13    0
cont14    0
loss      0
Length: 132, dtype: int64
# Inspect column dtypes to see which columns are object-typed and which are numeric.
df.dtypes
id          int64
cat1       object
cat2       object
cat3       object
cat4       object
           ...
cont11    float64
cont12    float64
cont13    float64
cont14    float64
loss      float64
Length: 132, dtype: object
# Names of all object-typed columns, i.e. the categorical features that need encoding.
to_encode = list(df.select_dtypes(include=['object']).columns)
to_encode
['cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18',
 'cat19',
 'cat20',
 'cat21',
 'cat22',
 'cat23',
 'cat24',
 'cat25',
 'cat26',
 'cat27',
 'cat28',
 'cat29',
 'cat30',
 'cat31',
 'cat32',
 'cat33',
 'cat34',
 'cat35',
 'cat36',
 'cat37',
 'cat38',
 'cat39',
 'cat40',
 'cat41',
 'cat42',
 'cat43',
 'cat44',
 'cat45',
 'cat46',
 'cat47',
 'cat48',
 'cat49',
 'cat50',
 'cat51',
 'cat52',
 'cat53',
 'cat54',
 'cat55',
 'cat56',
 'cat57',
 'cat58',
 'cat59',
 'cat60',
 'cat61',
 'cat62',
 'cat63',
 'cat64',
 'cat65',
 'cat66',
 'cat67',
 'cat68',
 'cat69',
 'cat70',
 'cat71',
 'cat72',
 'cat73',
 'cat74',
 'cat75',
 'cat76',
 'cat77',
 'cat78',
 'cat79',
 'cat80',
 'cat81',
 'cat82',
 'cat83',
 'cat84',
 'cat85',
 'cat86',
 'cat87',
 'cat88',
 'cat89',
 'cat90',
 'cat91',
 'cat92',
 'cat93',
 'cat94',
 'cat95',
 'cat96',
 'cat97',
 'cat98',
 'cat99',
 'cat100',
 'cat101',
 'cat102',
 'cat103',
 'cat104',
 'cat105',
 'cat106',
 'cat107',
 'cat108',
 'cat109',
 'cat110',
 'cat111',
 'cat112',
 'cat113',
 'cat114',
 'cat115',
 'cat116']
# Number of distinct values per categorical column.
# Open question from the author: why do some categories have more than 2 possible
# values, and what do the values stand for?
df[to_encode].nunique()
cat1        2
cat2        2
cat3        2
cat4        2
cat5        2
         ...
cat112     51
cat113     61
cat114     19
cat115     23
cat116    326
Length: 116, dtype: int64
# Most frequent values of cat112, most common first.
# NOTE(review): despite the name `top_10`, this takes up to 60 values, which is more
# than the 51 distinct values cat112 has — so it lists every value.
top_10 = list(df['cat112'].value_counts().head(60).index)

top_10
['E',
 'AH',
 'AS',
 'J',
 'AF',
 'AN',
 'N',
 'U',
 'AV',
 'AK',
 'K',
 'AI',
 'S',
 'AP',
 'G',
 'F',
 'AW',
 'A',
 'AR',
 'C',
 'O',
 'D',
 'AD',
 'AY',
 'Y',
 'AG',
 'AT',
 'AA',
 'AM',
 'AL',
 'R',
 'AX',
 'I',
 'X',
 'AE',
 'Q',
 'V',
 'H',
 'AO',
 'T',
 'L',
 'W',
 'AC',
 'M',
 'AU',
 'B',
 'P',
 'AB',
 'BA',
 'AJ',
 'AQ']

categorical_columns = df.select_dtypes(include = ['object']).columns

catlist = categorical_columns.tolist()

# Replace every categorical column with its M-estimate target encoding (m=5).
# One encoder handles all columns in a single fit instead of building and fitting a
# separate encoder per column; each column is still encoded independently.
# NOTE(review): the encoder is fitted on all rows using their own `loss`, so the
# encoded features contain target information. Fit on a training split only before
# trusting any correlation or model score computed from these columns.
# The guard keeps a re-run (when no object columns remain) from failing.
if catlist:
    m_estimate_encoder = ce.MEstimateEncoder(cols=catlist, m=5)
    df[catlist] = m_estimate_encoder.fit_transform(df[catlist], df['loss'])

df.head(20)
id	cat1	cat2	cat3	cat4	cat5	cat6	cat7	cat8	cat9	...	cont6	cont7	cont8	cont9	cont10	cont11	cont12	cont13	cont14	loss
0	1	3408.078419	3800.057434	2902.223547	3488.354592	2814.657333	3259.907946	2908.947835	2975.560301	3827.569429	...	0.718367	0.335060	0.30260	0.67135	0.83510	0.569745	0.594646	0.822493	0.714843	2213.18
1	2	3408.078419	3800.057434	2902.223547	2826.835029	2814.657333	3259.907946	2908.947835	2975.560301	3827.569429	...	0.438917	0.436585	0.60087	0.35127	0.43919	0.338312	0.366307	0.611431	0.304496	1283.60
2	5	3408.078419	3800.057434	2902.223547	2826.835029	3463.976775	3259.907946	2908.947835	2975.560301	3827.569429	...	0.289648	0.315545	0.27320	0.26076	0.32446	0.381398	0.373424	0.195709	0.774425	3005.09
3	10	1915.318476	3800.057434	2902.223547	3488.354592	2814.657333	3259.907946	2908.947835	2975.560301	3827.569429	...	0.440945	0.391128	0.31796	0.32128	0.44467	0.327915	0.321570	0.605077	0.602642	939.85
4	11	3408.078419	3800.057434	2902.223547	3488.354592	2814.657333	3259.907946	2908.947835	2975.560301	3827.569429	...	0.178193	0.247408	0.24564	0.22089	0.21230	0.204687	0.202213	0.246011	0.432606	2763.85
5	13	3408.078419	3800.057434	2902.223547	2826.835029	2814.657333	3259.907946	2908.947835	2975.560301	3827.569429	...	0.364464	0.401162	0.26847	0.46226	0.50556	0.366788	0.359249	0.345247	0.726792	5142.87
6	14	3408.078419	2454.167166	2902.223547	2826.835029	3463.976775	3259.907946	2908.947835	2975.560301	2512.032345	...	0.381515	0.363768	0.24564	0.40455	0.47225	0.334828	0.352251	0.342239	0.382931	1132.22
7	20	3408.078419	3800.057434	2902.223547	3488.354592	2814.657333	3259.907946	2908.947835	2975.560301	3827.569429	...	0.867021	0.583389	0.90267	0.84847	0.80218	0.644013	0.785706	0.859764	0.242416	3585.75
8	23	3408.078419	3800.057434	5365.512568	3488.354592	3463.976775	3259.907946	2908.947835	2975.560301	3827.569429	...	0.628534	0.384099	0.61229	0.38249	0.51111	0.682315	0.669033	0.756454	0.361191	10280.20
9	24	3408.078419	3800.057434	2902.223547	2826.835029	3463.976775	2519.731150	2908.947835	2975.560301	3827.569429	...	0.713343	0.469223	0.30260	0.67135	0.83510	0.863052	0.879347	0.822493	0.294523	6184.59
10	25	3408.078419	3800.057434	2902.223547	2826.835029	2814.657333	3259.907946	2908.947835	2975.560301	3827.569429	...	0.429383	0.877905	0.39455	0.53565	0.50556	0.550529	0.538473	0.336261	0.715009	6396.85
11	33	3408.078419	3800.057434	2902.223547	2826.835029	3463.976775	3259.907946	2908.947835	2975.560301	3827.569429	...	0.314683	0.370419	0.58354	0.46226	0.38016	0.644013	0.665644	0.339244	0.799124	5965.73
12	34	1915.318476	2454.167166	2902.223547	2826.835029	3463.976775	3259.907946	2908.947835	2975.560301	2512.032345	...	0.408772	0.363312	0.32843	0.32128	0.44467	0.327915	0.321570	0.605077	0.818358	1193.05
13	41	1915.318476	2454.167166	2902.223547	2826.835029	3463.976775	2519.731150	2908.947835	2975.560301	2512.032345	...	0.241574	0.255339	0.58934	0.32496	0.26029	0.257148	0.253044	0.276878	0.477578	1071.77
14	47	3408.078419	2454.167166	2902.223547	2826.835029	3463.976775	3259.907946	2908.947835	2975.560301	2512.032345	...	0.894903	0.586433	0.80058	0.93383	0.78770	0.880469	0.871011	0.822493	0.251278	585.18
15	48	3408.078419	2454.167166	2902.223547	2826.835029	3463.976775	2519.731150	2908.947835	2975.560301	2512.032345	...	0.570733	0.547756	0.80438	0.44352	0.63026	0.385085	0.377003	0.516660	0.340325	1395.45
16	49	3408.078419	3800.057434	5365.512568	2826.835029	2814.657333	3259.907946	2908.947835	2975.560301	3827.569429	...	0.411902	0.593548	0.31796	0.38846	0.48889	0.457203	0.447145	0.301535	0.205651	6609.32
17	51	3408.078419	2454.167166	2902.223547	2826.835029	2814.657333	2519.731150	2908.947835	2975.560301	2512.032345	...	0.688705	0.437192	0.67263	0.83505	0.59334	0.678924	0.665644	0.684242	0.407411	2658.70
18	52	3408.078419	2454.167166	5365.512568	2826.835029	2814.657333	2519.731150	2908.947835	2975.560301	2512.032345	...	0.443265	0.637086	0.36636	0.52938	0.39068	0.678924	0.665644	0.304350	0.310796	4167.32
19	55	3408.078419	2454.167166	2902.223547	3488.354592	2814.657333	3259.907946	2908.947835	2975.560301	2512.032345	...	0.436312	0.544355	0.48864	0.36285	0.20496	0.388786	0.406090	0.648701	0.830931	3797.89
20 rows × 132 columns

# Same listing as before, now showing the target-encoded values of cat112.
# NOTE(review): `top_10` still holds up to 60 entries, not 10.
top_10 = list(df['cat112'].value_counts().head(60).index)

top_10
[3208.3752740598875,
 3476.4887378474764,
 3409.8779698103626,
 2832.8601330146266,
 3426.6158314764057,
 3111.3939186731213,
 3043.5809350234513,
 2594.8842074426925,
 2381.311463228336,
 2637.109041513646,
 3277.033962471694,
 2379.012328234823,
 3119.0660623938065,
 2302.829178134419,
 3249.8959780738573,
 2949.554359045133,
 2856.3516598185233,
 2803.8196640845817,
 2785.3912398431853,
 3249.903575786185,
 2728.5320833767596,
 2859.0727626838484,
 3291.3359950705403,
 2788.348110238442,
 2837.771016540081,
 2964.038546727807,
 3056.9099831075564,
 2788.0881127033304,
 2677.82482419434,
 3185.2353113906165,
 2919.536009244991,
 2943.187208923401,
 2473.651797278677,
 2817.8997294928495,
 2431.9903199384385,
 2844.9589955242477,
 2970.4514733930514,
 2854.65138956302,
 3214.006787436642,
 2566.558818304848,
 3060.2343542737954,
 3347.4481296745707,
 2770.9505412382355,
 2819.0300865503377,
 2577.761249267312,
 3190.7460710942755,
 2933.419363572628,
 3229.488081387848,
 3122.3346073248717,
 3186.3073048882543,
 2778.3228122385694]
# Correlation matrix over every column of the (now fully numeric) frame, drawn as an annotated heatmap.
# NOTE(review): with 132 columns the per-cell annotations are unlikely to be legible — confirm this plot is useful.
corr_matrix = df.corr()
plt.figure(figsize=(20, 15))  # Width x Height in inches
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', annot_kws={"size": 10}, cbar_kws={'label': 'Correlation'})
plt.show()

label_column = 'loss'

cont_feats = [
    'cont6', 'cont7', 'cont8', 'cont9', 'cont10',
    'cont11', 'cont12', 'cont13', 'cont14',
]

cat_feats = [
    'cat1', 'cat2', 'cat3', 'cat4', 'cat5',
    'cat6', 'cat7', 'cat8', 'cat9',
]

feats = cont_feats + cat_feats

# Target-encoded categoricals first, then the continuous features, then the target.
df_combined = df[cat_feats + cont_feats + [label_column]].copy()

correlation_matrix = df_combined.corr()

# Keep only the target's column of the matrix, sorted from most positive to most negative.
loss_correlations = correlation_matrix[[label_column]].sort_values(
    by=label_column, ascending=False
)

plt.figure(figsize=(10, 8))
sns.heatmap(loss_correlations, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix for Loss Label')
plt.show()

# Linear regression on the target-encoded columns cat109..cat116.
features = [f'cat{i}' for i in range(109, 117)]

# Feature matrix and label vector.
X = df.loc[:, features]
y = df.loc[:, 'loss']

print(X)
             cat109       cat110       cat111       cat112       cat113  \
0       2639.853403  2555.547630  3112.781574  3409.877970  2745.118673
1       3146.281369  3053.180857  2826.835029  2381.311463  2874.502783
2       2609.825032  2938.455639  2826.835029  3249.903576  2942.939477
3       3146.281369  3024.901680  3112.781574  3043.580935  2907.769445
4       2058.911903  2427.051291  3112.781574  2837.771017  2874.502783
...             ...          ...          ...          ...          ...
188313  3146.281369  3047.405943  2826.835029  3249.895978  2874.502783
188314  3146.281369  3180.011145  2826.835029  2381.311463  2601.824868
188315  3146.281369  3076.768862  2826.835029  3347.448130  2942.939477
188316  3146.281369  2786.703811  2826.835029  3409.877970  2923.442501
188317  3146.281369  3047.405943  5167.734712  3409.877970  3065.474265

             cat114       cat115       cat116
0       3259.907946  2948.542000  2917.710468
1       3259.907946  2948.542000  3107.659307
2       3259.907946  2991.051898  2696.415612
3       3259.907946  2948.542000  2911.931661
4       3259.907946  3016.759217  3037.328951
...             ...          ...          ...
188313  3259.907946  3054.090013  3037.328951
188314  2088.702603  3140.728386  2805.428945
188315  3259.907946  3016.759217  2911.931661
188316  3259.907946  2948.542000  3162.497643
188317  3259.907946  2948.542000  3798.173581

[188318 rows x 8 columns]
# Hold out 15% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Build the linear model and fit it on the training split (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predicted loss for every held-out row.
prediction = model.predict(X_test)
print('Model Summary:\n')

# Intercept (alpha) of the fitted model.
print('Intercept:')
print('alpha = ' , model.intercept_)

# One line per coefficient, numbered from 1 and labelled with its feature name.
print('\nWeights:')
for i, (feature_name, w) in enumerate(zip(features, model.coef_), start=1):
    print('w_', i, '= ', w, ' [ weight of ', feature_name, ']')
Model Summary:

Intercept:
alpha =  -13616.105827116236

Weights:
w_ 1 =  0.4128165922689112  [ weight of  cat109 ]
w_ 2 =  0.2934223773862843  [ weight of  cat110 ]
w_ 3 =  0.9170956098793612  [ weight of  cat111 ]
w_ 4 =  0.8080673235397873  [ weight of  cat112 ]
w_ 5 =  0.5688995338963799  [ weight of  cat113 ]
w_ 6 =  0.9612427473241598  [ weight of  cat114 ]
w_ 7 =  0.83765695235362  [ weight of  cat115 ]
w_ 8 =  0.6851911572023519  [ weight of  cat116 ]
Looking at the fitted weights, cat111, cat112, cat114, and cat115 have the highest weights among cat109–cat116.
# Score the held-out predictions: squared error first, then explained variance.
test_mse = mean_squared_error(y_test, prediction)
test_r2 = r2_score(y_test, prediction)

# RMSE is the square root of the mean squared error (same units as the label).
print('\nModel Performance\n\nRMSE =   %.2f' % np.sqrt(test_mse))
# Coefficient of determination: 1 is perfect prediction.
print(' R^2 =   %.2f' % test_r2)
Model Performance

RMSE =   2702.81
 R^2 =   0.12
Our RMSE is very high, meaning that the predicted values differ significantly from the actual values. R^2 is also very low, suggesting that there
is no strong linear relationship between these features and the label.

# Linear regression on the single highest-weighted column from the previous fit: cat114.
# Double brackets keep X two-dimensional (a one-column DataFrame).
X = df[['cat114']]
y = df['loss']

print(X)
             cat114
0       3259.907946
1       3259.907946
2       3259.907946
3       3259.907946
4       3259.907946
...             ...
188313  3259.907946
188314  2088.702603
188315  3259.907946
188316  3259.907946
188317  3259.907946

[188318 rows x 1 columns]
# Hold out 15% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Build the second linear model and fit it on the training split (fit() returns the estimator).
model2 = LinearRegression().fit(X_train, y_train)

# Predicted loss for every held-out row.
prediction2 = model2.predict(X_test)
# model2 was fitted on a single column, so it has exactly one weight plus an intercept.
cat114_weight, cat114_intercept = model2.coef_[0], model2.intercept_

# Weight_1: coefficient of the only feature (cat114).
print('Model Summary\n\nWeight_1 =  ', cat114_weight)
# Alpha: intercept of the fitted line.
print('Alpha = ', cat114_intercept, '[ intercept ]')
Model Summary

Weight_1 =   1.0154671486187499 [ weight of feature LogGDP ]
Alpha =  -45.11987339779034 [ intercept ]
# Score the single-feature model on the held-out rows.
cat114_mse = mean_squared_error(y_test, prediction2)
cat114_r2 = r2_score(y_test, prediction2)

# RMSE is the square root of the mean squared error (same units as the label).
print('\nModel Performance\n\nRMSE =   %.2f' % np.sqrt(cat114_mse))
# Coefficient of determination: 1 is perfect prediction.
print(' R^2 =   %.2f' % cat114_r2)
Model Performance

RMSE =   2829.83
 R^2 =   0.03
R^2 decreased even further (from 0.12 to 0.03) when the model was trained with cat114 as the only feature.


# Train a model using only the binary categorical columns, i.e. the columns in which
# every value is either 'A' or 'B'.
is_binary_AB = df.isin(['A', 'B']).all()
cols_with_A_or_B = df.columns[is_binary_AB]

# Encode A -> 1 and B -> 0. Because every selected value is 'A' or 'B', comparing
# against 'A' yields the same 0/1 integers as replace({'A': 1, 'B': 0}) while avoiding
# pandas' deprecated silent object->int downcasting (and its FutureWarning).
df_AB = (df[cols_with_A_or_B] == 'A').astype('int64')

print(df_AB)
C:\Users\shimr\AppData\Local\Temp\ipykernel_5864\1011918628.py:5: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  df_AB = df_AB.replace({'A': 1, 'B': 0})
        cat1  cat2  cat3  cat4  cat5  cat6  cat7  cat8  cat9  cat10  ...  \
0          1     0     1     0     1     1     1     1     0      1  ...
1          1     0     1     1     1     1     1     1     0      0  ...
2          1     0     1     1     0     1     1     1     0      0  ...
3          0     0     1     0     1     1     1     1     0      1  ...
4          1     0     1     0     1     1     1     1     0      0  ...
...      ...   ...   ...   ...   ...   ...   ...   ...   ...    ...  ...
188313     1     0     1     1     1     1     1     1     0      1  ...
188314     1     1     1     1     1     0     1     1     1      1  ...
188315     1     0     1     1     1     1     1     0     0      1  ...
188316     1     0     1     1     1     1     1     1     0      0  ...
188317     0     1     1     0     1     1     1     1     1      1  ...

        cat63  cat64  cat65  cat66  cat67  cat68  cat69  cat70  cat71  cat72
0           1      1      1      1      1      1      1      1      1      1
1           1      1      1      1      1      1      1      1      1      1
2           1      1      1      1      1      1      1      1      1      1
3           1      1      1      1      1      1      1      1      1      1
4           1      1      1      1      1      1      1      1      1      0
...       ...    ...    ...    ...    ...    ...    ...    ...    ...    ...
188313      1      1      1      1      1      1      1      1      1      0
188314      1      1      1      1      1      1      1      1      1      1
188315      1      1      1      0      1      1      1      1      1      1
188316      1      1      1      1      1      1      1      1      1      1
188317      1      1      1      1      1      1      1      1      1      0

[188318 rows x 72 columns]
# Keep the target next to the encoded columns in df_AB.
df_AB['loss'] = df['loss']

# The feature matrix must be the data itself (one row per claim), not df_AB.columns,
# which is only an Index of column names. The target is dropped so it cannot leak
# into the features.
X = df_AB.drop(columns=['loss'])
y = df['loss']

# Show which columns are used as features.
print(X.columns)
Index(['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9',
       'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17',
       'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25',
       'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33',
       'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41',
       'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49',
       'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57',
       'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65',
       'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'loss'],
      dtype='object')
# Hold out 15% of the rows for evaluation; X and y must contain the same number of rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[62], line 1
----> 1 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\utils\_param_validation.py:213, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    207 try:
    208     with config_context(
    209         skip_parameter_validation=(
    210             prefer_skip_nested_validation or global_skip_validation
    211         )
    212     ):
--> 213         return func(*args, **kwargs)
    214 except InvalidParameterError as e:
    215     # When the function is just a wrapper around an estimator, we allow
    216     # the function to delegate validation to the estimator, but we replace
    217     # the name of the estimator by the name of the function in the error
    218     # message to avoid confusion.
    219     msg = re.sub(
    220         r"parameter of \w+ must be",
    221         f"parameter of {func.__qualname__} must be",
    222         str(e),
    223     )

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_split.py:2777, in train_test_split(test_size, train_size, random_state, shuffle, stratify, *arrays)
   2774 if n_arrays == 0:
   2775     raise ValueError("At least one array required as input")
-> 2777 arrays = indexable(*arrays)
   2779 n_samples = _num_samples(arrays[0])
   2780 n_train, n_test = _validate_shuffle_split(
   2781     n_samples, test_size, train_size, default_test_size=0.25
   2782 )

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\utils\validation.py:514, in indexable(*iterables)
    484 """Make arrays indexable for cross-validation.
    485
    486 Checks consistent length, passes through None, and ensures that everything
   (...)
    510 [[1, 2, 3], array([2, 3, 4]), None, <3x1 sparse matrix ...>]
    511 """
    513 result = [_make_indexable(X) for X in iterables]
--> 514 check_consistent_length(*result)
    515 return result

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\utils\validation.py:457, in check_consistent_length(*arrays)
    455 uniques = np.unique(lengths)
    456 if len(uniques) > 1:
--> 457     raise ValueError(
    458         "Found input variables with inconsistent numbers of samples: %r"
    459         % [int(l) for l in lengths]
    460     )

ValueError: Found input variables with inconsistent numbers of samples: [73, 188318]


Bode Final Model

In [None]:

#Data Processing
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

#XGBoost Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
#Error Metrics
from sklearn.metrics import mean_squared_error, r2_score
#HyperParameter Tuning
from sklearn.model_selection import GridSearchCV
# Perform K-Fold Cross-Validation
from sklearn.model_selection import cross_val_score
#Random Distribution
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import uniform, randint
#Importance Plot
from xgboost import plot_importance
# Location of the claims CSV. Set the CLAIMS_DATA_PATH environment variable to run this
# on another machine; the default is the original author's local path.
claims_csv_path = os.environ.get(
    "CLAIMS_DATA_PATH",
    "/Users/bodechiu/Desktop/Break Through Tech AI/AI Studio Project/claims_data.csv",
)
df = pd.read_csv(claims_csv_path)

# Correlation is computed on the numeric columns only.
df_numeric = df.select_dtypes(include=[np.number])
corrMatrix = df_numeric.corr()

# Annotated heatmap of the numeric correlation matrix.
fig, ax = plt.subplots(figsize=(20, 15))  # width x height in inches
sns.heatmap(
    corrMatrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    annot_kws={"size": 10},
    cbar_kws={'label': 'Correlation'},
    ax=ax,
)
plt.show()

# Column groups of the claims DataFrame `df`: cat1..cat116, cont1..cont14, and the target.
categorical_columns = [f'cat{i}' for i in range(1, 117)]
continuous_columns = [f'cont{i}' for i in range(1, 15)]
target_column = 'loss'

# One-hot encode the categorical columns as a dense array; drop='first' omits the
# first category of each column.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical = encoder.fit_transform(df[categorical_columns])

# Build "<column>_<category>" names in the same order as the encoded columns,
# skipping each column's first category because it was dropped by the encoder.
categories = encoder.categories_
organized_feature_names = [
    f'{col_name}_{category}'
    for col_name, col_categories in zip(categorical_columns, categories)
    for category in col_categories[1:]
]

# Wrap the encoded array in a DataFrame. Reusing df's index keeps the rows aligned
# with the continuous columns in the concat below even if df is not indexed 0..n-1
# (concat with axis=1 aligns on index labels, not on row position).
encoded_df = pd.DataFrame(
    encoded_categorical,
    columns=organized_feature_names,
    index=df.index,
)

# Features = one-hot encoded categoricals followed by the continuous columns.
X = pd.concat([encoded_df, df[continuous_columns]], axis=1)
y = df[target_column]


# Step 1: Add a random "probe" feature drawn uniformly from [0, 1). It is generated
# independently of the target, so its importance is used below as a noise floor.
np.random.seed(42)  # For reproducibility
X['uniform_feature'] = np.random.uniform(0, 1, X.shape[0])

# Step 2: Split the data into training and test sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the model with the probe feature included
best_params = {
    'colsample_bytree': 0.8998609717152555,
    'learning_rate': 0.019333132642723086,
    'max_depth': 6,
    'n_estimators': 882,
    'subsample': 0.9560699842170359,
    'eval_metric': 'rmse'  # Add evaluation metric for consistency
}
model = xgb.XGBRegressor(**best_params)
model.fit(X_train, y_train)

# Step 4: Plot feature importance. plot_importance draws on a new figure of its own
# unless it is handed an Axes, so the sized figure is created first and its Axes passed
# in (otherwise the 10x8 figure stays empty).
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(model, ax=ax, max_num_features=20, importance_type="weight")  # Adjust max_num_features as needed
ax.set_title("XGBoost Feature Importance")
plt.show()

# Extract feature importances.
# NOTE(review): the chart above uses importance_type="weight", while
# feature_importances_ follows the estimator's own importance_type (not set here),
# so the two rankings may differ -- confirm which one is intended.
importances = model.feature_importances_

# Positions of the features sorted by importance, most important first
indices = np.argsort(importances)[::-1]
features = X.columns

# Identify threshold using the importance of the uniform (probe) feature
uniform_feature_importance = importances[features.get_loc('uniform_feature')]

# Step 5: Drop all features that fall below the probe's importance. The probe itself
# is excluded too: it only defines the threshold and must not be a model input.
keep_mask = (importances >= uniform_feature_importance) & (features != 'uniform_feature')
important_features = features[keep_mask]
X_train_filtered = X_train[important_features]
X_test_filtered = X_test[important_features]

# Retrain the model with filtered features
filtered_model = xgb.XGBRegressor(**best_params)
filtered_model.fit(X_train_filtered, y_train)

# Step 6: Make predictions using the model with filtered features
y_pred_filtered = filtered_model.predict(X_test_filtered)

# Evaluate the model using Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and R²
mse_filtered = mean_squared_error(y_test, y_pred_filtered)
rmse_filtered = np.sqrt(mse_filtered)
r2_filtered = r2_score(y_test, y_pred_filtered)

# Calculate the variance of the target variable in the test set
variance = y_test.var()

# Normalize the MSE by dividing it by the variance of the target variable
normalized_mse = mse_filtered / variance

print(f"Mean Squared Error (Filtered): {mse_filtered}")
print(f"Normalized Mean Squared Error: {normalized_mse}")
print(f"Root Mean Squared Error (Filtered): {rmse_filtered}")
print(f"R² (Filtered): {r2_filtered}")


# Step 7: Look at data where the prediction is "good", i.e. the absolute error is
# below the mean absolute error on the test set
residuals = np.abs(y_test - y_pred_filtered)
threshold = residuals.mean()  # You could set a threshold based on the mean of residuals

# Display data points where residuals are below the threshold (good predictions)
good_predictions = X_test_filtered[residuals < threshold]
print("Data points with good predictions:")
print(good_predictions)

# Step 8: Look at top predictors: the kept features ordered by importance, highest first.
# The ranking uses `importances` from the first fit (the model trained on all features).
top_predictors = important_features[np.argsort(importances[keep_mask])[::-1]]
print("Top predictors based on filtered model:")
print(top_predictors)
<Figure size 1000x800 with 0 Axes>

Mean Squared Error (Filtered): 3432087.742558672
Normalized Mean Squared Error: 0.4206084507157916
Root Mean Squared Error (Filtered): 1852.5894695152167
R² (Filtered): 0.5793803816010521
Data points with good predictions:
        cat1_B  cat2_B  cat4_B  cat5_B  cat6_B  cat7_B  cat9_B  cat10_B  \
10168      0.0     1.0     1.0     0.0     0.0     0.0     1.0      0.0
6936       0.0     0.0     0.0     0.0     1.0     0.0     0.0      0.0
117312     1.0     1.0     0.0     0.0     0.0     0.0     1.0      0.0
70551      0.0     0.0     0.0     1.0     0.0     0.0     0.0      0.0
97059      0.0     0.0     1.0     0.0     0.0     0.0     0.0      0.0
...        ...     ...     ...     ...     ...     ...     ...      ...
88497      1.0     0.0     0.0     1.0     0.0     0.0     0.0      0.0
56217      0.0     1.0     1.0     0.0     0.0     0.0     1.0      0.0
162470     0.0     1.0     1.0     0.0     0.0     0.0     1.0      1.0
114899     0.0     0.0     0.0     0.0     0.0     0.0     0.0      0.0
35726      1.0     0.0     0.0     0.0     1.0     0.0     0.0      0.0

        cat11_B  cat12_B  ...     cont6     cont7    cont8    cont9   cont10  \
10168       0.0      0.0  ...  0.817706  0.480176  0.90055  0.84847  0.80218
6936        0.0      0.0  ...  0.687443  0.499166  0.54236  0.50420  0.51111
117312      0.0      0.0  ...  0.394921  0.374803  0.24564  0.46853  0.50556
70551       0.0      0.0  ...  0.446460  0.459649  0.69840  0.39447  0.46119
97059       0.0      0.0  ...  0.344288  0.354907  0.95332  0.33051  0.31480
...         ...      ...  ...       ...       ...      ...      ...      ...
88497       0.0      0.0  ...  0.439206  0.415949  0.45883  0.46853  0.52221
56217       0.0      0.0  ...  0.925649  0.604335  0.96843  0.93383  0.83510
162470      1.0      1.0  ...  0.303881  0.356898  0.36083  0.36091  0.36458
114899      0.0      0.0  ...  0.419901  0.416308  0.72775  0.37458  0.40666
35726       0.0      0.0  ...  0.384572  0.296392  0.92347  0.48530  0.29134

          cont11    cont12    cont13    cont14  uniform_feature
10168   0.550529  0.590961  0.814573  0.721577         0.498151
6936    0.698978  0.685713  0.695650  0.721610         0.317259
117312  0.453334  0.443374  0.263731  0.757043         0.327384
70551   0.307628  0.305148  0.660756  0.820556         0.736106
97059   0.291268  0.286079  0.486670  0.411261         0.697651
...          ...       ...       ...       ...              ...
88497   0.441763  0.443374  0.324464  0.833359         0.017276
56217   0.826362  0.891717  0.866072  0.286959         0.717146
162470  0.225753  0.222634  0.261150  0.314630         0.113323
114899  0.341813  0.335036  0.579324  0.351410         0.760790
35726   0.359572  0.352251  0.566274  0.361131         0.173213

[25044 rows x 322 columns]
Top predictors based on filtered model:
Index(['cat80_D', 'cat80_B', 'cat57_B', 'cat79_D', 'cat12_B', 'cat113_U',
       'cat105_P', 'cat87_B', 'cat81_D', 'cat106_C',
       ...
       'cat104_C', 'cat66_B', 'cat112_F', 'cat116_CB', 'cat116_HB', 'cat100_B',
       'cat110_N', 'cat116_LQ', 'cat112_AR', 'uniform_feature'],
      dtype='object', length=322)
# Scatter of predicted against actual loss for the filtered model; points on the
# dashed red diagonal are exact predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_filtered, alpha=0.5, label="Predicted vs Actual (Filtered)")

# Diagonal y = x spanning the range of actual values.
actual_range = [min(y_test), max(y_test)]
ax.plot(actual_range, actual_range, color='red', linestyle='--', label="Ideal Fit Line")

ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Predicted vs Actual Values (Filtered Model)')
ax.legend()
plt.show()

# Print the names of the 50 features with the largest values in `importances`,
# largest first. (The variable is named top_20_indices but holds 50 indices.)
top_20_indices = np.flip(np.argsort(importances))[:50]
important_features = features[top_20_indices]
print("Top 50 most important features:")
print(important_features)
Top 50 most important features:
Index(['cat80_D', 'cat80_B', 'cat57_B', 'cat79_D', 'cat12_B', 'cat113_U',
       'cat105_P', 'cat87_B', 'cat81_D', 'cat106_C', 'cat116_KP', 'cat107_D',
       'cat10_B', 'cont7', 'cont12', 'cat115_E', 'cont2', 'cat109_BI',
       'cat53_B', 'cat76_B', 'cat72_B', 'cat113_G', 'cat7_B', 'cat26_B',
       'cat116_J', 'cat38_B', 'cat44_B', 'cat61_B', 'cat100_I', 'cat113_AL',
       'cat11_B', 'cat82_D', 'cat1_B', 'cat13_B', 'cat100_G', 'cat113_AT',
       'cat116_U', 'cat105_T', 'cat27_B', 'cat109_AB', 'cat52_B', 'cat9_B',
       'cat49_B', 'cat108_K', 'cat116_W', 'cat115_H', 'cat91_C', 'cat79_B',
       'cat37_B', 'cat103_G'],
      dtype='object')
# Train and score an XGBoost model restricted to the leading `top_n` ranked features.
def train_and_evaluate_model(top_n):
    """Fit XGBoost on the first ``top_n`` ranked features and print test metrics.

    Relies on notebook-level names: ``features`` and ``indices`` (the feature names
    and their ranking), ``X_train``/``X_test``/``y_train``/``y_test`` (the split)
    and ``best_params`` (the XGBRegressor keyword arguments).

    Parameters
    ----------
    top_n : int
        How many leading entries of ``indices`` to use as model inputs.

    Returns
    -------
    None
        MSE, RMSE and R² on the test split are printed, not returned.
    """
    # Column names for the first `top_n` positions of the ranking.
    top_columns = features[indices[:top_n]]

    # Fit on the reduced training matrix, then predict the reduced test matrix.
    regressor = xgb.XGBRegressor(**best_params)
    regressor.fit(X_train[top_columns], y_train)
    test_predictions = regressor.predict(X_test[top_columns])

    # Test-set metrics.
    mse_top = mean_squared_error(y_test, test_predictions)
    report_lines = [
        f"Performance with top {top_n} features:",
        f"Mean Squared Error: {mse_top}",
        f"Root Mean Squared Error: {np.sqrt(mse_top)}",
        f"R²: {r2_score(y_test, test_predictions)}",
        "\n",
    ]
    for line in report_lines:
        print(line)

# Retrain and evaluate with progressively fewer features: top 50, 20, 10, then 5.
for feature_count in (50, 20, 10, 5):
    train_and_evaluate_model(feature_count)


Entire Thing