In [1]:
# lab-model-generation-and-validation


In [2]:
import pandas as pd
import numpy as np
import math
from scipy import stats
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

In [3]:
data = pd.read_csv("marketing_customer_analysis.csv")
data.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [4]:
numerical = data.select_dtypes(include=np.number)

categoricals = data.select_dtypes(include=np.object)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categoricals = data.select_dtypes(include=np.object)


In [5]:
correlation_matrix = numerical.corr().abs()
upper_triangle = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
high_correlation = correlation_matrix.where(upper_triangle)
high_correlation_pairs = high_correlation.stack().reset_index()
high_correlation_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']

high_correlation_pairs = high_correlation_pairs[high_correlation_pairs['Correlation'] > 0.9]

if not high_correlation_pairs.empty:
    least_correlated = high_correlation_pairs[
        high_correlation_pairs['Feature 1'] != 'total_claim_amount'
    ].sort_values(by='Correlation', ascending=False).iloc[0]['Feature 1']

    numerical.drop(least_correlated, axis=1, inplace=True)

In [6]:
X = data.drop('Total Claim Amount', axis=1)
y = data['Total Claim Amount']


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

In [9]:
numerical_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

X_train_numerical = X_train_df[numerical_cols]
X_train_categorical = X_train_df[categorical_cols]

In [10]:
from sklearn.preprocessing import MinMaxScaler
import pickle
import os

scaler = MinMaxScaler()
X_train_numerical_scaled = scaler.fit_transform(X_train_numerical)

In [11]:
path = "transformers/"
filename = "numerical_transformer.pkl"
with open(path + filename, "wb") as file:
    pickle.dump(filename, file)

In [12]:
X_train_numerical_scaled_df = pd.DataFrame(X_train_numerical_scaled, columns=numerical_cols)

In [13]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown="ignore")
X_train_categorical_encoded = encoder.fit_transform(X_train_categorical)
#categorical_transformer = OneHotEncoder()
#X_train_categorical_encoded = categorical_transformer.fit_transform(X_train_categorical)

In [14]:
path = "encoders/"
filename = "categorical_transformer.pkl"
with open(path + filename, "wb") as file:
    pickle.dump(filename, file)

In [15]:
categorical_transformer = OneHotEncoder()
X_train_categorical_encoded = categorical_transformer.fit_transform(X_train_categorical)


In [16]:
categorical_cols_list = categorical_cols.tolist()

encoded_columns = []
for feature in categorical_cols_list:
    categories = categorical_transformer.categories_[categorical_cols_list.index(feature)]
    encoded_columns.extend([f"{feature}_{category}" for category in categories])

X_train_categorical_encoded_df = pd.DataFrame(X_train_categorical_encoded.toarray(), columns=encoded_columns)

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

final_scaler = MinMaxScaler()
X_train_final_scaled = final_scaler.fit_transform(X_train_numerical_scaled_df)

X_train_transformed = np.concatenate((X_train_final_scaled, X_train_categorical_encoded_df), axis=1)

model = LinearRegression()
model.fit(X_train_transformed, y_train)

y_pred_train = model.predict(X_train_transformed)


In [18]:
MAE_train = mean_absolute_error(y_train, y_pred_train)
MSE_train = mean_squared_error(y_train, y_pred_train)
RMSE_train = mean_squared_error(y_train, y_pred_train, squared=False)
R2_train = r2_score(y_train, y_pred_train)

In [19]:
print("Train Set Metrics:")
print("MAE:", MAE_train)
print("MSE:", MSE_train)
print("RMSE:", RMSE_train)
print("R2:", R2_train)

Train Set Metrics:
MAE: 3.900955663218672e-12
MSE: 8.760995413725546e-23
RMSE: 9.360018917569315e-12
R2: 1.0
