In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50  # frames longer than this are shown truncated
## Optional Excel readers: only needed when data is loaded with pd.read_excel
## (this notebook reads a CSV file, so neither package is required to run it)
# conda install openpyxl
## conda install xlrd
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder

In [2]:
RAND_STATE = 34 # seed passed as random_state to train_test_split, for reproducible shuffling
TT_RATIO = 0.3 # passed as test_size: fraction of all rows held out for testing (not a test/train ratio)

In [3]:
# Read the marketing customer dataset and list the columns that are available.
DATA_PATH = 'Data/Data_Marketing_Customer_Analysis_Round3.csv'
df = pd.read_csv(DATA_PATH)
df.columns

Index(['region', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'month', 'employment_status',
       'gender', 'income', 'location_code', 'marital_status',
       'monthly_premium_auto', 'months_since_last_claim',
       'months_since_policy_inception', 'number_of_open_complaints',
       'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',
       'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size'],
      dtype='object')

## Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns only. numeric_only=True is
# required on pandas >= 2.0, where DataFrame.corr() no longer drops text
# columns silently and raises on them instead.
corr = df.corr(numeric_only=True)

In [None]:
# Annotated heatmap of the correlation matrix; both axes are labelled with the
# matrix's own column names.
tick_labels = corr.columns
sns.heatmap(
    corr,
    annot=True,
    cmap='Reds',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)

## X/Y Split

In [None]:
# The claim amount is the target; every other column is a candidate predictor.
y = df['total_claim_amount']
X = df.drop(columns='total_claim_amount')

## Numerical/Categorical Split

In [None]:
# Split the predictors by dtype. The `np.object` alias was removed in
# NumPy 1.24 (it raises AttributeError), so string columns are selected with
# the builtin `object` instead.
numericalX = X.select_dtypes(include=np.number)
categoricalX = X.select_dtypes(include=object)

In [None]:
# Names of the numeric predictor columns; this list is what the power transform is later applied to.
numericalX.columns

 ## Test/Train Split

In [None]:
# Hold out TT_RATIO of the rows for testing; RAND_STATE fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TT_RATIO,
    random_state=RAND_STATE,
)
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
X_train.head(n=3)

## Standardization

In [None]:
# Power transform with its default settings spelled out: Yeo-Johnson followed
# by zero-mean / unit-variance scaling.
pt = PowerTransformer(method='yeo-johnson', standardize=True)

In [None]:
# Fit the power transform on the training rows only. It is applied to the
# numeric columns; every other column is dropped.
ct = ColumnTransformer([("pt", pt, list(numericalX.columns))],
                        remainder='drop',verbose_feature_names_out=True,verbose=True).fit(X_train)
# Keep the original row labels. Without index=..., the new frames get a fresh
# 0..n-1 index that no longer lines up with y_train / y_test, and any
# index-aligned operation (e.g. y_test - predictions) silently misaligns.
transformed_cols = ct.get_feature_names_out()
X_train_ct = pd.DataFrame(ct.transform(X_train), columns=transformed_cols, index=X_train.index)
X_test_ct = pd.DataFrame(ct.transform(X_test), columns=transformed_cols, index=X_test.index)

In [None]:
# Formula string: the response `y` regressed on every power-transformed numeric
# feature (the names carry the "pt__" prefix).
_formula_terms = [
    'pt__customer_lifetime_value',
    'pt__income',
    'pt__monthly_premium_auto',
    'pt__months_since_last_claim',
    'pt__months_since_policy_inception',
    'pt__number_of_open_complaints',
    'pt__number_of_policies',
]
eqution = 'y~' + '+'.join(_formula_terms)

In [None]:
# From here on X_train refers to the transformed training features (the raw split is overwritten).
X_train = pd.DataFrame(X_train_ct)
# Display the transformed test features.
X_test_ct

## OLS Modelling

In [None]:
from statsmodels.formula.api import ols

In [None]:
# Fit an OLS model on the transformed features plus an intercept.
# The design matrices stay DataFrames (not bare NumPy arrays) so the summary
# reports the real feature names instead of x1..x7. Their rows are relabelled
# with the target's index because statsmodels requires endog and exog to share
# row labels; it also keeps the predictions index-aligned with y_train / y_test.
# has_constant='add' forces the intercept column even if a feature happens to
# be constant within one split.
X_train_const_ct = sm.add_constant(X_train_ct.set_axis(y_train.index, axis=0), has_constant='add')

model = sm.OLS(y_train, X_train_const_ct).fit()
predictions_train = model.predict(X_train_const_ct)

X_test_const_ct = sm.add_constant(X_test_ct.set_axis(y_test.index, axis=0), has_constant='add')
predictions_test = model.predict(X_test_const_ct)
print_model = model.summary()
print(print_model)

There is a significant relationship between the dependent variable, Total Claim Amount, and two of the independent variables: Income (t = -11.917, p < 0.05) and Monthly Premium Auto (t = 48.426, p < 0.05).

In [None]:
#smodel=ols(formula=eqution,data=X_train_const_ct).fit()
#smodel.summary()

In [None]:
# Income against the claim amount, over the full dataset.
fig, ax = plt.subplots()
ax.scatter(numericalX['income'], y)
ax.set_title('Relationship between Total Claim Amount and Income')
ax.set_xlabel('Income')
ax.set_ylabel('Total Claim Amount')
plt.show()

In [None]:
# Monthly auto premium against the claim amount, over the full dataset.
fig, ax = plt.subplots()
ax.scatter(numericalX['monthly_premium_auto'], y)
ax.set_title('Relationship between Total Claim Amount and Monthly Premium Auto')
ax.set_xlabel('Monthly Premium Auto')
ax.set_ylabel('Total Claim Amount')
plt.show()

In [None]:
# Same linear fit with scikit-learn. Note that this rebinds `model`, replacing
# the statsmodels results object created by the OLS cell.
model = LinearRegression().fit(X_train_ct, y_train)
model

In [None]:
# Fitted coefficients, one per column of X_train_ct (the intercept is stored separately).
model.coef_

In [None]:
# Fitted intercept term of the scikit-learn model.
model.intercept_

In [None]:
# Predictions of the scikit-learn model on both splits, as one-column frames.
# NOTE(review): the column label 'target_d' does not match this dataset's
# target (total_claim_amount) -- presumably carried over from elsewhere; confirm.
pred_columns = ['target_d']
y_pred_train = pd.DataFrame(model.predict(X_train_ct), columns=pred_columns)
y_pred = pd.DataFrame(model.predict(X_test_ct), columns=pred_columns)

## Model Performance & Error Evaluation

In [None]:
# Test-set errors first (MSE, then MAE), followed by the train-set MSE.
for metric in (mse, mae):
    print(metric(y_test, predictions_test))
print(mse(y_train, predictions_train))

In [None]:
# Diagnostic plots for the test-set predictions.
# Work on plain arrays so the arithmetic is positional and cannot be thrown off
# by y_test and the predictions carrying different pandas indexes.
actual_test = np.asarray(y_test)
predicted_test = np.asarray(predictions_test)
residuals_test = actual_test - predicted_test

fig, ax = plt.subplots(1,3,figsize=(14,4))

# Panel 1: predicted vs. actual. If every prediction were right, the points
# would fall on a straight line. The data order matches the axis labels
# (x = y_test, y = y_pred).
ax[0].plot(actual_test, predicted_test, 'o')
ax[0].set_xlabel("y_test")
ax[0].set_ylabel("y_pred")
ax[0].set_title("Test Set - Predicted vs real")

# Panel 2: histogram of the residuals (y - y_pred). Does it resemble a normal
# distribution?
ax[1].hist(residuals_test)
ax[1].set_xlabel("Test y-y_pred")
ax[1].set_title("Test Set Residual histogram")

# Panel 3: residuals against predictions (homoscedasticity check). Residuals
# are actual minus predicted; subtracting the predictions from themselves
# would only draw a flat line of zeros.
ax[2].plot(predicted_test, residuals_test, "o")
ax[2].set_xlabel("predicted")
ax[2].set_ylabel("residuals")
ax[2].set_title("Residuals by Predicted")
ax[2].plot(predicted_test, np.zeros(len(predicted_test)), linestyle='dashed')

In [None]:
# Actual vs. predicted claim amounts on the test set, with a title and axis
# labels so the figure can be read on its own.
plt.scatter(y_test, predictions_test)
plt.title('Test Set - Predicted vs Actual Total Claim Amount')
plt.xlabel('Actual Total Claim Amount')
plt.ylabel('Predicted Total Claim Amount')
plt.show()

In [None]:
# Side-by-side table of actual and predicted test values, built from plain
# lists so the two columns are paired by position.
result = pd.DataFrame(
    {
        "y_test": list(y_test),
        "y_pred": list(predictions_test),
    }
)

In [None]:
# Regression plot of predicted against actual values: red points, black fit line.
point_style = {"color": "red"}
line_style = {"color": "black"}
sns.regplot(data=result, x='y_test', y='y_pred', scatter_kws=point_style, line_kws=line_style)

## Feature Importances

In [None]:
# Rank the features by the absolute size of their fitted coefficient.
# Column names are taken from X_train_ct -- the frame the model was actually
# fit on -- rather than from X_train, which only has matching columns if an
# earlier cell has overwritten it with the transformed frame.
features_importances = pd.DataFrame(data={
    'Attribute': X_train_ct.columns,
    'Importance': np.abs(np.ravel(model.coef_))
})
features_importances = features_importances.sort_values(by='Importance', ascending=False)
features_importances

In [None]:
# Bar chart of the ten largest coefficient magnitudes.
top_features = features_importances.iloc[:10]
plt.bar(x=top_features['Attribute'], height=top_features['Importance'], color='#087E8B')
plt.title('Feature importance rankings', size=12)
plt.xticks(rotation='vertical')
plt.show()

## Encoding categorical variables (ordinal mapping and one-hot)

In [None]:
# Display the frame before the categorical columns are encoded.
df

In [None]:
# Drop the raw date column. errors='ignore' keeps the cell idempotent: running
# it again after the column is already gone no longer raises KeyError.
df = df.drop(columns='effective_to_date', errors='ignore')

In [None]:
#education
# Ordinal codes following the education ladder. "college" and "bachelor" get
# distinct codes; sharing one code would collapse two levels into one.
# The dtype guard keeps the cell idempotent: mapping already-encoded integers
# through a string-keyed dict would turn the whole column into NaN.
ed_map = {"high school or below": 0, "college": 1, "bachelor": 2, "master": 3, "doctor": 4}
if not pd.api.types.is_numeric_dtype(df['education']):
    df['education'] = df['education'].map(ed_map)

In [None]:
# Inspect the distinct vehicle_size values present in the column.
df['vehicle_size'].unique()

In [None]:
#employment
# Integer codes for employment_status. The dtype guard keeps the cell
# idempotent: re-running .map() on already-encoded integers would turn every
# value into NaN.
ep_map = {"unemployed": 0, "employed": 1, 'medical leave': 2, 'disabled': 3, 'retired': 4}
if not pd.api.types.is_numeric_dtype(df['employment_status']):
    df['employment_status'] = df['employment_status'].map(ep_map)

In [None]:
#region
# Integer codes for region. The dtype guard keeps the cell idempotent:
# re-running .map() on already-encoded integers would turn every value into NaN.
rg_map = {'central': 0, 'west region': 1, 'east': 2, 'north west': 3}
if not pd.api.types.is_numeric_dtype(df['region']):
    df['region'] = df['region'].map(rg_map)

In [None]:
#response
# Binary code for response (no -> 0, yes -> 1). The dtype guard keeps the cell
# idempotent: re-running .map() on already-encoded integers would turn every
# value into NaN.
rp_map = {'no': 0, 'yes': 1}
if not pd.api.types.is_numeric_dtype(df['response']):
    df['response'] = df['response'].map(rp_map)

In [None]:
#sales channel
# Integer codes for sales_channel. The dtype guard keeps the cell idempotent:
# re-running .map() on already-encoded integers would turn every value into NaN.
sc_map = {'agent': 0, 'call center': 1, 'branch': 2, 'web': 3}
if not pd.api.types.is_numeric_dtype(df['sales_channel']):
    df['sales_channel'] = df['sales_channel'].map(sc_map)

In [None]:
#vehicle_class
# Integer codes for vehicle_class. The dtype guard keeps the cell idempotent:
# re-running .map() on already-encoded integers would turn every value into NaN.
vc_map = {
    'four-door car': 0,
    'suv': 1,
    'two-door car': 2,
    'sports car': 3,
    'luxury car': 4,
    'luxury suv': 5,
}
if not pd.api.types.is_numeric_dtype(df['vehicle_class']):
    df['vehicle_class'] = df['vehicle_class'].map(vc_map)

In [None]:
#vehicle_size
# Integer codes for vehicle_size. The dtype guard keeps the cell idempotent:
# re-running .map() on already-encoded integers would turn every value into NaN.
vs_map = {'medsize': 0, 'small': 1, 'large': 2}
if not pd.api.types.is_numeric_dtype(df['vehicle_size']):
    df['vehicle_size'] = df['vehicle_size'].map(vs_map)

In [None]:
#vehicle_size
# This cell repeats the vehicle_size mapping. Applying .map() a second time
# would look up the already-encoded integers (0/1/2) in a string-keyed dict and
# replace the whole column with NaN, so the mapping is only applied while the
# column is still non-numeric.
vs_map = {'medsize': 0, 'small': 1, 'large': 2}
if not pd.api.types.is_numeric_dtype(df['vehicle_size']):
    df['vehicle_size'] = df['vehicle_size'].map(vs_map)

In [None]:
# One-hot encode gender as sparse indicator columns, dropping the first level.
# NOTE(review): df_dummies is not joined back onto df in the visible cells --
# confirm whether it should be.
df_dummies = pd.get_dummies(df['gender'], drop_first=True, sparse=True)

In [None]:
# Display the frame after the encoding cells above have run.
df

In [None]:
# Predictors: everything except the target and the listed columns.
# The labels go in one list passed via `columns=`, because drop() does not
# accept several labels as separate positional arguments. The last string
# literal is properly terminated.
X = df.drop(columns=['total_claim_amount', 'coverage', 'month', 'gender',
                     'policy_type', 'policy', 'renew_offer_type'])
y = df['total_claim_amount']

In [None]:
# Re-split the re-built predictors with the same test fraction and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TT_RATIO,
    random_state=RAND_STATE,
)
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)
X_train.head(n=3)

In [None]:

# Re-apply the already-fitted ColumnTransformer to the new split, keeping the
# original row labels so the frames stay aligned with y_train / y_test.
# NOTE(review): `ct` was fit with remainder='drop' on the original numeric
# columns only, so the encoded categorical columns are discarded here and the
# model below sees the same features as the first one -- confirm intent.
transformed_cols = ct.get_feature_names_out()
X_train_ct = pd.DataFrame(ct.transform(X_train), columns=transformed_cols, index=X_train.index)
X_test_ct = pd.DataFrame(ct.transform(X_test), columns=transformed_cols, index=X_test.index)

In [None]:
# X_train is rebound to the transformed training features (the raw split is overwritten).
X_train = pd.DataFrame(X_train_ct)
# Display the transformed test features.
X_test_ct

In [None]:
from statsmodels.formula.api import ols

In [None]:
# Refit the OLS model on the transformed features plus an intercept.
# The design matrices stay DataFrames (not bare NumPy arrays) so the summary
# reports the real feature names instead of x1..x7. Their rows are relabelled
# with the target's index because statsmodels requires endog and exog to share
# row labels; it also keeps the predictions index-aligned with y_train / y_test.
# has_constant='add' forces the intercept column even if a feature happens to
# be constant within one split.
X_train_const_ct = sm.add_constant(X_train_ct.set_axis(y_train.index, axis=0), has_constant='add')

model = sm.OLS(y_train, X_train_const_ct).fit()
predictions_train = model.predict(X_train_const_ct)

X_test_const_ct = sm.add_constant(X_test_ct.set_axis(y_test.index, axis=0), has_constant='add')
predictions_test = model.predict(X_test_const_ct)
print_model = model.summary()
print(print_model)