# Activity

### (Tuesday)

## Linear Regression

- X-y split (y is the target variable, in this case, "total claim amount")
- Train-test split.
- Standardize the data (after the data split!).
- Apply linear regression.
- Model Interpretation.


### (Wednesday)

## Model Validation

- Model Evaluation:
    MSE.
    RMSE.
    MAE.
    R2.
    Adjusted R2.

- Feature Importance.

### Model Iteration (Wednesday and Thursday)

Rerun the model after adding the hot encoded categorical variables as well as other numeric 
categorical variables (e.g. number of open complaintes).


(Optional) Rerun the model after removing the outliers and compare the results using the R2.

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import math 

In [2]:
# Load dataset from an csv file

df = pd.read_csv('marketing_customer_analysis_clean.csv')
df.head()

Unnamed: 0,unnamed:_0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,...,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size,vehicle_type,month
0,0,DK49336,Arizona,4809.21696,No,Basic,College,2011-02-18,Employed,M,...,9,Corporate Auto,Corporate L3,Offer3,Agent,292.8,Four-Door Car,Medsize,A,2
1,1,KX64629,California,2228.525238,No,Basic,College,2011-01-18,Unemployed,F,...,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,A,1
2,2,LZ68649,Washington,14947.9173,No,Basic,Bachelor,2011-02-10,Employed,M,...,2,Personal Auto,Personal L3,Offer3,Call Center,480.0,SUV,Medsize,A,2
3,3,XL78013,Oregon,22332.43946,Yes,Extended,College,2011-01-11,Employed,M,...,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A,1
4,4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,2011-01-17,Medical Leave,F,...,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,A,1


In [3]:
df = df.dropna()

In [4]:
# X-y split (y is the target variable, in this case, "total claim amount") ### numerical variables 

X = df[['customer_lifetime_value', 'income', 'monthly_premium_auto', 'months_since_last_claim', 
        'months_since_policy_inception', 'number_of_open_complaints', 'number_of_policies', 'month']]
y = df[['total_claim_amount']]

In [5]:
X

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,month
0,4809.216960,48029,61,7.000000,52,0.000000,9,2
1,2228.525238,0,64,3.000000,26,0.000000,1,1
2,14947.917300,22139,100,34.000000,31,0.000000,2,2
3,22332.439460,49078,97,10.000000,3,0.000000,2,1
4,9025.067525,23675,117,15.149071,31,0.384256,7,1
...,...,...,...,...,...,...,...,...
10905,15563.369440,0,253,15.149071,40,0.384256,7,1
10906,5259.444853,61146,65,7.000000,68,0.000000,6,1
10907,23893.304100,39837,201,11.000000,63,0.000000,2,2
10908,11971.977650,64195,158,0.000000,27,4.000000,6,2


In [6]:
y

Unnamed: 0,total_claim_amount
0,292.800000
1,744.924331
2,480.000000
3,484.013411
4,707.925645
...,...
10905,1214.400000
10906,273.018929
10907,381.306996
10908,618.288849


In [7]:
# Train-test split

from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 100)

In [11]:
# Standardize the data (after the data split!)

In [12]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer

scaler = StandardScaler()

In [13]:
# fit and scale for X_train

scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_train_scaled

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,month
0,-0.298649,0.352458,-0.556749,0.007560,-1.468668,0.007752,0.415700,-0.940879
1,-0.392857,-1.239629,-0.556749,0.402051,0.821920,-0.429234,-0.000509,-0.940879
2,-0.821583,-0.504792,-0.817683,0.094730,1.609310,-0.429234,-0.832927,-0.940879
3,2.879646,-1.239629,0.544971,0.811813,-1.003392,-0.429234,-0.416718,1.062836
4,1.002198,1.985075,-0.585742,2.041098,-0.430745,-0.429234,-0.416718,1.062836
...,...,...,...,...,...,...,...,...
8177,-0.809832,0.340586,-0.904661,-1.339436,0.106112,-0.429234,-0.832927,1.062836
8178,0.201285,1.266199,0.631949,-0.417472,-0.144422,-0.429234,-0.000509,1.062836
8179,-0.455121,0.985263,-0.933653,-1.134555,-0.931811,-0.429234,2.496746,1.062836
8180,1.953516,-1.239629,0.429000,-0.007711,-1.110764,-0.429234,-0.416718,-0.940879


In [14]:
# only scale for X_test, we dont fit them!!

scaler.transform(X_test)
X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, columns = X_test.columns)
X_test_scaled

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,month
0,-0.760433,-0.356162,-0.643727,-1.134555,-0.251793,-0.429234,-0.832927,1.062836
1,-0.790735,-0.560557,-0.643727,-0.007711,1.179825,0.707991,-0.832927,1.062836
2,-0.073986,-1.239629,0.400008,-0.929675,0.535597,-0.429234,2.080537,-0.940879
3,1.261077,0.403474,-0.672720,-0.110151,-0.752859,2.982439,-0.416718,1.062836
4,0.183538,-0.006667,0.689934,1.528896,-0.967602,-0.429234,0.831910,1.062836
...,...,...,...,...,...,...,...,...
2723,1.298055,-0.220130,-0.614734,-1.441877,-1.504458,1.845215,-0.416718,-0.940879
2724,-0.426642,-0.280149,0.979860,-1.339436,1.144034,-0.429234,-0.832927,-0.940879
2725,-0.181434,-1.239629,0.052096,-0.007711,-1.432878,-0.429234,0.831910,1.062836
2726,-0.593347,1.812075,0.197059,-1.441877,0.249273,-0.429234,-0.832927,1.062836


In [15]:
# Apply linear regression.

from sklearn.linear_model import LinearRegression

In [16]:
lm = LinearRegression()

In [17]:
# to create the model !!!!!!!

model = lm.fit(X_train_scaled, y_train)

In [18]:
coefficients = model.coef_

In [19]:
coefficients

array([[  -4.01892081, -101.7096646 ,  181.59461815,   -1.35710396,
          -4.9644593 ,   -2.41851184,    0.42741983,   -0.85320728]])

In [20]:
model.intercept_

array([434.06224734])

In [21]:
# Model Interpretation

y_pred = model.predict(X_test_scaled)

In [22]:
y_test.reset_index(inplace=True)

In [24]:
pd.DataFrame(y_pred)

Unnamed: 0,0
0,359.011139
1,368.534973
2,634.413943
3,261.383580
4,562.506569
...,...
2723,345.189930
2724,639.831179
2725,577.944843
2726,288.420724


In [25]:
y_pred.shape

(2728, 1)

In [26]:
real_vs_pred = pd.DataFrame({'y_test': y_test['total_claim_amount'], 'y_pred': y_pred[:,0]})

In [27]:
real_vs_pred

Unnamed: 0,y_test,y_pred
0,340.800000,359.011139
1,195.153339,368.534973
2,770.400000,634.413943
3,336.000000,261.383580
4,561.600000,562.506569
...,...,...
2723,345.600000,345.189930
2724,664.280864,639.831179
2725,684.000000,577.944843
2726,25.713622,288.420724


In [28]:
### Linear Regression using statsmodels

import statsmodels.api as sm

X_train_const = sm.add_constant(X_train.to_numpy())

model = sm.OLS(y_train, X_train_const).fit()
predictions_train = model.predict(X_train_const)

X_test_const = sm.add_constant(X_test)
y_pred = model.predict(X_test_const)
print_model = model.summary()
print_model

0,1,2,3
Dep. Variable:,total_claim_amount,R-squared:,0.515
Model:,OLS,Adj. R-squared:,0.515
Method:,Least Squares,F-statistic:,1087.0
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,0.0
Time:,20:49:56,Log-Likelihood:,-55007.0
No. Observations:,8182,AIC:,110000.0
Df Residuals:,8173,BIC:,110100.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,87.8129,11.296,7.774,0.000,65.670,109.956
x1,-0.0006,0.000,-1.638,0.101,-0.001,0.000
x2,-0.0034,7.34e-05,-45.677,0.000,-0.003,-0.003
x3,5.2649,0.071,74.113,0.000,5.126,5.404
x4,-0.1390,0.228,-0.609,0.543,-0.586,0.308
x5,-0.1777,0.080,-2.229,0.026,-0.334,-0.021
x6,-2.7504,2.532,-1.086,0.277,-7.713,2.212
x7,0.1779,0.927,0.192,0.848,-1.639,1.995
x8,-1.7096,4.460,-0.383,0.701,-10.452,7.033

0,1,2,3
Omnibus:,725.454,Durbin-Watson:,1.993
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4004.761
Skew:,0.235,Prob(JB):,0.0
Kurtosis:,6.395,Cond. No.,255000.0
