In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## X-y Split


In [3]:
# Load the raw marketing customer dataset from the lab's CSV folder.
df1 = pd.read_csv("files_for_lab/csv_files/marketing_customer_analysis.csv")

In [4]:
# Normalise the column headers: lower-case, with spaces turned into underscores.
lowered_cols = df1.columns.str.lower()
df1.columns = lowered_cols.str.replace(' ', '_')

In [5]:
# Keep only the numeric columns of the frame and list their names.
df1_num = df1.select_dtypes(include='number')
df1_num.columns

Index(['customer_lifetime_value', 'income', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies',
       'total_claim_amount'],
      dtype='object')

In [6]:
# Display the numeric-only frame (the notebook truncates long frames when rendering).
df1_num

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies,total_claim_amount
0,2763.519279,56274,69,32,5,0,1,384.811147
1,6979.535903,0,94,13,42,0,8,1131.464935
2,12887.431650,48767,108,18,38,0,2,566.472247
3,7645.861827,0,106,18,65,0,7,529.881344
4,2813.692575,43836,73,12,44,0,1,138.130879
...,...,...,...,...,...,...,...,...
9129,23405.987980,71941,73,18,89,0,2,198.234764
9130,3096.511217,21604,79,14,28,0,1,379.200000
9131,8163.890428,0,85,9,37,3,2,790.784983
9132,7524.442436,21941,96,34,3,0,3,691.200000


In [7]:
# Target vector: the total claim amount column of the numeric frame.
y = df1_num.loc[:, 'total_claim_amount']
y

0        384.811147
1       1131.464935
2        566.472247
3        529.881344
4        138.130879
           ...     
9129     198.234764
9130     379.200000
9131     790.784983
9132     691.200000
9133     369.600000
Name: total_claim_amount, Length: 9134, dtype: float64

In [8]:
# Feature matrix: every numeric column except the target.
x = df1_num.drop(columns=['total_claim_amount'])
x

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,2763.519279,56274,69,32,5,0,1
1,6979.535903,0,94,13,42,0,8
2,12887.431650,48767,108,18,38,0,2
3,7645.861827,0,106,18,65,0,7
4,2813.692575,43836,73,12,44,0,1
...,...,...,...,...,...,...,...
9129,23405.987980,71941,73,18,89,0,2
9130,3096.511217,21604,79,14,28,0,1
9131,8163.890428,0,85,9,37,3,2
9132,7524.442436,21941,96,34,3,0,3


## Train-Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
# (rows, columns) of the training feature matrix.
x_train.shape

(6850, 7)

In [11]:
# (rows, columns) of the held-out test feature matrix.
x_test.shape

(2284, 7)

## Normalization (Yeo-Johnson power transform)

In [21]:
from sklearn.preprocessing import PowerTransformer

In [24]:
# Yeo-Johnson power transform with standardisation. The transformer is fitted
# on the training split only and then applied to that same split.
pt = PowerTransformer(method='yeo-johnson', standardize=True)
x_train_pt = pt.fit(x_train).transform(x_train)

# Wrap the resulting array in a DataFrame to recover the feature names.
# NOTE: the new frame gets a fresh 0..n-1 index, so it is positionally (not
# index-) aligned with y_train.
df1_x_train_pt = pd.DataFrame(data=x_train_pt, columns=x_train.columns)
df1_x_train_pt.head(3)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,-0.154555,0.807677,1.360867,1.65733,1.471319,-0.500344,-1.152021
1,1.102186,-1.655812,1.806474,0.058035,0.799894,-0.500344,0.873446
2,0.434349,0.401585,0.584358,1.188175,0.16264,1.953933,1.126458


In [25]:
# First rows of the untransformed training features (original row index retained).
x_train.head()

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
8607,5389.499465,66429,136,35,93,0,1
2121,12786.66898,0,180,14,70,0,4
5361,7834.151482,36094,99,28,50,1,5
7003,8223.164916,95102,101,6,73,0,8
7416,9031.214859,0,122,15,47,0,9


In [29]:
# Apply the already-fitted transformer to the test split (no re-fitting here).
x_test_pt = pt.transform(x_test)

# Rebuild a labelled DataFrame using the training feature names.
feature_names = x_train.columns
df1_x_test_pt = pd.DataFrame(data=x_test_pt, columns=feature_names)
df1_x_test_pt.head(3)

Unnamed: 0,customer_lifetime_value,income,monthly_premium_auto,months_since_last_claim,months_since_policy_inception,number_of_open_complaints,number_of_policies
0,-0.580388,0.892951,0.782412,-1.675324,0.12905,-0.500344,-1.152021
1,-0.116343,0.148364,-0.709595,-0.698111,-1.431989,-0.500344,0.873446
2,-0.771454,1.079498,0.420686,-1.278645,-0.254527,-0.500344,-1.152021


## MODELING

In [14]:
from sklearn.linear_model import LinearRegression as LinReg

# Baseline model: linear regression fitted on the untransformed training features.
linreg = LinReg().fit(x_train, y_train)

# Model predictions for the held-out test features.
y_pred_linreg = linreg.predict(x_test)


In [28]:
# Model trained on the transformed data (Yeo-Johnson PowerTransformer).
linreg_pt = LinReg().fit(df1_x_train_pt, y_train)
y_pred_train_pt = linreg_pt.predict(df1_x_train_pt)

# Score of the transformed-data model on the training split.
train_score_pt = linreg_pt.score(df1_x_train_pt, y_train)
train_score_pt

0.4359520131133471

In [15]:
# Score of the baseline model on the training and test splits.
train_score, test_score = (
    linreg.score(x_train, y_train),
    linreg.score(x_test, y_test),
)
train_score, test_score

(0.5217065876763485, 0.5083881701685621)

In [19]:
from sklearn.metrics import mean_squared_error as mse

# Predictions of the baseline model on both splits. They are computed in this
# cell because no other cell defines y_pred_train / y_pred_test, so the cell
# would otherwise fail with a NameError on a fresh kernel.
# NOTE(review): assumes the reported metrics refer to the baseline `linreg`
# model rather than the transformed-data model - confirm.
y_pred_train = linreg.predict(x_train)
y_pred_test = linreg.predict(x_test)

# scikit-learn metrics take their arguments as (y_true, y_pred).
train_mse = mse(y_train, y_pred_train)
test_mse = mse(y_test, y_pred_test)
train_mse, test_mse

(40907.80860013017, 39766.476073207465)

In [20]:
from sklearn.metrics import mean_absolute_error as mae

# Predictions of the baseline model on both splits. They are computed in this
# cell because no other cell defines y_pred_train / y_pred_test, so the cell
# would otherwise fail with a NameError on a fresh kernel.
# NOTE(review): assumes the reported metrics refer to the baseline `linreg`
# model rather than the transformed-data model - confirm.
y_pred_train = linreg.predict(x_train)
y_pred_test = linreg.predict(x_test)

# scikit-learn metrics take their arguments as (y_true, y_pred).
train_mae = mae(y_train, y_pred_train)
test_mae = mae(y_test, y_pred_test)
train_mae, test_mae

(144.6228085334776, 144.2248907750531)