In [1]:
import pandas as pd
import numpy as np
import datetime
import warnings

# Blanket filter: suppress every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')
# Ignore warnings whose message starts with "numpy.ufunc size changed".
# NOTE(review): this is already covered by the blanket filter above.
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set_style("whitegrid")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw customer dataset; the bare file name is resolved against the
# notebook's working directory.
DATA_PATH = "we_fn_use_c_marketing_customer_value_analysis.csv"
customer_df = pd.read_csv(DATA_PATH)

In [3]:
# Normalise the headers to snake_case: lower-case, spaces -> underscores.
customer_df.columns = customer_df.columns.str.lower().str.replace(' ', '_')

# The date column gets a shorter name and a real datetime dtype.
customer_df = customer_df.rename(columns={'effective_to_date': 'datetime'})
customer_df['datetime'] = pd.to_datetime(customer_df['datetime'])

# Displayed as the cell output to confirm the conversion.
customer_df['datetime'].dtypes

dtype('<M8[ns]')

In [4]:
# Object-dtype columns are treated as the categorical features.
categorical = customer_df.select_dtypes(include='object')

# Select the numeric groups by label. Unlike pd.DataFrame(df, columns=[...]),
# which reindexes and silently creates an all-NaN column for a missing or
# misspelled name, label selection raises KeyError immediately. The explicit
# .copy() gives independent frames, so adding columns to them later cannot
# affect customer_df.
# NOTE(review): the names `discrete` / `continuous` are kept because later
# cells reference them.
discrete = customer_df[['customer_lifetime_value', 'income', 'monthly_premium_auto',
                        'number_of_open_complaints', 'number_of_policies',
                        'total_claim_amount']].copy()
continuous = customer_df[['months_since_last_claim', 'months_since_policy_inception',
                          'datetime']].copy()

In [5]:
cat = categorical.columns

### ONE HOT ENCODING for categorical data

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown='error', drop='first').fit(categorical)
encoded = encoder.transform(categorical).toarray()

# Column labels must follow the encoder's own category order. The encoder
# sorts the categories of each feature and, with drop='first', removes the
# first *sorted* one. Labelling with Series.unique()[1:] (order of first
# appearance) would therefore attach the wrong name to most columns.
x = [label for labels in encoder.categories_ for label in labels[1:]]

# Keep the source index so a later column-wise concat aligns row for row.
cat_data = pd.DataFrame(encoded, columns=x, index=categorical.index)

In [6]:
# Break the timestamp into numeric calendar parts usable as model features.
continuous['year'] = continuous['datetime'].dt.year
continuous['month'] = continuous['datetime'].dt.month
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week yields the same ISO week number. It comes back as a
# nullable UInt32, so cast to a plain integer like the other date parts.
# NOTE(review): the cast assumes no missing dates (NaT) in the column.
continuous['week'] = continuous['datetime'].dt.isocalendar().week.astype('int64')

In [7]:
# Join the three feature groups side by side (column-wise concat).
customer_final = pd.concat(
    [discrete, continuous, cat_data],
    axis='columns',
)

In [8]:
# Remove the raw timestamp column from the modelling frame.
# Re-running this cell alone raises KeyError, since the column is already gone.
customer_final = customer_final.drop(columns='datetime')

Lab : Regression Model

1. Train-test split with Linear Regression

In [9]:
# Features are every column except the claim amount; the claim amount is the target.
X = customer_final.drop(columns='total_claim_amount')
y = customer_final['total_claim_amount']

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19698,
)

In [12]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

In [14]:
# Ordinary least-squares regressor with default settings.
lm = LinearRegression()

In [15]:
# Fit on the training split and keep the value returned by fit().
lm_model = lm.fit(X=X_train, y=y_train)

In [16]:
# Predict the target for the held-out rows.
lm_predictions = lm_model.predict(X=X_test)

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Test-set error metrics for the linear model's predictions.
MSE = mean_squared_error(y_test, lm_predictions)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, lm_predictions)
R2 = r2_score(y_test, lm_predictions)

# One report line per metric, each with its own number format.
for template, value in (
    ("The mean absolute error of the model in the test set is: %6.2f", MAE),
    ("The mean squared error of the model in the test set is: %6.2f", MSE),
    ("The root mean squared error of the model in the test set is: %6.2f", RMSE),
    ("The R2 of the model in the test set is: %4.2f", R2),
):
    print(template % value)

The mean absolute error of the model in the test set is: 121.72
The mean squared error of the model in the test set is: 26052.39
The root mean squared error of the model in the test set is: 161.41
The R2 of the model in the test set is: 0.68


Now define a function that takes a list of models and trains (and tests) them, so we can try a lot of them without repeating code.

In [24]:
def model_test(z):
    """Fit estimator ``z`` on a fixed 80/20 split and print its test-set metrics.

    Parameters
    ----------
    z : estimator
        Object exposing ``fit(X, y)`` (returning a fitted estimator) and
        ``predict(X)``.

    Notes
    -----
    Uses the notebook-level ``X`` and ``y``. The split is re-created on every
    call with the same ``random_state``. Prints MAE, MSE, RMSE and R2 and
    returns nothing.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=0.2, random_state=19698
    )
    fitted = z.fit(features_train, target_train)
    predicted = fitted.predict(features_test)

    mse = mean_squared_error(target_test, predicted)
    report = (
        ("The mean absolute error of the model in the test set is: %6.2f",
         mean_absolute_error(target_test, predicted)),
        ("The mean squared error of the model in the test set is: %6.2f", mse),
        ("The root mean squared error of the model in the test set is: %6.2f",
         np.sqrt(mse)),
        ("The R2 of the model in the test set is: %4.2f",
         r2_score(target_test, predicted)),
    )
    for template, value in report:
        print(template % value)

In [28]:
# Baseline: ordinary least squares evaluated through the shared helper.
model_test(LinearRegression())

The mean absolute error of the model in the test set is: 121.72
The mean squared error of the model in the test set is: 26052.39
The root mean squared error of the model in the test set is: 161.41
The R2 of the model in the test set is: 0.68


In [30]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regressor using the 2 closest training rows.
knn_k2 = KNeighborsRegressor(n_neighbors=2)
model_test(knn_k2)

The mean absolute error of the model in the test set is: 174.72
The mean squared error of the model in the test set is: 66726.84
The root mean squared error of the model in the test set is: 258.32
The R2 of the model in the test set is: 0.17


In [31]:
# Same evaluation with 3 neighbours.
knn_k3 = KNeighborsRegressor(n_neighbors=3)
model_test(knn_k3)

The mean absolute error of the model in the test set is: 168.40
The mean squared error of the model in the test set is: 62065.64
The root mean squared error of the model in the test set is: 249.13
The R2 of the model in the test set is: 0.23


In [51]:
def kneighbors_test(z, scores=None):
    """Fit a K-nearest-neighbours regressor with ``z`` neighbours and print its test metrics.

    Parameters
    ----------
    z : int
        Passed to ``KNeighborsRegressor`` as ``n_neighbors``.
    scores : list, optional
        When given, the test-set R2 of this run is appended to it, so a caller
        can collect one score per ``z`` over several calls. Defaults to
        ``None`` (nothing is recorded).

    Notes
    -----
    Uses the notebook-level ``X`` and ``y``. The 80/20 split is re-created on
    every call with the same ``random_state``. Prints MAE, MSE, RMSE and R2
    and returns nothing.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19698)
    km = KNeighborsRegressor(n_neighbors=z)
    km_model = km.fit(X_train, y_train)
    km_predictions = km_model.predict(X_test)
    MAE = mean_absolute_error(y_test, km_predictions)
    MSE = mean_squared_error(y_test, km_predictions)
    RMSE = np.sqrt(MSE)
    R2 = r2_score(y_test, km_predictions)
    if scores is not None:
        # The previous version appended to an undefined global `scores` using
        # an undefined `model`, which raised NameError. R2 is the value a
        # regressor's score() reports, so reuse it instead of predicting twice.
        scores.append(R2)
    print("The mean absolute error of the model in the test set is: %6.2f" % (MAE))
    print("The mean squared error of the model in the test set is: %6.2f" % (MSE))
    print("The root mean squared error of the model in the test set is: %6.2f" % (RMSE))
    print("The R2 of the model in the test set is: %4.2f" % (R2))

In [42]:
# Evaluate KNN with 2 neighbours.
kneighbors_test(z=2)

The mean absolute error of the model in the test set is: 174.72
The mean squared error of the model in the test set is: 66726.84
The root mean squared error of the model in the test set is: 258.32
The R2 of the model in the test set is: 0.17


In [43]:
# Evaluate KNN with 3 neighbours.
kneighbors_test(z=3)

The mean absolute error of the model in the test set is: 168.40
The mean squared error of the model in the test set is: 62065.64
The root mean squared error of the model in the test set is: 249.13
The R2 of the model in the test set is: 0.23


In [44]:
# Evaluate KNN with 4 neighbours.
kneighbors_test(z=4)

The mean absolute error of the model in the test set is: 166.13
The mean squared error of the model in the test set is: 59823.17
The root mean squared error of the model in the test set is: 244.59
The R2 of the model in the test set is: 0.26


In [45]:
# Evaluate KNN with 5 neighbours.
kneighbors_test(z=5)

The mean absolute error of the model in the test set is: 167.15
The mean squared error of the model in the test set is: 58967.42
The root mean squared error of the model in the test set is: 242.83
The R2 of the model in the test set is: 0.27


In [46]:
# Evaluate KNN with 6 neighbours.
kneighbors_test(z=6)

The mean absolute error of the model in the test set is: 167.31
The mean squared error of the model in the test set is: 58122.67
The root mean squared error of the model in the test set is: 241.09
The R2 of the model in the test set is: 0.28


In [47]:
# Sweep the neighbour count from 2 to 9 and print the metrics for each value.
for n_neighbors in range(2, 10):
    kneighbors_test(n_neighbors)

The mean absolute error of the model in the test set is: 174.72
The mean squared error of the model in the test set is: 66726.84
The root mean squared error of the model in the test set is: 258.32
The R2 of the model in the test set is: 0.17
The mean absolute error of the model in the test set is: 168.40
The mean squared error of the model in the test set is: 62065.64
The root mean squared error of the model in the test set is: 249.13
The R2 of the model in the test set is: 0.23
The mean absolute error of the model in the test set is: 166.13
The mean squared error of the model in the test set is: 59823.17
The root mean squared error of the model in the test set is: 244.59
The R2 of the model in the test set is: 0.26
The mean absolute error of the model in the test set is: 167.15
The mean squared error of the model in the test set is: 58967.42
The root mean squared error of the model in the test set is: 242.83
The R2 of the model in the test set is: 0.27
The mean absolute error of the m

In [56]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression

# Multi-layer perceptron regressor; the seed fixes its random initialisation.
mlp = MLPRegressor(max_iter=500, random_state=1)
model_test(mlp)

The mean absolute error of the model in the test set is: 120.65
The mean squared error of the model in the test set is: 30722.06
The root mean squared error of the model in the test set is: 175.28
The R2 of the model in the test set is: 0.62


In [57]:
def model_test(z):
    """Fit estimator ``z`` on a fixed 80/20 split and print its test-set metrics.

    This cell re-declares the ``model_test`` helper defined earlier in the
    notebook; the later definition is the one in effect for the cells below.

    Parameters
    ----------
    z : estimator
        Object exposing ``fit(X, y)`` (returning a fitted estimator) and
        ``predict(X)``.

    Notes
    -----
    Uses the notebook-level ``X`` and ``y``. The split is re-created on every
    call with the same ``random_state``. Prints MAE, MSE, RMSE and R2 and
    returns nothing.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=19698)
    lm = z
    lm_model = lm.fit(X_train, y_train)
    lm_predictions = lm_model.predict(X_test)
    # The former `lm_score = lm.score(X_test, y_test)` was never used and
    # forced a second prediction pass; the R2 below reports the same quantity.
    MAE = mean_absolute_error(y_test, lm_predictions)
    MSE = mean_squared_error(y_test, lm_predictions)
    RMSE = np.sqrt(MSE)
    R2 = r2_score(y_test, lm_predictions)
    print("The mean absolute error of the model in the test set is: %6.2f" % (MAE))
    print("The mean squared error of the model in the test set is: %6.2f" % (MSE))
    print("The root mean squared error of the model in the test set is: %6.2f" % (RMSE))
    print("The R2 of the model in the test set is: %4.2f" % (R2))

In [58]:
# Re-run the MLP evaluation with the helper as redefined in the cell above.
mlp = MLPRegressor(max_iter=500, random_state=1)
model_test(mlp)

The mean absolute error of the model in the test set is: 120.65
The mean squared error of the model in the test set is: 30722.06
The root mean squared error of the model in the test set is: 175.28
The R2 of the model in the test set is: 0.62
