# Import the libraries

In [139]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Import the dataset

In [140]:
# Load the churn dataset from a CSV path relative to the working directory,
# then preview the first rows to sanity-check the parsed columns.
data = pd.read_csv('data/churn_data.csv')
data.head()

Unnamed: 0,Budget,Profit,Spending,Cities,Churn_Rate
0,165349.2,136897.8,471784.1,Auckland,0.922618
1,162597.7,151377.59,443898.53,Wellington,0.917921
2,153441.51,101145.55,407934.54,Christchurch,0.910504
3,144372.41,118671.85,383199.62,Auckland,0.82902
4,142107.34,91391.77,366168.42,Christchurch,0.661879


# Data preparation

In [141]:
# One-hot encode the categorical 'Cities' column into Cities_<name> indicator columns.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 defaults to bool, which
# would turn the mixed float/bool feature matrix into an object array downstream.
# NOTE: this rebinds `data`, so re-running the cell without reloading the CSV fails
# because the 'Cities' column no longer exists.
data = pd.get_dummies(data, columns=['Cities'], dtype=int)
data.head()

Unnamed: 0,Budget,Profit,Spending,Churn_Rate,Cities_Auckland,Cities_Christchurch,Cities_Wellington
0,165349.2,136897.8,471784.1,0.922618,1,0,0
1,162597.7,151377.59,443898.53,0.917921,0,0,1
2,153441.51,101145.55,407934.54,0.910504,0,1,0
3,144372.41,118671.85,383199.62,0.82902,1,0,0
4,142107.34,91391.77,366168.42,0.661879,0,1,0


In [107]:
# Separate the regression target from the predictors; both end up as NumPy arrays.
target_column = 'Churn_Rate'
X = data.drop(columns=[target_column]).values
y = data[target_column].values

In [108]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [109]:
# Fit a multiple linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
regression = LinearRegression().fit(X_train, y_train)
regression  # echo the fitted estimator as the cell output

LinearRegression()

In [110]:
# Display the held-out feature matrix produced by train_test_split.
X_test

array([[6.6051520e+04, 1.8264556e+05, 1.1814820e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.0067196e+05, 9.1790610e+04, 2.4974455e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.0191308e+05, 1.1059411e+05, 2.2916095e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [2.7892920e+04, 8.4710770e+04, 1.6447071e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.5344151e+05, 1.0114555e+05, 4.0793454e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [7.2107600e+04, 1.2786455e+05, 3.5318381e+05, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [2.0229590e+04, 6.5947930e+04, 1.8526510e+05, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [6.1136380e+04, 1.5270192e+05, 8.8218230e+04, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [7.3994560e+04, 1.2278275e+05, 3.0331926e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.4210734e+05, 9.1391770e+04,

In [111]:
# Predict the churn rate for every row of the held-out test features with the fitted model.
y_pred = regression.predict(X_test)

In [112]:
# Collect the actual and predicted churn rates side by side for inspection.
df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

In [113]:
# Show the actual (y_test) and predicted (y_pred) values for the test rows.
df

Unnamed: 0,y_test,y_pred
0,0.032824,0.168031
1,0.442594,0.41512
2,0.461219,0.25723
3,0.222012,0.150833
4,0.910504,0.396264
5,0.050083,0.432134
6,0.187709,0.296958
7,0.025164,0.291946
8,0.103522,0.265624
9,0.661879,0.364305


In [119]:
# Predict a single observation. The values must follow the training column order:
# Budget, Profit, Spending, Cities_Auckland, Cities_Christchurch, Cities_Wellington,
# so the trailing 1, 0, 0 marks the city as Auckland.
a = [165349.20, 136897.80, 471784.10, 1, 0, 0]
b = np.array(a).reshape(1, -1)  # predict() expects a 2-D (n_samples, n_features) array
y_pred_single_obs = regression.predict(b)
# Index the single element explicitly before rounding so the cell shows a plain float.
round(float(y_pred_single_obs[0]), 2)

In [120]:
# Model evaluation: coefficient of determination (R^2) of the predictions
# against the true target values of the held-out test set.
r2_score(y_test, y_pred)

0.17366543990338135

# Saving the model

In [121]:
# Persist the fitted model with joblib (a standalone library, imported at the top of
# the notebook). It is used here because it copes efficiently with the large NumPy
# arrays that fitted estimators may hold.
# NOTE: joblib files are pickle-based; only load them back from a trusted source.
joblib.dump(regression, "multiple_regression_model.pkl")

['multiple_regression_model.pkl']

In [124]:
# Reload the persisted model and score one hand-built observation.
# NOTE: joblib files are pickle-based; only load files that come from a trusted source.
Cities_Auckland = 1
Cities_Wellington = 0
Cities_Christchurch = 0
Budget = 160349
Profit = 134321
Spending = 401400
# The order must match the training columns produced by get_dummies:
# Budget, Profit, Spending, Cities_Auckland, Cities_Christchurch, Cities_Wellington.
# (Wellington and Christchurch were previously swapped, which would silently
# mis-score any non-Auckland observation.)
pred_args = [Budget, Profit, Spending, Cities_Auckland, Cities_Christchurch, Cities_Wellington]
pred_args_arr = np.array(pred_args).reshape(1, -1)  # 2-D (1, n_features) for predict()
# Passing the path lets joblib open and close the file itself, so no handle is leaked.
ml_model = joblib.load("multiple_regression_model.pkl")
model_prediction = ml_model.predict(pred_args_arr)

# Index the single element explicitly: float() on a 1-element array is deprecated
# in NumPy >= 1.25.
round(float(model_prediction[0]), 2)

0.54