In [89]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, linear_model, model_selection, metrics, pipeline

In [2]:
df = pd.read_csv("/data/insurance.csv")

In [3]:
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
target = "charges"
X = df.copy()
del X[target]
y = df[target]

In [9]:
X = pd.get_dummies(X, drop_first=True)

In [11]:
X.head()

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [12]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
                                        test_size = 0.3, random_state = 1)

In [56]:
np.random.seed(1)
np.random.random([10])

array([4.17022005e-01, 7.20324493e-01, 1.14374817e-04, 3.02332573e-01,
       1.46755891e-01, 9.23385948e-02, 1.86260211e-01, 3.45560727e-01,
       3.96767474e-01, 5.38816734e-01])

In [58]:
X_train.shape[0]/len(df)

0.6995515695067265

In [63]:
# Z = (X - X.mean()) / X.std() ... for each column 
# Goal is to have 0 mean and 1 standard deviation of each feature
scaler = preprocessing.StandardScaler()
scaler.fit(X_train) # it calculate X.mean() and X.std()
X_train_std = scaler.transform(X_train) # calculate Z score for each column
X_test_std = scaler.transform(X_test)
pd.DataFrame(X_train_std) 

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.797152,-0.702114,-0.904002,0.976766,-0.509664,1.788102,-0.600387,-0.587220
1,-1.271085,-0.703758,-0.085679,-1.023787,-0.509664,-0.559252,-0.600387,1.702939
2,-0.985811,-0.733340,-0.904002,0.976766,-0.509664,-0.559252,-0.600387,-0.587220
3,0.797152,-0.414510,1.550967,-1.023787,-0.509664,-0.559252,1.665591,-0.587220
4,0.868471,1.482037,-0.085679,0.976766,-0.509664,-0.559252,-0.600387,1.702939
5,-0.985811,1.749098,-0.904002,-1.023787,-0.509664,-0.559252,-0.600387,-0.587220
6,-0.843174,0.117970,-0.904002,-1.023787,1.962076,-0.559252,-0.600387,1.702939
7,1.724293,-0.608437,-0.904002,-1.023787,1.962076,1.788102,-0.600387,-0.587220
8,-1.485041,0.525546,-0.904002,-1.023787,-0.509664,-0.559252,1.665591,-0.587220
9,-1.485041,-1.248563,-0.904002,0.976766,-0.509664,-0.559252,-0.600387,-0.587220


In [65]:
lr = linear_model.LinearRegression()
lr.fit(X_train_std, y_train) # Create the model

y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

In [66]:
summary = pd.DataFrame({"actual": y_test, "predicted": y_test_pred})

In [67]:
summary["error"] = summary.actual - summary.predicted

In [68]:
summary.head()

Unnamed: 0,actual,predicted,error
559,1646.4297,4610.315541,-2963.885841
1087,11353.2276,12887.89388,-1534.66628
1020,8798.593,12573.948752,-3775.355752
460,10381.4787,13197.836626,-2816.357926
802,2103.08,629.337182,1473.742818


In [71]:
sse = np.sum(summary.error ** 2) # sum of squared error
sse

14778105453.509336

In [74]:
mse = sse / (len(y_test) - 1) # mean squared error

In [73]:
mse

36853130.80675645

In [75]:
rmse = np.sqrt(mse) # root mean squared error

In [76]:
rmse

6070.677952811898

In [77]:
y_baseline = np.mean(y_train)

In [79]:
sst = np.sum((y_baseline - y_test) ** 2) # sum of squared total

In [80]:
sse/sst 
# A. < 0, B. (0, 1) C. > 1.0

0.2594002695488974

In [81]:
r2 = 1 - sse/sst

In [82]:
r2

0.7405997304511026

In [88]:
target = "charges"
X = df.copy()
del X[target]
y = df[target]

X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
                                        test_size = 0.3, random_state = 1)

scaler = preprocessing.StandardScaler()
scaler.fit(X_train) # it calculate X.mean() and X.std()
X_train_std = scaler.transform(X_train) # calculate Z score for each column
X_test_std = scaler.transform(X_test)
pd.DataFrame(X_train_std) 

lr = linear_model.LinearRegression()
lr.fit(X_train_std, y_train) # Create the model

y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

print("training r2: %f\ntest r2: %f\ntraining rmse: %f\ntest: rmse: %f" %(
    metrics.r2_score(y_train, y_train_pred),
    metrics.r2_score(y_test, y_test_pred),
    np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)),
    np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
))


training r2: 0.754556
test r2: 0.740599
training rmse: 6039.601871
test: rmse: 6063.122657


In [91]:
target = "charges"
X = df.copy()
del X[target]
y = df[target]

X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
                                        test_size = 0.3, random_state = 1)

est = pipeline.Pipeline([
    ("scaler", preprocessing.StandardScaler()),
    ("lr", linear_model.LinearRegression())
])
est.fit(X_train, y_train)

y_train_pred = est.predict(X_train)
y_test_pred = est.predict(X_test)

print("training r2: %f\ntest r2: %f\ntraining rmse: %f\ntest: rmse: %f" %(
    metrics.r2_score(y_train, y_train_pred),
    metrics.r2_score(y_test, y_test_pred),
    np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)),
    np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
))

training r2: 0.754556
test r2: 0.740599
training rmse: 6039.601871
test: rmse: 6063.122657
