In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing, model_selection, linear_model, metrics

%matplotlib inline

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/abulbasar/data/master/insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
df_dummy = pd.get_dummies(df, drop_first=True)
df_dummy.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [7]:
target = "charges"
y = df_dummy[target]
del df_dummy[target]
df_dummy.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [13]:
X = preprocessing.StandardScaler().fit_transform(df_dummy)
pd.DataFrame(X, columns=df_dummy.columns).head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,-1.438764,-0.45332,-0.908614,-1.010519,1.970587,-0.566418,-0.611324,1.765481
1,-1.509965,0.509621,-0.078767,0.989591,-0.507463,-0.566418,1.635795,-0.566418
2,-0.797954,0.383307,1.580926,0.989591,-0.507463,-0.566418,1.635795,-0.566418
3,-0.441948,-1.305531,-0.908614,0.989591,-0.507463,1.765481,-0.611324,-0.566418
4,-0.513149,-0.292556,-0.908614,0.989591,-0.507463,1.765481,-0.611324,-0.566418


In [10]:
type(X)

numpy.ndarray

In [15]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
                                    test_size = 0.3, random_state = 1)

In [40]:
np.random.seed(1)
np.random.random(5)

array([  4.17022005e-01,   7.20324493e-01,   1.14374817e-04,
         3.02332573e-01,   1.46755891e-01])

In [42]:
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [43]:
lr.intercept_, lr.coef_

(13335.501903491961,
 array([ 3534.79856636,  1965.25421993,   415.69647563,  -141.3904328 ,
         9709.00227029,  -130.40449685,  -418.02075243,  -372.30969715]))

In [44]:
pd.DataFrame({"feature": df_dummy.columns, "weight": lr.coef_})

Unnamed: 0,feature,weight
0,age,3534.798566
1,bmi,1965.25422
2,children,415.696476
3,sex_male,-141.390433
4,smoker_yes,9709.00227
5,region_northwest,-130.404497
6,region_southeast,-418.020752
7,region_southwest,-372.309697


In [45]:
y_test_pred = lr.predict(X_test)

In [53]:
result = pd.DataFrame({"actual": y_test, "prediction": y_test_pred})
result["error"] = result.actual - result.prediction
result.head()

Unnamed: 0,actual,prediction,error
559,1646.4297,4610.315541,-2963.885841
1087,11353.2276,12887.89388,-1534.66628
1020,8798.593,12573.948752,-3775.355752
460,10381.4787,13197.836626,-2816.357926
802,2103.08,629.337182,1473.742818


In [52]:
SSE = np.sum(result.error ** 2)
SSE

14778105453.509331

In [55]:
MSE = SSE / len(y_test)
MSE

36761456.35201326

In [57]:
RMSE = np.sqrt(MSE)
RMSE

6063.1226568504499

In [61]:
y_train_mean = np.mean(y_train)
SST = np.sum((y_train_mean - y_test) ** 2)
SST

56970277938.449242

In [62]:
SSE/SST

0.25940026954889728

In [64]:
R2 = 1 - SSE/SST
R2

0.74059973045110272

In [66]:
metrics.r2_score(y_test, y_test_pred)

0.74059893169272106

In [67]:
y_test_mean = np.mean(y_test)
SST = np.sum((y_test_mean - y_test) ** 2)
1 - SSE/SST

0.74059893169272106