In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the insurance dataset from CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute; consider a configurable data directory.
df = pd.read_csv("/data/insurance.csv")
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
# Separate the regression target ("charges") from the predictor columns.
target = "charges"
y = df[target]
# drop() returns a new frame, so df itself keeps the target column.
X = df.drop(columns=[target])
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
5        3756.62160
6        8240.58960
7        7281.50560
8        6406.41070
9       28923.13692
10       2721.32080
11      27808.72510
12       1826.84300
13      11090.71780
14      39611.75770
15       1837.23700
16      10797.33620
17       2395.17155
18      10602.38500
19      36837.46700
20      13228.84695
21       4149.73600
22       1137.01100
23      37701.87680
24       6203.90175
25      14001.13380
26      14451.83515
27      12268.63225
28       2775.19215
29      38711.00000
           ...     
1308    33900.65300
1309     6875.96100
1310     6940.90985
1311     4571.41305
1312     4536.25900
1313    36397.57600
1314    18765.87545
1315    11272.33139
1316     1731.67700
1317     1163.46270
1318    19496.71917
1319     7201.70085
1320     5425.02335
1321    28101.33305
1322    12981.34570
1323    43896.37630
1324     4239.89265
1325    13143.33665
1326     7050.02130


In [11]:
# One-hot encode the categorical columns (gender, smoker, region).
# drop_first=True omits the first level of each category, so a k-level
# column becomes k-1 indicator columns.
X_dummy = pd.get_dummies(X, drop_first=True)
X_dummy.head()

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [12]:
from sklearn import model_selection

In [13]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_dummy,
    y,
    test_size=0.3,
    random_state=1,
)

In [14]:
# Fraction of all rows that ended up in the training split (should be close to 0.7).
len(X_train)/len(X)

0.6995515695067265

In [15]:
# Inspect the training features; the row labels are the original (shuffled) index values.
X_train

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
744,50,26.410,0,1,0,1,0,0
363,21,26.400,1,0,0,0,0,1
10,25,26.220,0,1,0,0,0,0
970,50,28.160,3,0,0,0,1,0
634,51,39.700,1,1,0,0,0,1
554,25,41.325,0,0,0,0,0,0
314,27,31.400,0,0,1,0,0,1
419,63,26.980,0,0,1,1,0,0
525,18,33.880,0,0,0,0,1,0
1041,18,23.085,0,1,0,0,0,0


In [16]:
from sklearn import preprocessing

In [17]:
# Standardizer that rescales each feature using its mean and standard deviation.
scaler = preprocessing.StandardScaler()

In [19]:
# Learn the scaling statistics from the training split only.
scaler.fit(X_train) # calculate mean and std for each feature

StandardScaler(copy=True, with_mean=True, with_std=True)

In [21]:
# Apply the statistics learned from the training split to both splits.
X_train_std = scaler.transform(X_train) # z-score of every training value
X_test_std = scaler.transform(X_test)

In [24]:
# Sanity-check the standardization: every feature should have mean ~0 and std ~1.
# The scaled data is a bare array, so the feature names are re-attached from
# X_train; otherwise the summary columns are labelled 0..7 and cannot be read.
# describe() reports the sample standard deviation (ddof=1) while StandardScaler
# divides by the population standard deviation, which is why std shows up
# slightly above 1 instead of exactly 1.
pd.DataFrame(X_train_std, columns=X_train.columns).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0
mean,-1.791065e-16,-4.635418e-16,-2.910332e-16,1.7673420000000003e-17,8.979047e-17,-1.088873e-16,4.886879e-17,-2.3722710000000002e-17
std,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535
min,-1.485041,-2.419524,-0.9040023,-1.023787,-0.5096643,-0.5592522,-0.6003875,-0.5872202
25%,-0.9144925,-0.7247116,-0.9040023,-1.023787,-0.5096643,-0.5592522,-0.6003875,-0.5872202
50%,-0.05867016,-0.03076297,-0.08567913,0.9767656,-0.5096643,-0.5592522,-0.6003875,-0.5872202
75%,0.8684707,0.6631857,0.732644,0.9767656,-0.5096643,-0.5592522,1.665591,1.702939
max,1.795612,3.689196,3.187613,0.9767656,1.962076,1.788102,1.665591,1.702939


In [25]:
# Same preparation in compact form: fit and transform the training split in one
# call, then transform the test split with the training statistics.
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [26]:
from sklearn import linear_model

In [28]:
# Fit an ordinary least squares model on the standardized training features.
lr = linear_model.LinearRegression()
lr.fit(X_train_std, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [29]:
# Fitted intercept: the prediction when every standardized feature is 0.
lr.intercept_

13276.698553898505

In [30]:
# Fitted coefficients, one per column of the standardized feature matrix.
lr.coef_

array([3528.98273114, 1961.65520818,  421.55016415, -141.35911014,
       9733.78688309, -129.54588605, -414.54148288, -379.09534369])

In [31]:
# Pair each coefficient with its feature name; coef_ follows the column order
# of X_dummy, which is the frame the training split was taken from.
pd.DataFrame({"feature": X_dummy.columns, "coef": lr.coef_})

Unnamed: 0,feature,coef
0,age,3528.982731
1,bmi,1961.655208
2,children,421.550164
3,gender_male,-141.35911
4,smoker_yes,9733.786883
5,region_northwest,-129.545886
6,region_southeast,-414.541483
7,region_southwest,-379.095344


In [37]:
# Predictions for both splits, used below to compare training and test error.
y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

In [40]:
# Put actual and predicted test charges side by side and add the residual
# (actual minus prediction). The frame keeps the index of y_test.
result = pd.DataFrame({"actual": y_test, "prediction": y_test_pred}).assign(
    error=lambda frame: frame["actual"] - frame["prediction"]
)
result.head()

Unnamed: 0,actual,prediction,error
559,1646.4297,4610.315541,-2963.885841
1087,11353.2276,12887.89388,-1534.66628
1020,8798.593,12573.948752,-3775.355752
460,10381.4787,13197.836626,-2816.357926
802,2103.08,629.337182,1473.742818


In [44]:
# Sum of squared errors of the test predictions.
SSE = (result["error"] ** 2).sum()
SSE

14778105453.509336

In [45]:
# Total sum of squares: squared deviations of the test targets from their mean.
SST = ((y_test - y_test.mean()) ** 2).sum()
SST

56970102513.23878

In [46]:
# Share of the test-set variation that the model leaves unexplained.
SSE/SST

0.25940106830727894

In [48]:
# Coefficient of determination (R^2) on the test split, computed by hand.
r2 = 1 - SSE/SST
r2

0.7405989316927211

In [49]:
# Mean squared error of the test predictions.
MSE = (result["error"] ** 2).mean()
MSE

36761456.35201328

In [50]:
# Root mean squared error: same units as the target (charges).
RMSE = np.sqrt(MSE)
RMSE

6063.122656850452

In [51]:
from sklearn import metrics

In [52]:
# Cross-check the hand-computed MSE against scikit-learn's implementation.
metrics.mean_squared_error(y_test, y_test_pred)

36761456.352013275

In [54]:
# Baseline model, end to end: one-hot encode, split, standardize, fit, score.
target = "charges"
y = df[target]
X = df.drop(columns=[target])

X_dummy = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_dummy, y, test_size=0.3, random_state=1
)

# Scaling statistics come from the training split only.
scaler = preprocessing.StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

lr = linear_model.LinearRegression()
lr.fit(X_train_std, y_train)

y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

# Score both splits so that over- or under-fitting is visible at a glance.
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

print(
    "training r2: %f, \ntest r2: %f, \ntraining rmse: %f \ntesting rmse: %f"
    % (train_r2, test_r2, train_rmse, test_rmse)
)

training r2: 0.754556, 
test r2: 0.740599, 
training rmse: 6039.601871 
testing rmse: 6063.122657


In [56]:
# Same workflow as the baseline, with two hand-made indicator features added
# before encoding.
target = "charges"
y = df[target]
X = df.drop(columns=[target])

# Indicator features: 1 when the condition holds, else 0.
X["bmi_high"] = np.where(X["bmi"] > 30, 1, 0)
# NOTE(review): the name says "le" but the comparison is a strict "<" — confirm
# which of the two is intended.
X["age_le_40"] = np.where(X["age"] < 40, 1, 0)

X_dummy = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_dummy, y, test_size=0.3, random_state=1
)

# Scaling statistics come from the training split only.
scaler = preprocessing.StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

lr = linear_model.LinearRegression()
lr.fit(X_train_std, y_train)

y_train_pred = lr.predict(X_train_std)
y_test_pred = lr.predict(X_test_std)

# Score both splits so the effect of the new features can be compared.
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

print(
    "training r2: %f, \ntest r2: %f, \ntraining rmse: %f \ntesting rmse: %f"
    % (train_r2, test_r2, train_rmse, test_rmse)
)

training r2: 0.760765, 
test r2: 0.743648, 
training rmse: 5962.722927 
testing rmse: 6027.388939


In [57]:
from sklearn import pipeline

In [66]:
# Polynomial regression through a scikit-learn Pipeline: expand the features
# to degree 4, standardize them, then fit a linear model.
target = "charges"
X = df.copy()
del X[target]
y = df[target]

# Feature generation
X["bmi_high"] = np.where(X.bmi > 30, 1, 0)
# NOTE(review): the name says "le" but the comparison is a strict "<" — confirm
# which of the two is intended.
X["age_le_40"] = np.where(X.age < 40, 1, 0)

# Convert the categorical features into one hot encoded values
X_dummy = pd.get_dummies(X, drop_first=True)

# Split the data into training and test bucket
X_train, X_test, y_train, y_test = \
model_selection.train_test_split(X_dummy, y
        , test_size = 0.3, random_state = 1)

# Build a pipeline for data preparation and model building
pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=4, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LinearRegression())
])

# Perform training
# The pipeline does its own scaling, so it is given the raw split made in this
# cell. The previous code passed X_train_std / X_test_std, which are
# already-standardized arrays left over from an earlier cell: this cell only
# worked if that cell had been run first, and its own X_train / X_test were
# never used.
pipe.fit(X_train, y_train)

# Make inferences
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Measure performance
# A large gap between training and test scores indicates overfitting from the
# high polynomial degree. Metrics may differ slightly from a run that used the
# pre-scaled arrays.
print("training r2: %f, \ntest r2: %f, \ntraining rmse: %f \ntesting rmse: %f" % (
    metrics.r2_score(y_train, y_train_pred),
    metrics.r2_score(y_test, y_test_pred),
    np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)),
    np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))
))

training r2: 0.917445, 
test r2: 0.537159, 
training rmse: 3502.711932 
testing rmse: 8098.909477


In [60]:
# Toy example of a degree-2 polynomial expansion without the bias column.
# For input columns (x0, x1, x2) the output columns are
# x0, x1, x2, x0^2, x0*x1, x0*x2, x1^2, x1*x2, x2^2.
a = np.array([[3, 4, 2], [0, 4, 1], [-1, 3, 5], [1, 0, 2]])
poly = preprocessing.PolynomialFeatures(include_bias=False, degree=2)
poly.fit_transform(a)

array([[ 3.,  4.,  2.,  9., 12.,  6., 16.,  8.,  4.],
       [ 0.,  4.,  1.,  0.,  0.,  0., 16.,  4.,  1.],
       [-1.,  3.,  5.,  1., -3., -5.,  9., 15., 25.],
       [ 1.,  0.,  2.,  1.,  0.,  2.,  0.,  0.,  4.]])

In [63]:
# Show how quickly the feature count grows: 200 random input columns expanded
# to degree 3. Only the shape is displayed, so the unseeded random values do
# not affect the result.
a = np.random.random((10, 200))
poly = preprocessing.PolynomialFeatures(degree=3, include_bias=False)
poly.fit_transform(a).shape


(10, 1373700)

In [None]:
# Scratch cell: it previously held the unfinished expression `preprocessing.`
# (left over from tab-completion), which is a SyntaxError when run.
# There is nothing to execute here.