In [1]:
import pandas as pd

In [2]:
# Read the insurance CSV into a DataFrame and preview its first rows.
insurance_csv = "/data/insurance.csv"
df = pd.read_csv(insurance_csv)
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
# Separate the response column (y) from the remaining predictor columns (X).
target = "charges"
y = df[target]
X = df.drop(columns=[target])

In [8]:
# One-hot encode the non-numeric predictors; drop_first=True omits the first
# level of each encoded column. The trailing expression previews the result.
X_dummy = pd.get_dummies(data=X, drop_first=True)
X_dummy.head()

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [11]:
import numpy as np

In [61]:
# Seed NumPy's global generator so the ten draws below are repeatable, then
# scale the uniform [0, 1) samples to 0-99 and truncate them to integers.
np.random.seed(134234)
uniform_draws = np.random.random(10)
(uniform_draws * 100).astype("int")

array([59, 29, 88,  3, 56, 37, 35, 53, 26, 32])

In [98]:
from sklearn import model_selection, linear_model, metrics, preprocessing

In [67]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
split_parts = model_selection.train_test_split(
    X_dummy, y, test_size=0.3, random_state=1
)
X_train, X_test, y_train, y_test = split_parts

In [68]:
# Fraction of all rows that landed in the training split.
X_train.shape[0] / df.shape[0]

0.6995515695067265

In [69]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so the trailing expression displays the fitted model.
lr = linear_model.LinearRegression().fit(X_train, y_train)
lr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [70]:
# Predictions of the fitted model for the training and test predictors.
y_train_pred, y_test_pred = (lr.predict(features) for features in (X_train, X_test))

In [72]:
# Side-by-side table of training targets and predictions, plus the residual
# (actual minus prediction) for each row.
summary = pd.DataFrame({"actual": y_train, "prediction": y_train_pred}).assign(
    residual=lambda frame: frame["actual"] - frame["prediction"]
)
summary.head()

Unnamed: 0,actual,prediction,residual
744,8827.2099,9472.264706,-645.054806
363,2597.779,2233.928867,363.850133
10,2721.3208,3423.054245,-701.733445
970,10702.6424,10718.882756,-16.240356
634,9391.346,13789.363595,-4398.017595


In [75]:
# Mean squared error on the training split, computed by hand from the residuals.
squared_residuals = summary["residual"] ** 2
mse_training = np.mean(squared_residuals)
mse_training

36476790.76410683

In [80]:
# Root mean squared error on the training split: the square root of the
# hand-computed MSE, which puts the error back in the units of the target.
rmse_training = np.sqrt(mse_training)
rmse_training

6039.601871324536

In [77]:
# Cross-check: scikit-learn's MSE for the same training targets and predictions.
metrics.mean_squared_error(y_true=y_train, y_pred=y_train_pred)

36476790.76410686

In [78]:
# Mean absolute error on the training split, from the residual column.
absolute_residuals = summary["residual"].abs()
mae_train = np.mean(absolute_residuals)
mae_train

4154.405109254062

In [81]:
# RMSE for both splits via scikit-learn's MSE; displayed as (training, test).
rmse_training, rmse_test = (
    np.sqrt(metrics.mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

rmse_training, rmse_test


(6039.6018713245385, 6063.12265685045)

In [82]:
# Mean of the training targets: the constant prediction that the total sum of
# squares is measured against.
baseline = y_train.mean()

In [86]:
# SST: squared deviations of the training targets from the baseline mean.
# SSE: squared deviations of the training targets from the model predictions.
# The displayed ratio SSE/SST is the share of variation the model leaves unexplained.
deviation_from_baseline = y_train - baseline
deviation_from_model = y_train - y_train_pred
SST = np.sum(deviation_from_baseline ** 2)
SSE = np.sum(deviation_from_model ** 2)
SSE / SST

0.24544425073668388

In [87]:
# Coefficient of determination on the training split, computed by hand as
# one minus the ratio of residual to total sum of squares.
r2 = 1 - SSE/SST
r2

0.7545557492633161

In [89]:
# scikit-learn's R^2 for the training and test splits, displayed as (training, test).
tuple(
    metrics.r2_score(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

(0.7545557492633161, 0.7405989316927212)

In [100]:
# End-to-end rerun of the regression, this time on standardized features.

# Separate the response column from the predictors.
target = "charges"
y = df[target]
X = df.drop(columns=[target])

# One-hot encode, then hold out 30% of the rows with a fixed shuffle seed.
X_dummy = pd.get_dummies(data=X, drop_first=True)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_dummy, y, test_size=0.3, random_state=1
)

# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split. Both splits become float NumPy arrays here.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train.values.astype("float"))
X_test = scaler.transform(X_test.values.astype("float"))

# Fit on the scaled training data and predict for both splits.
lr = linear_model.LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = lr.predict(X_train), lr.predict(X_test)

# Error and goodness-of-fit metrics for each split.
evaluation_pairs = ((y_train, y_train_pred), (y_test, y_test_pred))
rmse_training, rmse_test = (
    np.sqrt(metrics.mean_squared_error(actual, predicted))
    for actual, predicted in evaluation_pairs
)
r2_training, r2_test = (
    metrics.r2_score(actual, predicted) for actual, predicted in evaluation_pairs
)

for label, value in (
    ("rmse_training: ", rmse_training),
    ("rmse_test", rmse_test),
    ("r2_training: ", r2_training),
    ("r2_test", r2_test),
):
    print(label, value)

rmse_training:  6039.6018713245385
rmse_test 6063.122656850451
r2_training:  0.7545557492633161
r2_test 0.7405989316927211


In [101]:
# Pair each fitted coefficient with the name of the encoded feature it belongs to.
coefficient_table = pd.DataFrame({"coefficient": lr.coef_, "feature": X_dummy.columns})
coefficient_table

Unnamed: 0,coefficient,feature
0,3528.982731,age
1,1961.655208,bmi
2,421.550164,children
3,-141.35911,gender_male
4,9733.786883,smoker_yes
5,-129.545886,region_northwest
6,-414.541483,region_southeast
7,-379.095344,region_southwest


In [102]:
# Intercept term of the fitted linear model.
lr.intercept_

13276.698553898505