In [3]:
import pandas as pd

pd.options.display.max_rows = 10


In [4]:
df = pd.read_csv("/data/insurance.csv")
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
pd.get_dummies(df, drop_first=True)

Unnamed: 0,age,bmi,children,charges,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,1
1,18,33.770,1,1725.55230,1,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.880,0,3866.85520,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,1


In [10]:
target = "charges"
y = df[target]
X = df.copy()
del X[target]
X = pd.get_dummies(df.copy(), drop_first=True)


Z-score = (x - mean(x))/std(x) for every column x, it makes the all the features 0 mean and 1 std

In [11]:
from sklearn import preprocessing

In [13]:
scaler = preprocessing.StandardScaler()
scaler.fit(X) # calculating mean and std for each column
X_std = scaler.transform(X) # using mean and std calculates the std

In [14]:
X.describe()

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,0.505232,0.204783,0.2429,0.272048,0.2429
std,14.04996,6.098187,1.205493,0.50016,0.403694,0.428995,0.445181,0.428995
min,18.0,15.96,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,26.29625,0.0,0.0,0.0,0.0,0.0,0.0
50%,39.0,30.4,1.0,1.0,0.0,0.0,0.0,0.0
75%,51.0,34.69375,2.0,1.0,0.0,0.0,1.0,0.0
max,64.0,53.13,5.0,1.0,1.0,1.0,1.0,1.0


In [15]:
pd.DataFrame(X_std).describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,-1.853691e-16,-1.869041e-16,2.7216230000000002e-17,-7.567439e-17,2.142448e-16,-2.846087e-16,1.186561e-16,1.481127e-16
std,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374,1.000374
min,-1.509965,-2.412011,-0.9086137,-1.010519,-0.5074631,-0.5664179,-0.6113237,-0.5664179
25%,-0.8691547,-0.7164063,-0.9086137,-1.010519,-0.5074631,-0.5664179,-0.6113237,-0.5664179
50%,-0.01474046,-0.0432088,-0.07876719,0.9895908,-0.5074631,-0.5664179,-0.6113237,-0.5664179
75%,0.8396738,0.6611572,0.7510793,0.9895908,-0.5074631,-0.5664179,1.635795,-0.5664179
max,1.765289,3.685522,3.240619,0.9895908,1.970587,1.765481,1.635795,1.765481


In [16]:
from sklearn import model_selection

In [17]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_std, y
                                        , test_size = 0.3, random_state = 1)

In [18]:
import numpy as np

In [70]:
np.random.seed(1234)
np.random.random(5)

array([0.19151945, 0.62210877, 0.43772774, 0.78535858, 0.77997581])

In [71]:
from sklearn import linear_model

In [74]:
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [77]:
y_test_pred = model.predict(X_test)

In [83]:
result = pd.DataFrame({"actual": y_test, "predicted": y_test_pred})
result["error"] = result.actual - result.predicted
mse = np.mean(result.error ** 2)
rmse = np.sqrt(mse)
rmse

6063.122656850448

In [84]:
np.std(y_test)

11904.481211649017

In [85]:
model.intercept_ # theta0

13335.50190349196

In [87]:
model.coef_ # these are theta1, theta2 ... 

array([3534.79856636, 1965.25421993,  415.69647563, -141.3904328 ,
       9709.00227029, -130.40449685, -418.02075243, -372.30969715])

In [88]:
pd.DataFrame({"feature": X.columns, "theta": model.coef_})

Unnamed: 0,feature,theta
0,age,3534.798566
1,bmi,1965.25422
2,children,415.696476
3,gender_male,-141.390433
4,smoker_yes,9709.00227
5,region_northwest,-130.404497
6,region_southeast,-418.020752
7,region_southwest,-372.309697


y_hat = 13335.50190349196 + 3534.798566 * age + 1965.254220 * bmi + ...

In [90]:
SST = np.sum((y_test - np.mean(y_train)) ** 2)

In [91]:
SSE = np.sum((y_test - y_test_pred) ** 2)

In [93]:
SSE/SST

0.2594002695488973

In [96]:
r2 = 1 - SSE / SST # tells you is the model good enough?
# best possible value of r2 is 1
# exect min value of r2 is 0
# negative value of r2 means the model is not useful .. it is random 

In [95]:
r2

0.7405997304511027

In [97]:
from sklearn import metrics

In [98]:
metrics.mean_squared_error(y_test, y_test_pred)

36761456.35201326

In [100]:
metrics.r2_score(y_test, y_test_pred)

0.7405989316927211

In [101]:
y_train_pred = model.predict(X_train)

In [102]:
metrics.r2_score(y_train, y_train_pred)

0.7545557492633161

In [103]:
metrics.mean_squared_error(y_train, y_train_pred)

36476790.76410686

In [105]:
print("Training mse:", metrics.mean_squared_error(y_train, y_train_pred),
     "\nTest mse:", metrics.mean_squared_error(y_test, y_test_pred), 
      "\nTraining r2:", metrics.r2_score(y_train, y_train_pred),
      "\nTest r2:", metrics.r2_score(y_test, y_test_pred)
     )

Training mse: 36476790.76410686 
Test mse: 36761456.35201326 
Training r2: 0.7545557492633161 
Test r2: 0.7405989316927211


In [107]:
from sklearn import pipeline

In [110]:
target = "charges"
y = df[target]
X = df.copy()
del X[target]
X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y
                                        , test_size = 0.3, random_state = 1)


pipe = pipeline.Pipeline([
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LinearRegression())
])


                                        
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

print("Training mse:", metrics.mean_squared_error(y_train, y_train_pred),
     "\nTest mse:", metrics.mean_squared_error(y_test, y_test_pred), 
      "\nTraining r2:", metrics.r2_score(y_train, y_train_pred),
      "\nTest r2:", metrics.r2_score(y_test, y_test_pred)
     )

Training mse: 36476790.76410686 
Test mse: 36761456.352013275 
Training r2: 0.7545557492633161 
Test r2: 0.7405989316927211
