In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pickle

In [4]:
df = pd.read_csv("insurance_pre.csv")

In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [6]:
df = pd.get_dummies(df, drop_first=True).astype(int)
df.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27,0,16884,0,1
1,18,33,1,1725,1,0
2,28,33,3,4449,1,0
3,33,22,0,21984,1,0
4,32,28,0,3866,1,0


In [7]:
df.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [8]:
independent =  df[['age', 'bmi', 'children',
       'sex_male', 'smoker_yes']]

In [9]:
independent.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27,0,0,1
1,18,33,1,1,0
2,28,33,3,1,0
3,33,22,0,1,0
4,32,28,0,1,0


In [10]:
dependent = df[["charges"]]
dependent.head()

Unnamed: 0,charges
0,16884
1,1725
2,4449
3,21984
4,3866


In [19]:
X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.3, random_state=42)

In [20]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [21]:
y_pred = regressor.predict(X_test)

In [22]:
y_pred

array([[ 8615.16996937],
       [ 7049.83506644],
       [36683.09874014],
       [ 9451.62739824],
       [26696.33726506],
       [11080.54013577],
       [  -56.16181942],
       [16846.63960489],
       [  642.78919301],
       [11232.39295156],
       [28555.37096377],
       [ 9367.16832665],
       [ 5317.91266479],
       [38716.12234063],
       [40420.10501368],
       [37218.1811819 ],
       [15301.20214172],
       [35980.30963204],
       [ 9438.9016344 ],
       [31182.7583382 ],
       [ 4154.33579429],
       [10554.34536219],
       [ 2738.52306269],
       [ 6367.94938141],
       [11352.96941097],
       [12427.74764632],
       [14994.2761142 ],
       [ 5992.12215849],
       [ 9470.67945866],
       [ 2476.60209117],
       [ 9498.61918726],
       [12881.84301463],
       [ 4447.69067864],
       [ 3129.54206429],
       [ 5005.16180859],
       [12533.4762385 ],
       [ 2227.54225387],
       [ 8984.80626608],
       [33309.90393463],
       [32908.85591117],


In [23]:
regressor.coef_[0]

array([  261.92097152,   331.66754625,   433.19194525,   141.3445121 ,
       23626.14665612])

In [24]:
regressor.intercept_[0]

np.float64(-12329.346295842079)

In [25]:
actualscore = r2_score(y_test, y_pred)
actualscore

0.7681647919672667

In [26]:
filename = 'multiple_linear_regression_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(regressor, file)