# Linear Regression

In [1]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split

#customize the chat template
sns.set_style('darkgrid')
sns.set_palette('bright')

In [2]:
#load the dataset
data=pd.read_csv('insurance.csv')

data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
#encode tthe categorical variables
cat_cols=data.select_dtypes(include='object').columns

cat_cols

Index(['sex', 'smoker', 'region'], dtype='object')

In [4]:
cat_cols=list(data.select_dtypes(include='object').columns)

cat_cols

['sex', 'smoker', 'region']

In [5]:
for i in cat_cols:
    encoder=LabelEncoder()
    data[i]=encoder.fit_transform(data[i])

data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [6]:
#split the dataset into train set and test set
X=data.drop(columns=['charges'])
y=data['charges']

#do train test split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=1)

In [7]:
for split in [X_train,X_test,y_train,y_test]:
    print(split.shape)

(1003, 6)
(335, 6)
(1003,)
(335,)


(1003,6) - this implies a rows and columns


(1003,) - this implies a vector

In [8]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
1043,28,0,25.8,0,0,3
968,21,1,25.745,2,0,0
594,41,1,40.26,0,0,2
1079,63,1,33.66,3,0,2
1051,64,1,26.41,0,0,0


In [9]:
X_test.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
559,19,1,35.53,0,0,1
1087,57,1,31.54,0,0,1
1020,51,1,37.0,0,0,3
460,49,0,36.63,3,0,2
802,21,1,22.3,1,0,3


In [10]:
y_train

1043     3161.45400
968      3279.86855
594      5709.16440
1079    15161.53440
1051    14394.55790
           ...     
715     12146.97100
905      4564.19145
1096    44641.19740
235     19444.26580
1061    11554.22360
Name: charges, Length: 1003, dtype: float64

In [11]:
y_test

559      1646.42970
1087    11353.22760
1020     8798.59300
460     10381.47870
802      2103.08000
           ...     
1192    13019.16105
628     11365.95200
1098    23045.56616
1038     2250.83520
936     32108.66282
Name: charges, Length: 335, dtype: float64

In [12]:
#Scale the data
scaler=StandardScaler() #init the scaler
columns=X_train.columns
scaler.fit(X_train)     #scale the train set
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)
X_train=pd.DataFrame(X_train, columns=columns)
X_test=pd.DataFrame(X_test, columns=columns)

In [13]:
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
0,-0.779886,-1.027292,-0.805683,-0.907059,-0.517702,1.324358
1,-1.280350,0.973434,-0.814660,0.739602,-0.517702,-1.382839
2,0.149548,0.973434,1.554541,-0.907059,-0.517702,0.421959
3,1.722435,0.973434,0.477260,1.562933,-0.517702,0.421959
4,1.793930,0.973434,-0.706116,-0.907059,-0.517702,-1.382839
...,...,...,...,...,...,...
998,1.507950,0.973434,-0.299688,-0.907059,-0.517702,1.324358
999,-0.922875,-1.027292,-0.225420,0.739602,-0.517702,-1.382839
1000,0.864496,-1.027292,0.689452,0.739602,1.931614,-1.382839
1001,0.078053,-1.027292,-1.390026,0.739602,1.931614,0.421959


In [14]:
#Train the model
model=LinearRegression()
model.fit(X_train,y_train) #fit the model to the data
train_prediction=model.predict(X_train) #to get train prediction
test_prediction=model.predict(X_test)   #to get test prediction

In [17]:
#Evaluate the model
train_rsme=root_mean_squared_error(y_train, train_prediction)
test_rsme=root_mean_squared_error(y_test, test_prediction)

print(f'train rsme: {train_rsme}')
print(f'test rsme: {test_rsme}')


train rsme: 6070.3386039462575
test rsme: 5983.901136059918
