Importing the Dependencies

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

In [41]:
# Load the medical price records from the CSV file (relative path, resolved against the working directory)
df=pd.read_csv('Medical Price Dataset.csv')

In [42]:
# Preview the first five rows of the dataset
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [43]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(1338, 7)

Data Preprocessing

In [44]:
# Count the missing values in each column
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

There are no missing values in the dataset.

In [45]:
# Summarize column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Converting categorical labels into numerical codes using LabelEncoder

In [46]:
# Encode each categorical column to integer codes. A separate LabelEncoder is
# kept per column so that every category <-> code mapping (`classes_`) remains
# available for inspection or `inverse_transform`; re-fitting one shared
# encoder would retain only the mapping of the last column it saw.
encoders = {}
for column in ['sex', 'smoker', 'region']:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Keep the name `label` bound as before: the encoder fitted on 'region'.
label = encoders['region']

In [47]:
# Inspect the first five rows again to see the encoded columns
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


Linear Regression

In [48]:
class Linear_Regression():
    """Linear regression trained with full-batch gradient descent on the mean squared error.

    The model is ``y_hat = X.dot(w) + b``. After ``fit`` the learned parameters
    are stored in ``w`` (one weight per feature) and ``b`` (bias).

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    no_of_iterations : int
        Number of gradient descent updates performed by ``fit``.
    """

    def __init__(self, learning_rate, no_of_iterations):
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations

    def fit(self, X, Y):
        """Learn ``w`` and ``b`` from the training data.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Training features (NumPy array, DataFrame, nested list, ...).
        Y : array-like of shape (m,) or (m, 1)
            Training targets.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``Y`` is not a vector, the sample counts
            differ, or there are no samples.
        """
        # Work on plain float arrays so pandas index alignment or list inputs
        # cannot change the arithmetic below.
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)

        if X.ndim != 2:
            raise ValueError(
                "X must be 2-dimensional (samples, features); got %d dimension(s)" % X.ndim
            )

        # A column-vector target (m, 1) would broadcast against the (m,)
        # predictions into an (m, m) residual and silently corrupt the
        # gradients, so flatten it first.
        if Y.ndim == 2 and Y.shape[1] == 1:
            Y = Y.ravel()
        if Y.ndim != 1:
            raise ValueError("Y must be a vector of shape (m,) or (m, 1)")
        if X.shape[0] != Y.shape[0]:
            raise ValueError(
                "X and Y disagree on the number of samples: %d vs %d"
                % (X.shape[0], Y.shape[0])
            )
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty dataset")

        # no_of_training_examples, no_of_features
        self.m, self.n = X.shape

        # Start from the zero model.
        self.w = np.zeros(self.n)
        self.b = 0.0
        self.X = X
        self.Y = Y

        # Gradient descent: one full-batch update per iteration.
        for _ in range(self.no_of_iterations):
            self.update_weights()

    def update_weights(self):
        """Perform one gradient descent step on the mean squared error."""
        # The residual is shared by both gradients, so compute it once.
        residual = self.Y - self.predict(self.X)

        # Gradients of mean((Y - (X.w + b))**2) with respect to w and b.
        dw = -(2 * self.X.T.dot(residual)) / self.m
        db = -2 * np.sum(residual) / self.m

        # Move against the gradient.
        self.w = self.w - self.learning_rate * dw
        self.b = self.b - self.learning_rate * db

    def predict(self, X):
        """Return ``X.dot(w) + b`` for an object ``X`` that provides ``.dot``.

        ``fit`` must have been called first so that ``w`` and ``b`` exist.
        """
        return X.dot(self.w) + self.b


In [49]:
# Features are every column except the target; the target is the 'charges' column
X = df.drop(columns=['charges'])
Y = df['charges']

In [50]:
# Hold out 33% of the rows as a test set; the fixed random_state makes the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state = 2)

In [51]:
# Standardize the features: the scaling statistics are learned from the training
# split only and then reused unchanged on the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [52]:
# Instantiate the gradient-descent regressor with step size 0.02 and 1000 update iterations
model=Linear_Regression(learning_rate = 0.02, no_of_iterations=1000)

In [53]:
# Learn the weights and bias from the training split
model.fit(X_train, Y_train)

In [54]:
# Predict the target for the held-out test rows
Y_pred = model.predict(X_test)

In [55]:
# Show the raw predictions for the test set
print(Y_pred)

[ 2206.44316224 12128.57168476 10623.72352246  2553.99507214
  8496.33081026 11334.57127277  3600.78567219  1491.87354869
 12053.67237523  9687.04284963 11530.01606991  5298.05425886
 29268.89146356  -113.61306659 13057.64477726 13297.76941285
  4458.38935146  8016.69742893 28736.762252    3175.92899166
 12397.58851742  1946.81219006 33185.7788866  31519.95883128
 29738.99614837  8365.14681972  2663.48228394 15472.15475852
  6223.59633478  2179.56538315  9728.73954933  5922.87061232
  4062.96729546  5026.59145543  9808.87396178  4985.11351209
 29370.55649373  6294.31116789 27032.09061434 14012.78100246
   436.01328991 27109.40919653  7852.26409744  1138.85613929
 10517.7574625   7662.61571033 11564.31757026  8444.80707016
 11062.62142235 13204.54814216  6606.32074965 -1151.07748637
 25589.06446254 36130.70422784  6927.53510183 17413.63133695
   974.68916092 11237.72122195  1818.20651675 33336.4749534
 11422.00145925   480.42763931  4137.13269517 35205.45363707
 -1452.10030637 14451.187

In [57]:
# Evaluate the predictions on the test set with the R² score (coefficient of determination).
# NOTE: the variable is named `accuracy`, but R² is a regression goodness-of-fit
# measure, not a classification accuracy; it is printed here multiplied by 100.
accuracy=r2_score(Y_test,Y_pred)
print(accuracy*100)

76.98751296167457
