In [3]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [4]:
data = pd.read_csv('insurance.csv')

In [5]:
data_new = pd.read_csv('insurance.csv')

In [6]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
#Make dataset more useable
#0-Female 1-Male
data.sex = (data.sex == 'male').astype(int)

#smoker to binary
data.smoker = (data.smoker == 'yes').astype(int)

In [8]:
#Change all regions to binary
data = pd.get_dummies(data, columns=['region'])

In [9]:
#Run OLS Model
regr = linear_model.LinearRegression()
Y = data.charges.values.reshape(-1,1)
X = data.drop(columns='charges')
regr.fit(X,Y)



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
#Results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))


Coefficients: 
 [[  256.85635254  -131.3143594    339.19345361   475.50054515
  23848.53454191   587.00923503   234.0453356   -448.01281436
   -373.04175627]]

Intercept: 
 [-12525.5478112]

R-squared:
0.7509130345985208


In [11]:
for i in range(10):
    data = data.sample(frac=1)
    Y = data.charges.values.reshape(-1,1)
    X = data.drop(columns='charges')
    print(cross_val_score(regr,X,Y, cv=4))

[0.73062262 0.7620704  0.77325052 0.71266042]
[0.70221414 0.7295845  0.80047195 0.74575269]
[0.75875841 0.73577771 0.70244803 0.78052378]
[0.68708562 0.77990276 0.76260898 0.73607191]
[0.77551552 0.67882213 0.76910537 0.74788718]
[0.72237184 0.75961486 0.72270244 0.77737323]
[0.73941386 0.76806292 0.76481646 0.69816334]
[0.77673686 0.72177479 0.75134663 0.73148198]
[0.78110591 0.72913962 0.73500832 0.72752914]
[0.67430091 0.76853829 0.78139933 0.75041767]


In [12]:
#Run kNN Model

data_neighbors = KNeighborsRegressor(n_neighbors=5)
Y_kNN = data.charges
X_kNN = data.drop(columns='charges')
data_neighbors.fit(X_kNN,Y_kNN)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [13]:
#Results
print('\nR-squared:')
print(data_neighbors.score(X_kNN, Y_kNN))


R-squared:
0.46551163098846365


# Reasoning for OLS
Linear Regression is the better model for this data. We can use linear regression to find the best coefficients to closely predict any input given, whereas KNN is limited to only be able to predict values for which we have close matches - KNN would not be able to predict for any gaps in the data, like linear regression would. Additionally, linear regression is more compact as we only need to store the coefficients for the equation rather than needing the entire data set to find the closest k values to the input. Finally, we can see that OLS for this data is easily the more accurate model. If we had less of a direct relationship with the final charges, it would make sense to prefer KNN or use a combination of the two.