In [1]:
import get_data
import numpy as np
import pandas as pd
import regression_model
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error

In [2]:
data = pd.read_csv('./insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
inspector = get_data.get('./insurance.csv')
by_row,by_feature = inspector.detect_missing_values()

In [4]:
by_feature

Unnamed: 0,missing count
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [5]:
target = data['charges']
data = data.drop(labels='charges',axis=1)

In [6]:
numeric_data = data.select_dtypes(include=np.number)
numeric_data

Unnamed: 0,age,bmi,children
0,19,27.900,0
1,18,33.770,1
2,28,33.000,3
3,33,22.705,0
4,32,28.880,0
...,...,...,...
1333,50,30.970,3
1334,18,31.920,0
1335,18,36.850,0
1336,21,25.800,0


In [7]:
vfi = pd.DataFrame()
vfi['features'] = numeric_data.columns
vfi['VFI'] = [variance_inflation_factor(numeric_data.values,i) for i in range(numeric_data.shape[1])]
vfi.sort_values(by='VFI')

Unnamed: 0,features,VFI
2,children,1.796637
0,age,7.53655
1,bmi,7.846515


In [8]:
full = pd.get_dummies(data)
full.ndim

2

In [9]:
np.ones(full.shape[0]).reshape(-1,1)

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [10]:
full = np.hstack([full,np.ones(full.shape[0]).reshape(-1,1)])
full.shape

(1338, 12)

In [11]:
train_x,test_x,train_y,test_y = train_test_split(full,target,test_size=0.2,random_state=42)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((1070, 12), (1070,), (268, 12), (268,))

In [12]:
weights = regression_model.regression(train_x,train_y)

In [13]:
weights

array([   256.97570583,    337.09255195,    425.27878352,   -118.37875696,
         -136.9704486 , -11953.23903066,  11697.8898251 ,    395.74794285,
           25.07061663,   -262.11635371,   -414.05141133,   -255.34920556])

In [16]:
predict_1 = np.dot(test_x,weights)
predict_1

array([ 8969.55027444,  7068.74744287, 36858.41091155,  9454.67850053,
       26973.17345656, 10864.11316424,   170.28084136, 16903.45028662,
        1092.43093614, 11218.34318352, 28101.68455267,  9377.73460205,
        5263.0595179 , 38416.04221107, 40255.82339284, 37098.25353123,
       15240.39392306, 35912.88264434,  9112.52398703, 31461.92108909,
        3847.68845883, 10130.12001517,  2370.54189389,  7140.21550828,
       11301.76782638, 12961.65366224, 14509.47251876,  6159.8976107 ,
        9963.85857263,  2177.85718217,  9115.93673493, 13073.68932159,
        4561.82376202,  3408.20756033,  4459.81359745, 13032.06505076,
        1979.99357292,  8813.28303302, 33271.29124448, 32585.51583927,
        3908.76090964,  4326.10774721, 14142.81326533, 11423.45494846,
        8774.13955311, 12097.28051001,  5281.57353499,  3150.5596042 ,
       35494.46461214,  9150.1124786 , 15836.84575621,  2343.57470069,
       12364.78414194,  1482.29488266, 13389.06105161, 12573.57395972,
      

In [17]:
r2_score(test_y,predict_1)*100, mean_squared_error(test_y,predict_1)

(78.35929767120724, 33596915.851361446)

In [20]:
weights_gd,cost_history = regression_model.fit(np.sqrt(train_x),np.sqrt(train_y),0.001,10001)
weights_gd

array([ 15.74645111,   6.41794801,   4.10267011,  -0.40250128,
        -1.52602772, -44.4207607 ,  42.4922317 ,   1.58144123,
        -0.71585752,  -0.41115258,  -2.38296013,  -1.928529  ])

In [21]:
predict_2 = np.dot(np.sqrt(test_x),weights_gd)
predict_2

array([ 98.46396551,  82.17532272, 198.69449287,  97.87685281,
       143.21899292,  87.69752616,  51.62268703, 116.89383798,
        64.83866496, 103.11650099, 161.70993724,  93.53389224,
        76.64847445, 194.29618471, 201.65281452, 192.43003391,
       110.79267658, 190.64811793,  97.72825121, 175.97926913,
        69.02365163,  95.4443382 ,  53.76503488,  69.17865932,
       105.0158918 , 107.23107523, 110.86003007,  76.84546798,
       101.68461938,  53.7545403 ,  93.39990553, 107.62143815,
        62.89487344,  74.55128551,  69.28840303,  99.82813569,
        60.1032371 ,  93.31821664, 188.62372222, 175.28532025,
        62.99881078,  65.30367572, 111.20414252, 104.74323103,
        86.34580446, 108.45118186,  67.7122727 ,  68.34083161,
       187.48058048,  84.95350148, 116.48844134,  57.4042231 ,
        97.52919109,  53.31647224, 107.20782667, 107.22014197,
        72.33660853, 170.73714448, 107.710771  , 106.47323829,
       112.04059419,  87.0299552 , 118.39260409,  92.46

In [22]:
r2_score(np.sqrt(test_y),predict_2)*100, mean_squared_error(np.sqrt(test_y),predict_2)

(80.27946220684301, 478.51448142368764)