In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('https://cyxstorage1.blob.core.windows.net/newcontainer/insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## Change to categorical values

In [3]:
def set_age(row):
    age = row['age']
    if age <= 20:
        return 1
    elif age <= 30:
        return 2
    elif age <= 40:
        return 3
    elif age <= 50:
        return 4
    elif age <= 60:
        return 5
    else:
        return 6
    
df['age_class'] = df.apply(set_age, axis=1)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,age_class
0,19,female,27.900,0,yes,southwest,16884.92400,1
1,18,male,33.770,1,no,southeast,1725.55230,1
2,28,male,33.000,3,no,southeast,4449.46200,2
3,33,male,22.705,0,no,northwest,21984.47061,3
4,32,male,28.880,0,no,northwest,3866.85520,3
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,4
1334,18,female,31.920,0,no,northeast,2205.98080,1
1335,18,female,36.850,0,no,southeast,1629.83350,1
1336,21,female,25.800,0,no,southwest,2007.94500,2


## Change to dummy variables

In [4]:
dmdf = pd.get_dummies(df, columns=['age_class', 'sex', 'children', 'smoker', 'region'], drop_first=True)
dmdf = dmdf.drop(['age'], axis=1)
columns = list(dmdf.columns.values)
columns[0], columns[1] = columns[1], columns[0]
dmdf = dmdf[columns]
dmdf

Unnamed: 0,charges,bmi,age_class_2,age_class_3,age_class_4,age_class_5,age_class_6,sex_male,children_1,children_2,children_3,children_4,children_5,smoker_yes,region_northwest,region_southeast,region_southwest
0,16884.92400,27.900,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,1725.55230,33.770,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0
2,4449.46200,33.000,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0
3,21984.47061,22.705,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
4,3866.85520,28.880,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,10600.54830,30.970,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0
1334,2205.98080,31.920,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1335,1629.83350,36.850,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1336,2007.94500,25.800,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## Train models

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

In [6]:
train, test = train_test_split(dmdf, test_size=0.2)
X_train, y_train = train.loc[:, 'bmi':'region_southwest'], train.loc[:, 'charges']
X_test, y_test = test.loc[:, 'bmi':'region_southwest'], test.loc[:, 'charges']

In [7]:
from sklearn.linear_model import LinearRegression

lreg = LinearRegression()
lreg.fit(X_train, y_train)
pred = lreg.predict(X_test)
r2 = lreg.score(X_train, y_train)
print('R-squared =', r2)

R-squared = 0.7599150514995177


In [8]:
import statsmodels.api as sm

X_train_sm = sm.add_constant(X_train)
sts = sm.OLS(y_train, X_train_sm).fit()
sts.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.76
Model:,OLS,Adj. R-squared:,0.756
Method:,Least Squares,F-statistic:,208.3
Date:,"Sat, 13 Aug 2022",Prob (F-statistic):,1.99e-312
Time:,15:08:11,Log-Likelihood:,-10798.0
No. Observations:,1070,AIC:,21630.0
Df Residuals:,1053,BIC:,21720.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6517.8376,1105.951,-5.893,0.000,-8687.956,-4347.720
bmi,337.9543,31.520,10.722,0.000,276.106,399.803
age_class_2,1201.2892,656.168,1.831,0.067,-86.256,2488.834
age_class_3,2258.9004,682.654,3.309,0.001,919.383,3598.418
age_class_4,5341.1639,659.784,8.095,0.000,4046.523,6635.805
age_class_5,8836.0067,651.831,13.556,0.000,7556.971,1.01e+04
age_class_6,1.091e+04,866.907,12.589,0.000,9212.375,1.26e+04
sex_male,-134.0244,362.880,-0.369,0.712,-846.075,578.026
children_1,1117.5470,477.746,2.339,0.020,180.104,2054.990

0,1,2,3
Omnibus:,217.116,Durbin-Watson:,2.066
Prob(Omnibus):,0.0,Jarque-Bera (JB):,495.253
Skew:,1.109,Prob(JB):,2.87e-108
Kurtosis:,5.489,Cond. No.,302.0


## Results (convert to onnx)

Since the R-squared value is high enough and I am not going to train other models such as random forest or ddep learning networks.

In [9]:
from skl2onnx import to_onnx

onx = to_onnx(lreg, X_train.loc[0, :].to_numpy().astype(np.float32))
with open('lreg.onnx', 'wb') as f:
    f.write(onx.SerializeToString())