In [1]:
# Preprocessing pieces: per-column transformer, categorical encoder, numeric scaler.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Pipeline chains the preprocessing step and the estimator into one object.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import set_config
# Ask scikit-learn to render estimators as diagrams when a cell displays them.
set_config(display='diagram')

In [2]:
import pandas as pd

# Location of the claims table, relative to the notebook's working directory.
CLAIMS_CSV = 'claim.csv'

# Read the table, using the CSV's first column as the row index.
df = pd.read_csv(CLAIMS_CSV, index_col=0)

# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,age,gender,bmi,smoker,claim
0,39.0,male,23.2,No,1121.87
1,24.0,male,30.1,No,1131.51
2,38.078652,male,33.3,No,1135.94
3,38.078652,male,33.7,No,1136.4
4,38.078652,male,34.1,No,1137.01


In [3]:
# Positional column groups within the feature frame (df without `claim`),
# whose column order is: age (0), gender (1), bmi (2), smoker (3).
categorical_positions = [1, 3]   # gender, smoker -> one-hot encoded
numeric_positions = [0, 2]       # age, bmi -> standardized

# Apply a different transform to each group; any column not listed above
# is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('tnf1', OneHotEncoder(), categorical_positions),
        ('tnf2', StandardScaler(), numeric_positions),
    ],
    remainder='passthrough',
)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Preview the feature frame (every column except the `claim` target).
# The result is only displayed, not assigned, so `df` itself is unchanged.
df.drop('claim', axis=1)

Unnamed: 0,age,gender,bmi,smoker
0,39.000000,male,23.2,No
1,24.000000,male,30.1,No
2,38.078652,male,33.3,No
3,38.078652,male,33.7,No
4,38.078652,male,34.1,No
...,...,...,...,...
1335,44.000000,female,35.5,Yes
1336,59.000000,female,38.1,Yes
1337,30.000000,male,34.5,Yes
1338,37.000000,male,30.4,Yes


In [6]:
# Separate the predictors from the `claim` column that the model will learn.
features = df.drop(columns=['claim'])
target = df['claim']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

In [7]:
# Display the training feature frame produced by the split.
xtrain

Unnamed: 0,age,gender,bmi,smoker
216,26.0,male,25.7,No
731,29.0,male,38.6,No
866,59.0,female,33.5,No
202,52.0,female,33.1,No
820,43.0,female,28.6,No
...,...,...,...,...
715,42.0,male,21.4,No
905,58.0,female,24.0,No
1096,34.0,female,26.9,Yes
235,58.0,female,27.7,No


In [8]:
# Display the ColumnTransformer (shown as a diagram because of set_config(display='diagram')).
transformer

In [9]:
# End-to-end estimator: the ColumnTransformer preprocesses the raw columns,
# then a gradient-boosted regressor is fit on the transformed features.
#
# random_state is pinned so that refitting on the same data yields the same
# model. scikit-learn permutes the features at every split, so without a
# seed, equally good splits can be resolved differently from run to run and
# the pickled model would not be reproducible.
model = Pipeline(steps=[
    ('transformer', transformer),
    ('model', GradientBoostingRegressor(learning_rate=0.5, random_state=1)),
])

In [10]:
# Display the training feature frame once more before fitting.
xtrain

Unnamed: 0,age,gender,bmi,smoker
216,26.0,male,25.7,No
731,29.0,male,38.6,No
866,59.0,female,33.5,No
202,52.0,female,33.1,No
820,43.0,female,28.6,No
...,...,...,...,...
715,42.0,male,21.4,No
905,58.0,female,24.0,No
1096,34.0,female,26.9,Yes
235,58.0,female,27.7,No


In [11]:
# Fit the whole pipeline (preprocessing, then regressor) on the training split.
model.fit(X=xtrain, y=ytrain)

In [12]:
# Display the held-out feature frame that will be used for prediction.
xtest

Unnamed: 0,age,gender,bmi,smoker
559,54.0,female,34.6,No
1089,45.0,female,30.1,No
1021,19.0,male,24.0,Yes
460,31.0,male,29.4,No
802,42.0,male,49.1,No
...,...,...,...,...
682,27.0,female,46.1,No
629,30.0,male,22.5,No
893,39.0,female,26.5,No
807,51.0,female,25.7,No


In [13]:
# Run the held-out rows through the fitted pipeline to get predicted claims.
ypred = model.predict(X=xtest)

# Echo the prediction array as the cell output.
ypred

array([ 3943.2274185 , 10759.25930199, 20075.4676513 ,  7028.28631529,
        4862.4666063 , 12476.40310712,  7691.8948791 , 14492.24354392,
        8549.19219035,  7084.11472219,  6850.36677319,  5858.91690724,
       24708.7972755 ,  7742.10283197,  7551.74711216, 24099.96157404,
        3767.67746122,  5216.2678698 , 11377.60229872,  6468.06055031,
       10192.15886297,  9061.17197353,  5198.09044955,  9640.5937961 ,
       10001.52035264,  8286.71135812,  6075.00212117,  7539.31645167,
        6405.00465013,  5169.80529684,  5496.95822036,  2315.86853035,
        6591.860842  ,  6838.76701716, 20460.55788727,  8069.23194039,
        7416.79052743,  5681.41490256, 11004.44914043, 33551.0937574 ,
        6807.46256664,  8046.1150969 , 10355.44173383, 10019.1848348 ,
        2977.71156789,  5549.70630579,  6162.85244074,  6127.88586371,
        5728.81475087, 15814.69995062,  2897.09263746,  9921.39207292,
        5480.03720893, 42577.26805652, 19456.41404098,  6580.71294197,
      

In [14]:
# Display the pipeline (shown as a diagram because of set_config(display='diagram')).
model

In [15]:
import pickle

# Persist the pipeline (preprocessing + regressor) to model.pkl.
# The context manager flushes and closes the file handle even if pickling
# raises; passing a bare open(...) to pickle.dump left the handle open.
# NOTE: pickle files can execute arbitrary code when loaded, so only
# unpickle model.pkl from a trusted source.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# To launch the Streamlit app, run this from a terminal:
#   python -m streamlit run app.py