# **Decision Tree Regressor**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='charges'),df['charges'],test_size=0.2)

In [5]:
numerical_columns = [0,2,3]
cat_nominal = [5]
cat_ordinal = [1,4]

In [6]:
impute_num = Pipeline(steps=[
    ('impute_num',SimpleImputer(strategy='mean'))
])

In [7]:
encode_nominal = Pipeline(steps=[
    ('encode_nominal',OneHotEncoder(drop='first', handle_unknown='ignore')),
    ('impute_nominal',SimpleImputer(strategy='most_frequent'))
])

In [8]:
encode_ordinal = Pipeline(steps=[
    ('encode_ordinal',OrdinalEncoder()),
    ('impute_ordinal',SimpleImputer(strategy='most_frequent'))
])

In [9]:
preprocessing = ColumnTransformer(transformers=[
    ('handle_numerical',impute_num,numerical_columns),
    ('handle_nominal',encode_nominal,cat_nominal),
    ('handle_ordinal',encode_ordinal,cat_ordinal)
],remainder='passthrough')

In [10]:
model = DecisionTreeRegressor()

In [11]:
pipe = make_pipeline(preprocessing,model)

In [12]:
pipe.fit(X_train,y_train)

In [13]:
ypred = pipe.predict(X_test)

In [14]:
from sklearn.metrics import r2_score
r2_score(y_test, ypred)

0.6321489366460957