In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
diab = load_diabetes()

In [3]:
df = pd.DataFrame(diab.data)

In [4]:
df.sample(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
377,0.019913,0.05068,0.009961,0.01843,0.014942,0.044719,-0.061809,0.07121,0.009434,-0.063209
298,0.023546,0.05068,-0.037463,-0.046985,-0.091006,-0.07553,-0.032356,-0.039493,-0.030748,-0.013504
127,0.034443,0.05068,-0.001895,-0.012556,0.038334,0.013717,0.078093,-0.039493,0.004548,-0.096346
296,0.067136,-0.044642,-0.061174,-0.040099,-0.026336,-0.024487,0.033914,-0.039493,-0.056153,-0.059067
186,-0.081798,0.05068,0.042296,-0.019442,0.03971,0.057558,-0.069172,0.108111,0.04719,-0.038357


In [5]:
df.columns =  diab.feature_names
df['DPM'] = diab.target

In [6]:
df.sample(5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,DPM
253,0.081666,-0.044642,0.033673,0.008101,0.052093,0.056619,-0.017629,0.034309,0.034866,0.069338,150.0
319,0.019913,-0.044642,0.004572,0.045972,-0.01808,-0.054549,0.063367,-0.039493,0.028658,0.061054,191.0
402,0.110727,0.05068,-0.033151,-0.022885,-0.004321,0.020293,-0.061809,0.07121,0.015568,0.044485,168.0
256,-0.049105,-0.044642,0.160855,-0.046985,-0.029088,-0.01979,-0.047082,0.034309,0.02802,0.011349,346.0
66,-0.009147,0.05068,-0.018062,-0.033213,-0.020832,0.012152,-0.072854,0.07121,0.000272,0.019633,150.0


### we need to predict DPM (disease progression measurement) using decision tress ###

In [7]:
x = df.iloc[:,:10]

In [8]:
x.shape

(442, 10)

In [9]:
y = df.iloc[:,10]

In [10]:
y.shape

(442,)

### doing train test split on the full data we have ###

In [11]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2, random_state=42)

### using the decision tree classifier of scikitlearn with the loss function to be squared error and the maximum depts of the tree to be 5 ###

In [12]:
dt = DecisionTreeRegressor(max_depth=5, criterion="squared_error")

In [13]:
dt.fit(x_train,y_train)

### using cross validation score to check the correct or more accurate r2 score ###

In [14]:
scores = cross_val_score(dt, x_train, y_train, cv=5, scoring='r2')

In [15]:
scores.mean()

np.float64(0.18687790528965315)

In [16]:
y_pred = dt.predict(x_test)

In [17]:
from sklearn.metrics import r2_score

In [18]:
r2_score(y_test,y_pred)

0.35087734238235124

### using the grid search cv to find the best parameters for the decision tree classifier (a type of hyper parameter tuning) ###

In [19]:
params = {
    'max_depth':[4,5,9,10,None],
    'criterion':['squared_error','absolute_error'],
    'max_features':[0.25,0.5,1.0],
    'min_samples_split':[0.25,0.5,1.0]
}

In [20]:
reg = GridSearchCV(dt,param_grid=params)

In [21]:
reg.fit(x_train,y_train)

### the best r2 score calculated after using grid search cv ###

In [22]:
reg.best_score_

np.float64(0.3291812823242634)

### the best parameter w.r.t params 

In [23]:
reg.best_params_

{'criterion': 'squared_error',
 'max_depth': 5,
 'max_features': 0.5,
 'min_samples_split': 0.25}