# Classification and Regression Trees

This Jupyter notebook was created to practice Classification and Regression Trees (CART).

## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.style.use('dark_background')

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

## Functions

In [None]:
def accuracy(y_real, y_pred):
  """Return the fraction of positions where `y_pred` equals `y_real`.

  Both inputs are converted to NumPy arrays and compared element by
  element, position by position. Plain Python lists therefore work, and
  pandas index labels are ignored. Intended for 1-D label vectors.

  Parameters
  ----------
  y_real : array-like
      True labels.
  y_pred : array-like
      Predicted labels, with the same shape as `y_real`.

  Returns
  -------
  numpy.float64
      Share of matching elements, between 0 and 1.

  Raises
  ------
  ValueError
      If the two inputs do not have the same shape.
  """
  real = np.asarray(y_real)
  pred = np.asarray(y_pred)
  if real.shape != pred.shape:
    # Broadcasting mismatched shapes would silently produce a wrong score.
    raise ValueError(
      f"y_real and y_pred must have the same shape, got {real.shape} and {pred.shape}"
    )
  return np.mean(real == pred)
def MSE(y_real, y_pred):
  """Return the mean squared error between `y_real` and `y_pred`.

  Both inputs are converted to float NumPy arrays before subtracting, so
  plain Python lists work, pandas index labels are ignored (values are
  paired by position), and integer inputs cannot wrap around.
  Intended for 1-D target vectors.

  Parameters
  ----------
  y_real : array-like
      True target values.
  y_pred : array-like
      Predicted values, with the same shape as `y_real`.

  Returns
  -------
  numpy.float64
      Mean of the squared element-wise differences.

  Raises
  ------
  ValueError
      If the two inputs do not have the same shape.
  """
  real = np.asarray(y_real, dtype=float)
  pred = np.asarray(y_pred, dtype=float)
  if real.shape != pred.shape:
    # Broadcasting mismatched shapes would silently produce a wrong error value.
    raise ValueError(
      f"y_real and y_pred must have the same shape, got {real.shape} and {pred.shape}"
    )
  return np.mean((real - pred) ** 2)

## DF_1

In [None]:
! gdown --id 1LrejAD1C4lRfaV9uYoWiroZQAjA1Rj72

Downloading...
From: https://drive.google.com/uc?id=1LrejAD1C4lRfaV9uYoWiroZQAjA1Rj72
To: /content/diabetes.csv
100% 23.9k/23.9k [00:00<00:00, 37.6MB/s]


In [None]:
# Read the downloaded diabetes CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/diabetes.csv')

In [None]:
# Display the loaded frame (pandas shows only the first and last rows of long frames).
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
# Summary statistics for every column, whatever its dtype.
df.describe(include="all")

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Separate the predictors from the `Outcome` target, then hold out 20% of
# the rows as a test set (fixed seed so the split is reproducible).
X = df.drop(columns='Outcome').copy()
y = df['Outcome'].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: a decision tree with default hyperparameters (only the seed is fixed).
model = DecisionTreeClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# accuracy for train set
# Uses the notebook's `accuracy` helper instead of repeating its formula inline.
accuracy(y_train, model.predict(X_train))

1.0

In [None]:
# accuracy for test set
# Uses the notebook's `accuracy` helper instead of repeating its formula inline.
accuracy(y_test, model.predict(X_test))

0.7467532467532467

As we can see, the model is overfitting. We did not set any hyperparameters for the tree, so the defaults were used, and the resulting tree fits the training data so closely that accuracy on the train set is 100%. That does not help on the test set, where accuracy is about 75%, which is quite a low score. What we need to do is set the hyperparameters of ```DecisionTreeClassifier```, try different combinations of them, and pick the combination for which accuracy on the train set is close to accuracy on the test set.

In [None]:
# Regularised tree: limit the depth and require a minimum number of samples
# per split and per leaf so the tree cannot fit the training set perfectly.
model_2 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features=None,
    class_weight=None,
    random_state=42,
)
model_2.fit(X_train, y_train)

# Report train and test accuracy together so the gap between them is visible.
print(f"accuracy for train set: {accuracy(y_train, model_2.predict(X_train))}")
print('')
print(f"accuracy for test set: {accuracy(y_test, model_2.predict(X_test))}")

accuracy for train set: 0.8338762214983714

accuracy for test set: 0.7922077922077922


I tried many different combinations of parameters, and this looks like the best we can get. Let's look at which of our predictors were the most important.

In [None]:
# Pair each predictor with its importance in model_2, most important first.
(
    pd.DataFrame(
        {
            'predictor': X.columns,
            'importance': model_2.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,predictor,importance
1,Glucose,0.507272
5,BMI,0.197812
7,Age,0.126733
6,DiabetesPedigreeFunction,0.074159
2,BloodPressure,0.043889
4,Insulin,0.026367
3,SkinThickness,0.023769
0,Pregnancies,0.0


## DF_2

In [None]:
! gdown --id 1aP0Jt9YiAKhfwf0YHvterG8YxpbLVtzS

Downloading...
From: https://drive.google.com/uc?id=1aP0Jt9YiAKhfwf0YHvterG8YxpbLVtzS
To: /content/WineQT.csv
100% 78.1k/78.1k [00:00<00:00, 2.44MB/s]


In [None]:
# Read the downloaded wine-quality CSV; this rebinds `df` from the diabetes data.
df = pd.read_csv(filepath_or_buffer='/content/WineQT.csv')

In [None]:
# Display the loaded frame (pandas shows only the first and last rows of long frames).
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,6,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6,1595


In [None]:
# Summary statistics for every column, whatever its dtype.
df.describe(include="all")

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
count,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043,804.969379
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824,463.997116
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0,0.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0,411.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0,794.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0,1209.5
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0,1597.0


In [None]:
# The target is the wine quality score. The predictors exclude it and also
# the `Id` column: it appears to be a row identifier rather than a property
# of the wine, and leaving it in lets the tree split on it.
y = df['quality'].copy()
X = df.drop(columns=['quality', 'Id']).copy()
# Hold out 20% of the rows as a test set (fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# Regression tree with limited depth and minimum leaf/split sizes to curb overfitting.
model = DecisionTreeRegressor(
    max_depth=5,
    min_samples_leaf=8,
    min_samples_split=8,
    random_state=42,
)
model.fit(X_train, y_train)

# Report train and test MSE together so the gap between them is visible.
print(f"MSE for train set: {MSE(y_train, model.predict(X_train))}")
print()
print(f"MSE for test set: {MSE(y_test, model.predict(X_test))}")

MSE for train set: 0.33207021628528294

MSE for test set: 0.395424921248077


In [None]:
# Compare real and predicted quality on the held-out test rows only.
# Scoring on all of X would mix in the rows the model was trained on and
# make the metrics computed from `pred` look better than they are.
pred = pd.DataFrame({'real': y_test, 'predicted': model.predict(X_test)})
# Quality scores are integers, so round the regression output to the nearest
# integer (vectorised, instead of a Python-level apply).
pred['predicted'] = pred['predicted'].round().astype(int)
pred

Unnamed: 0,real,predicted
0,5,5
1,5,5
2,5,5
3,6,6
4,5,5
...,...,...
1138,6,6
1139,6,6
1140,5,6
1141,6,6


In [None]:
# Score the rounded predictions stored in `pred`: the share of exact matches
# and the mean squared error against the real quality values.
print(f"accuracy: {accuracy(pred['real'], pred['predicted'])}")
print()
print(f"MSE: {MSE(pred['real'], pred['predicted'])}")

accuracy: 0.6255468066491688

MSE: 0.4216972878390201


In [None]:
# Pair each predictor with its importance in the regression tree, most important first.
(
    pd.DataFrame(
        {
            'predictors': X.columns,
            'importance': model.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,predictors,importance
10,alcohol,0.47404
1,volatile acidity,0.201452
9,sulphates,0.180027
6,total sulfur dioxide,0.044739
11,Id,0.023303
5,free sulfur dioxide,0.023107
2,citric acid,0.01717
4,chlorides,0.013638
3,residual sugar,0.011447
0,fixed acidity,0.011078
