In [11]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Iris Dataset

In [3]:
# Load the Iris dataset and pull out the feature matrix and the target vector.
data = datasets.load_iris()
X, y = data.data, data.target

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Shallow (depth-3) tree that chooses splits by entropy.
model = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, so the
# printed value is the same, but keeping the documented order avoids silent
# errors if this line is later adapted to an asymmetric metric.
print('Accuracy', accuracy_score(y_test, y_pred))

Accuracy 0.9666666666666667


# Breast Cancer Dataset

In [4]:
# Load the breast cancer dataset and pull out the feature matrix and the target vector.
data = datasets.load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Shallow (depth-3) tree that chooses splits by Gini impurity.
model = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, so the
# printed value is the same, but keeping the documented order avoids silent
# errors if this line is later adapted to an asymmetric metric.
print('Accuracy', accuracy_score(y_test, y_pred))

Accuracy 0.9210526315789473


Tree-based models such as decision trees don't need feature scaling to work effectively. They split on per-feature thresholds, unlike KNN and SVM, which rely on distance metrics.

# Wine Quality Dataset

In [7]:
# The wine-quality CSV files are semicolon-delimited.
red = pd.read_csv('datasets/winequality-red.csv', sep=';')
white = pd.read_csv('datasets/winequality-white.csv', sep=';')

# Stack the red and white samples into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the two source indices are kept and
# row labels are duplicated, which makes label-based lookups ambiguous.
combined = pd.concat([red, white], ignore_index=True)
combined.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Classification Approach

Just as with the other algorithms, wine quality can be approached as either a classification or a regression problem. For the classification approach, we turn it into a binary classification problem by labelling wines with quality > 6 as 1 and wines with quality < 7 (i.e. quality ≤ 6) as 0.

In [8]:
# Features are every column except the quality score.
X = combined.drop('quality', axis=1)
# Binary target: 1 when quality is above 6, otherwise 0.
y = (combined['quality']>6).astype(int)

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

# Shallow (depth-3) tree that chooses splits by Gini impurity.
model = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, so the
# printed value is the same, but keeping the documented order avoids silent
# errors if this line is later adapted to an asymmetric metric.
print('Accuracy', accuracy_score(y_test, y_pred))

Accuracy 0.8161538461538461


## Regression Approach

For the regression approach, quality is again the target variable, this time used as the original numeric score.

In [13]:
# Features are every column except the quality score; the target is the score itself.
X = combined.drop('quality', axis=1)
y = combined['quality']

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

# Shallow (depth-3) regression tree that minimises squared error at each split.
model = DecisionTreeRegressor(criterion='squared_error', max_depth=3, random_state=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# The previous spec ':2f' set a minimum field WIDTH of 2, not a precision, so it
# had no effect. '.6f' states explicitly the six decimals that were being printed.
print(f"R2, {r2_score(y_test, y_pred):.6f}")
print('MSE', mean_squared_error(y_test, y_pred))

R2, 0.230809
MSE 0.5595392585130958


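Just like the results in the other notebooks, performance looks better for the classification approach than for the regression approach. Note, however, that accuracy and R² are different metrics and are not directly comparable, so this comparison is only indicative.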