In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Weather dataset

A small categorical dataset: depending on the weather, will you go play outside or not?

In [None]:
# Read the weather table from "weather.csv" (path is relative to the working
# directory) and show it as the cell output.
weather = pd.read_csv("weather.csv")
weather

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


In [None]:
# Features: every column except the target "play".
X = weather.drop(columns="play")
# Target: the "play" column on its own.
y = weather["play"]

In [None]:
# Show the first five feature rows.
X.head()

Unnamed: 0,outlook,temperature,humidity,windy
0,sunny,hot,high,False
1,sunny,hot,high,True
2,overcast,hot,high,False
3,rainy,mild,high,False
4,rainy,cool,normal,False


In [None]:
# Show the first five target values.
y.head()

0     no
1     no
2    yes
3    yes
4    yes
Name: play, dtype: object

# Classifying the dataset
## 1. Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tree = DecisionTreeClassifier(criterion="entropy")

# X still holds the raw string categories here, so fitting fails with a
# ValueError. The failure is the point of this cell: catch it and show the
# message so that "Restart & Run All" can continue past this demonstration.
try:
    tree.fit(X, y)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

### ValueError: we need to transform the dataset first

### Preprocessing the data

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Use the encoder's default (sparse) output and densify it explicitly. The
# keyword that requests dense output was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so passing it ties the notebook to
# particular versions; `.toarray()` avoids the keyword altogether.
enc = OneHotEncoder()

X_transf = enc.fit_transform(X).toarray()

# One output column per (feature, category) pair, named "<feature>_<category>".
new_col_names = enc.get_feature_names_out(X.columns)
X_transf = pd.DataFrame(X_transf, columns=new_col_names)

X_transf.head()



Unnamed: 0,outlook_overcast,outlook_rainy,outlook_sunny,temperature_cool,temperature_hot,temperature_mild,humidity_high,humidity_normal,windy_False,windy_True
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


### Training a decision tree on the transformed data

In [None]:
# Fit the tree on the one-hot encoded features and predict the same rows
# back, so the number printed below is accuracy on the training data itself.
tree.fit(X_transf, y)
y_pred = tree.predict(X_transf)

# Fraction of rows whose predicted label equals the true label.
acc = np.mean(y_pred == y)
print("accuracy:", acc)

accuracy: 1.0


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 5 rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_transf, y, test_size=5, random_state=42
)

# Seed the tree too: scikit-learn permutes the features at every split, so
# equally good splits can be chosen differently from run to run. With only
# 5 test rows, one changed prediction moves the accuracy by 0.2.
tree = DecisionTreeClassifier(criterion="entropy", random_state=42)
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

# Accuracy on the held-out rows only.
acc = (y_pred == y_test).mean()
print("accuracy:", acc)

accuracy: 0.6


## 2. Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

### Preprocessing

In [None]:
# Map every category of every feature to a numeric code, then wrap the
# resulting array in a DataFrame that reuses the original column names.
oe_enc = OrdinalEncoder()
X_oe = pd.DataFrame(oe_enc.fit_transform(X), columns=X.columns)

X_oe.head()

Unnamed: 0,outlook,temperature,humidity,windy
0,2.0,1.0,0.0,0.0
1,2.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0
3,1.0,2.0,0.0,0.0
4,1.0,0.0,1.0,0.0


In [None]:
# Fit a categorical Naive Bayes model on the ordinal codes and predict the
# same rows back, so the number printed below is training accuracy.
nb = CategoricalNB()
nb.fit(X_oe, y)
y_pred = nb.predict(X_oe)

# Fraction of rows whose predicted label equals the true label.
acc = np.mean(y_pred == y)
print("accuracy:", acc)

accuracy: 0.9285714285714286


# Exercise 2: Breast cancer data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, tree, metrics
from sklearn.model_selection import train_test_split
from sklearn import ensemble


# Per-class accuracy for a binary (0/1) problem.
def class_accs(y_pred, y_true):
    """Return ``(acc0, acc1)``: the fraction of true-0 and of true-1 samples
    that were predicted correctly.

    ``y_pred`` and ``y_true`` are compared element-wise, so they must have
    the same length.
    """
    correct = y_pred == y_true
    per_class = []
    for label in (0, 1):
        is_label = y_true == label
        # Correctly predicted members of this class / all members of this class.
        per_class.append((correct & is_label).sum() / is_label.sum())
    acc0, acc1 = per_class
    return acc0, acc1

# Performance report for a binary classifier.
def print_metrics(y_pred, y_true):
    """Print the F1 score, the overall accuracy and the per-class accuracies
    for the given predicted and true labels."""
    f1 = metrics.f1_score(y_true, y_pred)
    acc = metrics.accuracy_score(y_true, y_pred)
    acc0, acc1 = class_accs(y_pred, y_true)
    report = (
        f'\tF1 = {f1}',
        f'\tAccuracy = {acc}',
        f'\t\tclass 0: {acc0}',
        f'\t\tclass 1: {acc1}',
    )
    for line in report:
        print(line)

Step one: investigate the data

Investigate the datatype and values of the data that we load in here.
What are the features? What data type is it? What type of data is in the input and target data sets?

In [None]:
# Load the Wisconsin breast cancer data set as pandas objects:
# X holds the features, y the target.
X, y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)

Here is a small test of the decision tree model and the metric calculations for this data set. What conclusions can you draw from this? Is this split a good idea? How can we improve it?

In [None]:
# Sequential split: the first 300 rows train the model, the rest test it.
X_train, X_test = X.iloc[:300], X.iloc[300:]
y_train, y_test = y.iloc[:300], y.iloc[300:]

# Fit a decision tree with default settings and report its test metrics.
dt = tree.DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print_metrics(y_pred, y_test)

	F1 = 0.9238845144356955
	Accuracy = 0.8921933085501859
		class 0: 0.9696969696969697
		class 1: 0.8669950738916257


In [None]:
# Balanced accuracy is the plain average of the two per-class accuracies.
accuracy0, accuracy1 = class_accs(y_pred, y_test)
bal_acc = (accuracy0 + accuracy1) / 2
print(f'Balanced accuracy = {bal_acc}')

Balanced accuracy = 0.9183460217942977


Cross-validation is a useful way to reuse data and to get a more accurate picture of how well your classifier works on your data set. Here we will implement it to do some hyperparameter tuning.

What does the following function do? Why would we build the folds separately for the positive and the negative samples?


Additional exercise: is there a quicker way to do this? (Hint: check the documentation of train_test_split.)

In [None]:
def cross_validation(X, y, folds, clf):
    """Run a stratified k-fold cross-validation and report balanced accuracy.

    Positive (label 1) and negative (label 0) samples are split into folds
    separately, so every validation fold holds the same number of positives
    and the same number of negatives. Fold sizes use floor division: samples
    left over after the last fold are never validated on and stay in every
    training set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table, one row per sample.
    y : array-like of 0/1 labels
        Target labels, aligned with the rows of ``X`` by position.
    folds : int
        Number of folds.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
        once per fold.

    Returns
    -------
    tuple
        ``(mean validation balanced accuracy, mean training balanced accuracy)``
        averaged over the folds. Per-fold and average scores are also printed.
    """
    # Select rows with positional boolean masks, so the index labels of X and
    # y do not matter (e.g. a subset that kept its original labels).
    labels = np.asarray(y)
    X_pos = X[labels == 1].reset_index(drop=True)
    X_neg = X[labels == 0].reset_index(drop=True)

    pos_fold_size = len(X_pos) // folds
    neg_fold_size = len(X_neg) // folds
    bal_accs_val = []
    bal_accs_train = []
    for i in range(folds):
        pos_start, pos_stop = i * pos_fold_size, (i + 1) * pos_fold_size
        neg_start, neg_stop = i * neg_fold_size, (i + 1) * neg_fold_size

        # Validation set: the i-th fold of each class.
        val_pos = X_pos.iloc[pos_start:pos_stop, :]
        val_neg = X_neg.iloc[neg_start:neg_stop, :]
        X_val = pd.concat([val_pos, val_neg])
        y_val = pd.Series([1] * len(val_pos) + [0] * len(val_neg))

        # Training set: everything outside the i-th fold. The tail slice
        # starts exactly where the validation fold stops, so no sample is
        # dropped between the two.
        train_pos = pd.concat([X_pos.iloc[:pos_start, :], X_pos.iloc[pos_stop:, :]])
        train_neg = pd.concat([X_neg.iloc[:neg_start, :], X_neg.iloc[neg_stop:, :]])
        X_train = pd.concat([train_pos, train_neg])
        y_train = pd.Series([1] * len(train_pos) + [0] * len(train_neg))

        # Fit on the training folds, then score the held-out fold.
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        acc0, acc1 = class_accs(y_pred, y_val)
        bal_acc_val = (acc0 + acc1) / 2
        bal_accs_val.append(bal_acc_val)

        # Score the training folds too, to compare against validation.
        y_pred_train = clf.predict(X_train)
        acc0, acc1 = class_accs(y_pred_train, y_train)
        bal_acc_train = (acc0 + acc1) / 2
        bal_accs_train.append(bal_acc_train)

        print(f"Fold {i+1}: {bal_acc_train:.3f} (train)")
        print(f"Fold {i+1}: {bal_acc_val:.3f} (validation)")
    print(f"Avg balanced accuracy valiation: {np.mean(bal_accs_val):.3f}")
    print(f"Avg balanced accuracy training set: {np.mean(bal_accs_train):.3f}")
    print()
    return np.mean(bal_accs_val), np.mean(bal_accs_train)

What is the most important hyperparameter of a decision tree?

Vary this hyperparameter to see how the model behaves. What can you say about the resulting graph?

As the tree depth increases, it will be able to make more accurate decisions. However, if you allow the tree to grow too deep, it will overfit on the data. This can easily be seen on the training accuracy curve: after a depth of about 8-9, the tree has perfectly learned the training data set. Allowing the tree to grow deeper will not result in better performance. Therefore it may be better to cut off the training process early.

## Additional exercise Random Forest and Gaussian Naive Bayes

For more complex data sets, it may be interesting to use an ensemble method such as a random forest. Instead of just using one tree, this allows you to train multiple trees on the same data set and then average out the responses of the trees for your test data. (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier)

Additionally, we can once again use a Naive Bayes model; however, the categorical NB is not ideal for this data set. Why?

Instead we will use the Gaussian Naive Bayes model. (https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB)

Add some code here to implement and test these two models and print out the metrics.