# Decision trees for classification

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split

## Loading and examining Data: Abalone Dataset

In [None]:
# Column labels for the abalone file, listed in file order.
column_names = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
    'Rings',
]

# header=None tells pandas the file has no header row, so the labels above
# are supplied explicitly through `names`.
abalone_data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data',
    names=column_names,
    header=None,
)

In [None]:
# Preview the first rows to check that the columns were parsed as expected.
abalone_data.head()

In [None]:
# Histogram grid for the DataFrame's columns, 50 bins each, figsize (15, 15).
abalone_data.hist(bins=50, figsize=(15,15))
plt.show()

In [None]:
# Pairwise scatter plots of the columns, with histograms on the diagonal.
# sns.pairplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it: that would only leave an empty extra figure.
sns.pairplot(abalone_data, diag_kind="hist")
plt.show()

In [None]:
# Correlation heatmap of the feature columns (every column except the last).
# The slice still contains 'Sex', which is categorical (it is one-hot encoded
# further down), so only numeric columns are kept before calling corr():
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping them.
feature_corr = abalone_data.iloc[:, :-1].select_dtypes(include='number').corr()

plt.figure(figsize=(8,8))
sns.heatmap(feature_corr, annot=True, square=True)
plt.show()

In [None]:
# Horizontal box plots of the feature columns (every column except the last).
feature_frame = abalone_data.iloc[:, :-1]

plt.figure()
sns.boxplot(
    data=feature_frame,
    palette="Set2",
    orient="h",
)
plt.show()

In [None]:
# Column dtypes, non-null counts and memory usage.
abalone_data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
abalone_data.describe()

In [None]:
# Distinct values found in the 'Rings' column (the last column, used as the target below).
abalone_data['Rings'].unique()

In [None]:
# Number of rows per 'Rings' value, ordered by the ring count rather than by frequency.
abalone_data['Rings'].value_counts().sort_index()

## Missing values (or 0 values)

In [None]:
# Count the rows whose recorded Height is exactly 0.
height_is_zero = abalone_data['Height'] == 0
height_is_zero.sum()

In [None]:
# Show the full records whose recorded Height is exactly 0.
zero_height_mask = abalone_data['Height'] == 0
abalone_data[zero_height_mask]

In [None]:
# Mean Height for each Sex category.
# The aggregation is given by name ('mean'): passing the np.mean callable is
# deprecated in recent pandas releases and emits a FutureWarning, while the
# string form yields the same table.
means = pd.pivot_table(abalone_data, index=['Sex'], aggfunc={'Height': 'mean'})
means

So we will fill the missing values (the rows where `Height` is 0) with 0.107996.

## Pipelining

In [None]:
# Features are all columns but the last; the target is the last column.
feature_columns = abalone_data.columns[:-1]
target_column = abalone_data.columns[-1]

X = abalone_data[feature_columns]
y = abalone_data[target_column]

In [None]:
# First five rows of the feature matrix.
X.head(5)

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Columns routed to the numeric branch of the preprocessing step.
numeric_features = [
    'Length',
    'Diameter',
    'Height',
    'Whole weight',
    'Shucked weight',
    'Viscera weight',
    'Shell weight',
]

# Columns routed to the categorical branch of the preprocessing step.
categorical_features = ['Sex']

In [None]:
# Numeric branch: a 0 in any column handled by this branch is treated as
# missing and replaced by the constant 0.107996; the columns are then standardised.
zero_imputer = SimpleImputer(missing_values=0, strategy="constant", fill_value=0.107996)
numeric_transformer = Pipeline(steps=[
    ("imputer", zero_imputer),
    ("scaler", StandardScaler()),
])

# Categorical branch: one-hot encoding; categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [None]:
# Route each group of columns to its own transformer: (name, transformer, columns).
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [None]:
# Full model: preprocessing followed by a depth-3 decision tree with a fixed seed.
depth_limited_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", depth_limited_tree),
    ]
)

In [None]:
# Train on the training split, then report the pipeline's score on the held-out split.
clf.fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
print("model score: %0.3f" % test_score)

In [None]:
# Predicted labels for the held-out rows; reused by the evaluation cells below.
y_pred = clf.predict(X_test)

In [None]:
# Two-column array: predicted label on the left, true label on the right.
predicted_column = y_pred[:, None]
actual_column = y_test.values[:, None]
comparison = np.concatenate((predicted_column, actual_column), axis=1)

# Print one [predicted, actual] pair per line.
for pair in comparison:
    print(pair)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix: true labels along the rows, predicted labels along the columns.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix \n', cm)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# sklearn.metrics.plot_confusion_matrix has been removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments. It predicts with the fitted pipeline and draws
# the matrix in one call.
ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test, cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Build the matrix by hand from fresh predictions, then wrap it in a display object.
y_test_predicted = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_test_predicted)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title('Confusion matrix')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Text report of per-class metrics for the held-out predictions.
CR = classification_report(y_true=y_test, y_pred=y_pred)
print('Classification report \n')
print(CR)

## Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the whole pipeline on the training split.
acc = cross_val_score(clf, X_train, y_train, cv=10)
print(type(acc))

# Report the per-fold and the mean score as percentages.
fold_percentages = list(acc * 100)
print('Accuracy of each fold', fold_percentages)

mean_percentage = acc.mean() * 100
print("Accuracy: {:.2f} %".format(mean_percentage))

## Visualising the decision tree

In [None]:
from sklearn import tree

# The classifier was trained on the preprocessor's output (scaled numeric
# columns followed by the one-hot encoded 'Sex' columns), not on the raw
# DataFrame. The raw `column_names` list therefore does not line up with the
# tree's feature indices (it starts with 'Sex', ends with the target 'Rings',
# and has no one-hot columns), which mislabels every split. Ask the fitted
# preprocessor for the real output names instead.
tree_feature_names = list(clf['preprocessor'].get_feature_names_out())

plt.figure(figsize=(48, 8), facecolor='w')

a = tree.plot_tree(clf['classifier'],
                   feature_names=tree_feature_names,
                   rounded=True,
                   filled=True,
                   fontsize=12)

plt.show()

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# The grid search runs on the already-preprocessed training features.
X_train_new = preprocessor.fit_transform(X_train)

# Candidate values: tree depth 1..9 and minimum samples required to split a node.
param_grid = {'max_depth': range(1,10),
              'min_samples_split': [2, 4, 6, 8, 10]}

# random_state is fixed so repeated runs select the same best parameters and
# score, matching the seeded trees used everywhere else in this notebook.
# Candidates are ranked by macro-averaged recall.
clf_CV = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid=param_grid,
                      scoring='recall_macro')

clf_CV.fit(X_train_new, y_train)
print('Best params: ', clf_CV.best_params_)
print('Grid score: ', clf_CV.best_score_)

# Iris Dataset

In [None]:
from sklearn.datasets import load_iris
# return_X_y=True yields a (features, labels) pair; as_frame=True returns them as pandas objects.
features, labels = load_iris(return_X_y = True, as_frame=True)

In [None]:
# Hold out 20% of the iris rows for testing; random_state=42 makes the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Iris model: min-max scaling followed by a depth-3 decision tree with a fixed seed.
iris_steps = [
    ("scaling", MinMaxScaler()),
    ("clf", DecisionTreeClassifier(max_depth=3, random_state=42)),
]
dt_pipeline = Pipeline(steps=iris_steps)

In [None]:
# Fit the scaler and the tree on the training split only.
dt_pipeline.fit(train_features, train_labels)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Predict on the held-out split with the fitted pipeline and plot the resulting confusion matrix.
ConfusionMatrixDisplay.from_estimator(dt_pipeline, test_features, test_labels)

In [None]:
# Per-class metrics for the iris predictions on the held-out split.
test_predictions = dt_pipeline.predict(test_features)
iris_report = classification_report(test_labels, test_predictions)
print(iris_report)

In [None]:
# Load iris again in its default form; `data.target_names` is used to label the tree plot below.
data = load_iris()

In [None]:
from sklearn import tree

plt.figure(figsize=(20,8), facecolor="w")

# Feature and class names are passed as plain lists, the same way the
# export_text call in this notebook passes list(features.columns). Lists are
# accepted by every scikit-learn release, whereas a pandas Index / NumPy array
# has been rejected by parameter validation in some releases.
# dt_pipeline[-1] is the fitted tree, i.e. the last step of the pipeline.
a = tree.plot_tree(dt_pipeline[-1],
                   feature_names=list(features.columns),
                   class_names=list(data.target_names),
                   rounded=True,
                   filled=True)

plt.show()

In [None]:
# Text rendering of the fitted tree's decision rules, labelled with the iris column names.
fitted_tree = dt_pipeline[-1]
iris_feature_names = list(features.columns)
tree_rules = tree.export_text(fitted_tree, feature_names=iris_feature_names)

print(tree_rules)

In [None]:
# Feature importances of the fitted tree, rounded to 3 decimals and listed
# from most to least important.
rounded_importances = np.round(dt_pipeline[-1].feature_importances_, 3)
importance = pd.DataFrame(
    {
        'feature': features.columns,
        'importance': rounded_importances,
    }
).sort_values('importance', ascending=False)
print(importance)