# Comparative Study

### Investigate the performance of a number of Decision Trees vs KNN

In [348]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
# Set notebook-wide figure defaults rather than opening a figure here: a bare
# figure() call in the import cell only creates an empty canvas that is never
# drawn on (it shows up as "<Figure size 640x480 with 0 Axes>").
plt.rcParams.update({
    'figure.figsize': (8, 6),
    'figure.dpi': 80,
    'figure.facecolor': 'w',
    'figure.edgecolor': 'k',
})

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

**Read the dataset `mushrooms.csv` and set aside a random hold-out test set**

In [None]:
# Raw mushroom data, fetched as CSV straight from the GitHub repository.
training_dataset = "https://raw.githubusercontent.com/bntumb/edible-or-not/main/mushrooms.csv"
tra_data = pd.read_csv(filepath_or_buffer=training_dataset)

import pandas as pd
import numpy as np

# Reproducible ~70/30 split: a dedicated, seeded generator makes a fresh
# "Restart & Run All" produce the same train / hold-out partition every time.
split_rng = np.random.default_rng(42)
msk = split_rng.random(len(tra_data)) <= 0.7

# No random 'split' column is written into tra_data: it was never used to make
# the split, and because pp_data keeps every column except 'class' it would be
# fed to the classifiers as a pure-noise feature (and written to output.csv).
# .copy() gives each subset its own data, so adding a column to one of them
# later neither affects tra_data nor triggers SettingWithCopyWarning.
train_set = tra_data[msk].copy()
hold_out_test_set = tra_data[~msk].copy()

train_set.head()

**Check for null values**

In [None]:
# Number of missing entries in each column of the training subset.
train_set.isnull().sum()

### Pre-process data

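Data preprocessing happens here: the feature columns are ordinal-encoded, standardized, and passed through a variance-threshold filter, while the `class` column is label-encoded.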

In [351]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold


from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV



In [352]:

def pp_data(dataset, transformers=None):
    """Turn a raw mushroom DataFrame into numeric model inputs.

    Every column except 'class' is ordinal-encoded, standardized and passed
    through a variance filter; the 'class' column is label-encoded.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding a 'class' column plus the feature columns.
    transformers : dict, optional
        The 'transformers' entry returned by an earlier call. When supplied,
        those already-fitted objects are reused (transform only), so a second
        frame -- e.g. a hold-out set -- is encoded, scaled and column-filtered
        exactly like the frame they were fitted on. When omitted, fresh
        transformers are fitted on `dataset` (the original behaviour).

    Returns
    -------
    dict
        'features'     : array of the encoded, scaled and selected features
        'class'        : array of encoded class labels
        'transformers' : {'feature_pipeline': fitted Pipeline,
                          'label_encoder': fitted LabelEncoder}
    """
    raw_features = dataset.drop('class', axis=1)

    if transformers is None:
        # Same three steps, in the same order, as fitting them one by one;
        # the Pipeline just keeps the fitted state together so it can be
        # returned instead of thrown away.
        feature_pipeline = Pipeline([
            ('encode', OrdinalEncoder()),
            ('scale', StandardScaler()),
            # NOTE(review): the columns are already standardized at this
            # point, so this threshold should in effect only drop constant
            # columns -- confirm that is the intent.
            ('select', VarianceThreshold(threshold=(.8 * (1 - .8)))),
        ])
        feature_pipeline.fit(raw_features)
        transformers = {
            'feature_pipeline': feature_pipeline,
            'label_encoder': LabelEncoder().fit(dataset['class']),
        }

    return {
        'features': transformers['feature_pipeline'].transform(raw_features),
        'class': transformers['label_encoder'].transform(dataset['class']),
        'transformers': transformers,
    }

In [353]:
# Preprocess the training subset only. Using the full tra_data here would put
# the hold-out rows into X_train / X_test as well, so the later "hold-out"
# prediction would be made on rows the model had already been fitted on.
scaled_training_data = pp_data(train_set)
training_features = scaled_training_data['features']
training_class = scaled_training_data['class']

### Split data

This step uses scikit-learn's `model_selection` module to split the training data into a training set and a test set. The test set is a random sample containing 20% of the rows (`test_size=0.20`).

In [354]:
from sklearn.model_selection import train_test_split
# Fixed random_state so the 80/20 split -- and every score computed from it --
# is the same on each fresh run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    training_features, training_class, test_size=0.20, random_state=42
)

### Train model

In [355]:
from sklearn.model_selection import cross_val_predict


In [356]:
def train_model(classifier, features=None, labels=None):
    """Fit `classifier` and return the result of its ``fit`` call.

    Parameters
    ----------
    classifier : object
        Anything exposing ``fit(X, y)``.
    features, labels : array-like, optional
        Training inputs and targets. Each one falls back to the notebook-level
        ``X_train`` / ``y_train`` when left as None, so ``train_model(clf)``
        behaves exactly as before.

    Returns
    -------
    Whatever ``classifier.fit`` returns (scikit-learn estimators return
    themselves, fitted).
    """
    # The globals are only looked up when actually needed, so the function can
    # also be used on data other than the notebook's main split.
    if features is None:
        features = X_train
    if labels is None:
        labels = y_train
    return classifier.fit(features, labels)

### Evaluate model
This section defines the metrics used to evaluate the predictions: the confusion matrix, precision, recall, accuracy, and the full classification report.

In [357]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import metrics

def eval_model(prediction, y_true=None):
    """Print evaluation metrics for a set of predicted labels.

    Prints, in order: the confusion matrix, precision, recall, accuracy and
    scikit-learn's classification report (precision and recall use the
    library's default settings).

    Parameters
    ----------
    prediction : array-like
        Predicted labels.
    y_true : array-like, optional
        True labels to compare against. Falls back to the notebook-level
        ``y_test`` when left as None, so ``eval_model(pred)`` behaves exactly
        as before.

    Returns
    -------
    None
    """
    if y_true is None:
        y_true = y_test
    print(confusion_matrix(y_true, prediction))
    print("Precision:",metrics.precision_score(y_true, prediction))
    print("Recall:",metrics.recall_score(y_true, prediction))
    print("accuracy: ",accuracy_score(y_true, prediction))
    print("full report")
    print(classification_report(y_true, prediction))

## Building Decision Tree Model

In [358]:
from sklearn.tree import DecisionTreeClassifier
# random_state=0 matches the pruned trees built further down and makes the
# fitted tree (and the pruning path derived from it) the same on every run.
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf = train_model(dt_clf)


In [359]:
# Predict class labels for the test split with the decision tree fitted above.
y_pred = dt_clf.predict(X_test)


In [None]:
# Draw the fitted tree on a 15x10 inch canvas with colour-filled nodes.
fig, ax = plt.subplots(figsize=(15, 10))
tree.plot_tree(dt_clf, filled=True, ax=ax)
plt.show()

In [None]:
# Print confusion matrix, precision, recall, accuracy and the full report
# for the decision tree's test-split predictions.
eval_model(y_pred)

### Post-prune the tree
Pruning the fully grown tree improves the model's ability to generalise.

In [None]:
# Cost-complexity pruning path of the fitted tree on the training split:
# the candidate alpha values and the impurities reported alongside them.
path = dt_clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
ccp_alphas

In [363]:
# One decision tree per candidate alpha, each fitted on the training split.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]

In [364]:
# Accuracy of every pruned candidate on the training and the test split,
# kept in the same order as clfs / ccp_alphas.
train_scores = []
test_scores = []
for fitted_tree in clfs:
    train_scores.append(fitted_tree.score(X_train, y_train))
    test_scores.append(fitted_tree.score(X_test, y_test))

In [365]:
# Final pruned tree.
# NOTE(review): the alpha is hard-coded; presumably it was picked by comparing
# train_scores / test_scores for the candidates above -- confirm and record how.
pruned_alpha = 0.012
dt_pruned_clf = DecisionTreeClassifier(ccp_alpha=pruned_alpha, random_state=0)
dt_pruned_clf.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.012, random_state=0)

In [None]:
# Predict the test split with the pruned tree and print its metrics.
pred=dt_pruned_clf.predict(X_test)
eval_model(pred)

In [None]:
# Draw the pruned tree. plt.show() ends the cell so that only the figure is
# rendered; without it the cell's last expression is plot_tree's return value
# (a list of annotation objects), whose repr gets dumped into the output.
plt.figure(figsize=(15,10))
tree.plot_tree(dt_pruned_clf,filled=True)
plt.show()

## Building KNN Classifier

In [368]:
from sklearn.neighbors import KNeighborsClassifier

In [369]:
# k-nearest-neighbours classifier with 5 neighbours, fitted on the same
# training split as the decision trees.
knn_clf = KNeighborsClassifier(n_neighbors = 5)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier()

In [370]:
# Predict class labels for the test split with the KNN classifier.
knn_pred = knn_clf.predict(X_test)



In [371]:
# Show the (abbreviated) array of KNN predictions.
print(knn_pred)

[1 1 1 ... 1 1 1]


In [None]:
# Print confusion matrix, precision, recall, accuracy and the full report
# for the KNN predictions on the test split.
eval_model(knn_pred)

## Prediction on a hold-out test set

In [None]:
# Preprocess the hold-out rows with pp_data and keep the feature matrix.
# NOTE(review): pp_data is called here with the hold-out frame only, so its
# encoders, scaler and variance filter are fitted afresh on these rows; the
# resulting codes / scaling / selected columns are not guaranteed to match
# what the classifiers were trained on -- confirm this is acceptable.
test_dataset = hold_out_test_set  # same object as hold_out_test_set, not a copy
test_set = pp_data(test_dataset)

test_features = test_set['features']
test_dataset.head()

In [None]:
# Classify the hold-out rows with the pruned decision tree.
# The predictions are the integer codes produced by pp_data's label encoding,
# not the original class strings.
test_set_pred = dt_pruned_clf.predict(test_features)

print(test_set_pred)

# Attach the predictions with assign(), which returns a new frame. Writing the
# column in place would modify hold_out_test_set itself (test_dataset is only
# another name for it), which can raise SettingWithCopyWarning and leaves a
# CLASS_PREDICTIONS column that pp_data would treat as a feature if the
# hold-out cells were run again.
test_dataset = test_dataset.assign(CLASS_PREDICTIONS=test_set_pred)
test_dataset.to_csv("output.csv", index=False, header=True)