# Predicting Lymphography

*TODO: insert an explanation of the use case.*

# DECISION TREE

In [1]:
#Decision Tree - one of the most widely used models in real-life situations
#Can be used for regression tasks
#as well as for classification tasks
#Greedy algorithm - at each node, choose the variable that splits the data as cleanly as possible


#min_samples_split - if a node has fewer than this number of samples, don't split it
#The deeper the tree, the more specific its rules
#Higher depth = more prone to overfitting

## Import Libraries

### Standard Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

%matplotlib inline

### Additional Libraries

In [3]:
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split

## Load and Explore the Dataset

<a href="https://archive.ics.uci.edu/ml/datasets/Lymphography">**Lymphography Dataset**</a>

In [None]:
#Read the Lymphography data from a CSV file located in the working directory.
#Later cells expect the file to provide a "class" column (the target).
df = pd.read_csv(filepath_or_buffer="lymphography.csv")

In [None]:
#Check if the dataset loaded correctly by previewing its first rows
df.head()

In [None]:
#Assess the dataset using the .describe() function.
#include="all" asks for a summary of every column, not only the numeric ones.
df.describe(include="all")

In [None]:
#Check the size of the dataset: .shape is a (rows, columns) tuple,
#i.e. (observations, columns). The column count still includes the "class" target.
df.shape

In [None]:
#Check which classes the target column contains and how many observations
#fall into each of them
df["class"].value_counts()

## Build the Model

In [None]:
#Separate the target column ("class") from the predictor columns.
y = df["class"]
X = df.drop(columns=["class"])

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
#Decision tree configuration evaluated by the cross-validation cells below.
#NOTE(review): min_samples_leaf=5 already means a node needs at least 10 samples
#to be split, so min_samples_split=4 looks redundant - confirm this is intended.
clf = tree.DecisionTreeClassifier(
    criterion="gini",
    max_depth=10,
    min_samples_split=4,
    min_samples_leaf=5,
    random_state=25,
)

In [None]:
#Score of the tree on each of the 5 cross-validation folds
print(cross_val_score(estimator=clf, X=X, y=y, cv=5))

In [None]:
#Average of the 5 fold scores (the cross-validation is run again here)
print(cross_val_score(clf, X, y, cv=5).mean())

In [None]:
#Hold out 25% of the observations as a test set; random_state fixes the
#shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=30,
)

In [None]:
#Check shape to make sure it is all in order: X_train/y_train should have the
#same number of rows, and so should X_test/y_test
X_train.shape, X_test.shape, y_train.shape, y_test.shape

### Train the Model

<a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html">**Decision Tree Classifier**</a>

In [None]:
#Instantiate the algorithm and train it on the training split in one step.
#fit() returns the estimator itself, so clf ends up bound to the trained model.
clf = tree.DecisionTreeClassifier(
    criterion="gini",
    min_samples_split=4,
    min_samples_leaf=5,
    max_depth=10,
    random_state=25,
).fit(X_train, y_train)

#Show the trained estimator as the cell output
clf

In [None]:
#Contextualize the results, and keep your parameter choices in mind

## Validate the Model

In [None]:
#Predict the class of every observation in the held-out test set
y_pred = clf.predict(X_test)

In [None]:
#Performance metric: accuracy on the test set, printed with two decimals
print(format(metrics.accuracy_score(y_test, y_pred), ".2f"))

In [None]:
#Per-class precision / recall / F1 report, prefixed with the classifier's settings
print(
    "Classification report for classifier %s:\n%s\n"
    % (clf, metrics.classification_report(y_true=y_test, y_pred=y_pred))
)

In [None]:
#Raw confusion matrix: rows are the true classes, columns the predicted classes.
#The matrix is wrapped in a one-element tuple so that % formats it as one value.
print("Confusion Matrix: \n%s" % (metrics.confusion_matrix(y_test, y_pred),))

In [None]:
#Encode Confusion Matrix into a DataFrame
#confusion_matrix() orders its rows/columns by sorted label value, not by the
#order in which labels first appear in y_test. The label list is therefore taken
#from the fitted classifier and passed explicitly, so the matrix and its axis
#labels always line up (and have matching sizes even when a class does not
#occur in y_test).
labels = list(clf.classes_)
cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
#Name the axes so the table is readable on its own: rows = truth, columns = prediction
cm_df = cm_df.rename_axis(index="Actual", columns="Predicted")
cm_df

## Advanced Functions

### Feature Importance 

Feature importance evaluates how much each feature contributes to the classification task, i.e. which features contribute the most to the decisions of your decision tree classifier.

In [None]:
#Extract the feature importance from the trained classifier
#(one value per feature column used in training)
f_importance = clf.feature_importances_

In [None]:
#Pair every importance value with its feature name in a one-column DataFrame,
#which is easier to read than the bare array.
f_list = X_train.columns
df_feature_importance = pd.DataFrame({"Importance": f_importance}, index=f_list)

In [None]:
#Show the features from most important to least important.
#The importance values are derived from the tree's decision (split) nodes.
df_feature_importance.sort_values(by="Importance", ascending=False)

In [None]:
#Horizontal bar chart of the importances. The rows are sorted in ascending
#order for plotting, which places the most important feature at the top.
df_feature_importance_asc = df_feature_importance.sort_values(by="Importance", ascending=True)
df_feature_importance_asc.plot.barh(figsize=(10, 6))

### Visualize the Decision Tree

In [None]:
#StringIO comes from the standard library: sklearn.externals.six is no longer
#shipped with current scikit-learn releases, so importing it from there fails.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

In [None]:
#Get the feature/attribute column names of the training data;
#they are passed to export_graphviz further down as feature_names
feature_col = X_train.columns
print (feature_col)

In [None]:
#Get the class names for the tree plot.
#export_graphviz looks class names up by position, so it needs a list with one
#string per class in the classifier's own class order (clf.classes_). A single
#concatenated string such as "[231]" would be indexed character by character
#and label the nodes incorrectly.
class_col = [str(label) for label in clf.classes_]
print(type(class_col), class_col)

In [None]:
#Export the trained tree in Graphviz DOT format into an in-memory text buffer
dot_data = StringIO()
export_graphviz(
    clf,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
    #A plain list is passed instead of the pandas Index: some scikit-learn
    #releases validate this parameter strictly and reject an Index.
    feature_names=list(feature_col),
    class_names=class_col,
)

#Turn the DOT text into a graph and render it as a PNG shown inline.
#NOTE(review): create_png() presumably needs the Graphviz binaries installed - confirm.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())