# MSDS 7331 Data Mining: 1st decision tree model

***  

Team: Andrew Abbott, Vivek Bejugama, Patrick McDevitt, Preeti Swaminathan



### Data Loading

#### Import Required Packages

In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter('ignore',DeprecationWarning)
import seaborn as sns
import time

#### Import Data from .csv file

In [83]:
data_dir = '../data/'
data_file = 'mashable_clean_dataset_for_LR_and_SVM.csv'

file_2_read = data_dir + data_file
df = pd.read_csv(file_2_read)  

#### Strip leading spaces from column names

An initial summary of the dataset attribute follows along with the simple statistics of the numeric attributes.

In [84]:
#df.info()

In [85]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
n_tokens_title,39644.0,10.398749,2.114037,2.0,9.0,10.0,12.0,23.0
num_keywords,39644.0,7.223767,1.90913,1.0,6.0,7.0,9.0,10.0
data_channel_is_lifestyle,39644.0,0.052946,0.223929,0.0,0.0,0.0,0.0,1.0
data_channel_is_entertainment,39644.0,0.178009,0.382525,0.0,0.0,0.0,0.0,1.0
data_channel_is_socmed,39644.0,0.058597,0.234871,0.0,0.0,0.0,0.0,1.0
kw_avg_min,39644.0,312.366967,620.783887,-1.0,141.75,235.5,357.0,42827.857143
kw_max_max,39644.0,752324.066694,214502.129573,0.0,843300.0,843300.0,843300.0,843300.0
kw_avg_max,39644.0,259281.938083,135102.247285,0.0,172846.875,244572.222223,330980.0,843300.0
weekday_is_monday,39644.0,0.16802,0.373889,0.0,0.0,0.0,0.0,1.0
weekday_is_tuesday,39644.0,0.186409,0.389441,0.0,0.0,0.0,0.0,1.0


# Model Creation

## Training and Testing Split


For training and testing purposes, we use 80% of the observations for training and 20% for testing. This process is repeated three times and uses the shuffle split cross validation method built into scikit-learn.

In [86]:
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn import metrics as mt
from sklearn.svm import SVC

df_tree = df.copy()

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ...  dataframe in which to record results of model metrics
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

results_table_labels = ['n_features', 'max_depth', 'process_time', 'accuracy', 'recall', 'precision', 'f1_score']
df_results = pd.DataFrame(columns = results_table_labels)

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ...  separate X and y matrices 
# ...
# ...  convert to numpy matrices by calling 'values' on the pandas data frames
# ...  they are now simple matrices for compatibility with scikit-learn
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

if 'popular' in df_tree:
    y = df_tree['popular'].values         # set 'popular' as dependent
    del df_tree['popular']                # remove from dataset
    X = df_tree.values                    # use everything else for independent EVs

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# ...  setup cross-validation in sklearn
# ...
# ...  split into training and test sets
# ....  --> 3 folds
# ...   --> 80% / 20% training / test
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=

num_cv_iterations = 3

num_instances = len(y)

cv_object = ShuffleSplit(n_splits = num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)



***  

## c. __Decision Tree__   

*** 



First, to prepare the data for the Decision Tree classifier, the data is split into training and test using the indices in the Cross Validation object.

In [87]:
for train_indices, test_indices in cv_object.split(X, y): 
    X_train = X[train_indices]
    y_train = y[train_indices]    
    X_test = X[test_indices]
    y_test = y[test_indices]

In [88]:

print('\n----------------------------------------------------')
print('\tDecision Tree - --- --- --- --- --- --- --- ---')
print('----------------------------------------------------')
 

for idepth in range(0, 20):
    
    max_tree_depth = (idepth + 1)
    tic = time.clock()

# ...  get classifier object

    classifier_tree = tree.DecisionTreeClassifier(max_depth = max_tree_depth)

    classifier_tree = classifier_tree.fit(X_train, y_train)
    
    # ... print out process time

    toc = time.clock()
    
# ... test set predictions

    y_hat = classifier_tree.predict(X_test)

# ... model classification metrics

    acc = mt.accuracy_score(y_test, y_hat)
    rec = mt.recall_score(y_test, y_hat)
    pre = mt.precision_score(y_test, y_hat)
    f1s = mt.f1_score(y_test, y_hat)
    conf = mt.confusion_matrix(y_test, y_hat)

    print('\n----------------------------------------------------')
    print('Accuracy  = %9.3f' % acc )
    print('Recall    = %9.3f' % rec )
    print('Precision = %9.3f' % pre )
    print('F1 Score  = %9.3f' % f1s )
    print('\n----------------------------------------------------')
    print('Confusion matrix\n', conf)
    print('----------------------------------------------------')

# ... add model metrics to results data frame

    new_row = [len(df_tree.columns), max_tree_depth, toc-tic, acc, rec, pre, f1s]

    df_results.loc[len(df_results)] = new_row

    print(df_results)


----------------------------------------------------
	Decision Tree - --- --- --- --- --- --- --- ---
----------------------------------------------------

----------------------------------------------------
Accuracy  =     0.597
Recall    =     0.575
Precision =     0.603
F1 Score  =     0.588

----------------------------------------------------
Confusion matrix
 [[2450 1506]
 [1689 2284]]
----------------------------------------------------
   n_features  max_depth  process_time  accuracy   recall  precision  f1_score
0        39.0        1.0      0.088838  0.597049  0.57488   0.602639  0.588432

----------------------------------------------------
Accuracy  =     0.611
Recall    =     0.490
Precision =     0.647
F1 Score  =     0.558

----------------------------------------------------
Confusion matrix
 [[2895 1061]
 [2026 1947]]
----------------------------------------------------
   n_features  max_depth  process_time  accuracy    recall  precision  \
0        39.0        1.0 


----------------------------------------------------
Accuracy  =     0.627
Recall    =     0.607
Precision =     0.634
F1 Score  =     0.620

----------------------------------------------------
Confusion matrix
 [[2563 1393]
 [1563 2410]]
----------------------------------------------------
    n_features  max_depth  process_time  accuracy    recall  precision  \
0         39.0        1.0      0.088838  0.597049  0.574880   0.602639   
1         39.0        2.0      0.155376  0.610670  0.490058   0.647274   
2         39.0        3.0      0.210621  0.631353  0.560282   0.654321   
3         39.0        4.0      0.278757  0.630092  0.645356   0.627202   
4         39.0        5.0      0.378564  0.638416  0.584948   0.656126   
5         39.0        6.0      0.408506  0.645983  0.637553   0.649487   
6         39.0        7.0      0.501621  0.634885  0.607098   0.643887   
7         39.0        8.0      0.569203  0.642073  0.627737   0.647288   
8         39.0        9.0      0.634935 


----------------------------------------------------
Accuracy  =     0.593
Recall    =     0.566
Precision =     0.600
F1 Score  =     0.582

----------------------------------------------------
Confusion matrix
 [[2455 1501]
 [1726 2247]]
----------------------------------------------------
    n_features  max_depth  process_time  accuracy    recall  precision  \
0         39.0        1.0      0.088838  0.597049  0.574880   0.602639   
1         39.0        2.0      0.155376  0.610670  0.490058   0.647274   
2         39.0        3.0      0.210621  0.631353  0.560282   0.654321   
3         39.0        4.0      0.278757  0.630092  0.645356   0.627202   
4         39.0        5.0      0.378564  0.638416  0.584948   0.656126   
5         39.0        6.0      0.408506  0.645983  0.637553   0.649487   
6         39.0        7.0      0.501621  0.634885  0.607098   0.643887   
7         39.0        8.0      0.569203  0.642073  0.627737   0.647288   
8         39.0        9.0      0.634935 

In [62]:
type(list(set(df.popular)))

y_hat

y_train

array([False, False, False, ...,  True, False,  True], dtype=bool)

In [59]:
type(df_tree.columns)

pandas.core.indexes.base.Index

In [67]:
import graphviz 

dot_data = tree.export_graphviz(classifier_tree, out_file = None) 

graph = graphviz.Source(dot_data) 

graph.render("mashable") 

dot_data = tree.export_graphviz(classifier_tree, out_file = None, 
                        feature_names = df_tree.columns,
                        class_names = ['popular', 'not popular'],
                        filled = True, rounded = True,  
                        special_characters = True)  
graph = graphviz.Source(dot_data)  

graph.render("mashable")


'mashable.pdf'