# Decision Tree

## Base implementation

In [12]:
#Importing modules
%matplotlib inline

import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_pipeline_imb

In [13]:
# Read the training set; the first CSV column becomes the row index.
df_train = pd.read_csv('../data/training.csv', index_col=0)

# 'article_words' is the model input and 'topic' the target, both as arrays.
X_train, y_train = df_train['article_words'].values, df_train['topic'].values

# Sanity check: the rest of the notebook expects exactly 9500 training rows.
assert len(X_train) == 9500
assert len(y_train) == 9500

In [14]:
# Topics that the label-restricted metrics are computed over: every label
# seen in training except 'IRRELEVANT'.
labels = set(y_train)
labels.remove('IRRELEVANT')


def _topic_scorer(metric, average):
    """Wrap `metric` as a scorer limited to `labels` with the given averaging."""
    return make_scorer(metric, labels=list(labels), average=average)


# Macro-averaged F1 over the kept labels; used as the refit criterion.
custom_f1 = _topic_scorer(f1_score, 'macro')

# Metrics recorded for every grid-search candidate. The un-suffixed entries
# are micro-averaged, the '_macro' ones macro-averaged.
scorer = {
    'accuracy': 'accuracy',
    'precision': _topic_scorer(precision_score, 'micro'),
    'recall': _topic_scorer(recall_score, 'micro'),
    'f1': _topic_scorer(f1_score, 'micro'),
    'precision_macro': _topic_scorer(precision_score, 'macro'),
    'recall_macro': _topic_scorer(recall_score, 'macro'),
    'f1_macro': _topic_scorer(f1_score, 'macro'),
    'custom': custom_f1,
}

### Set up the pipeline

In [18]:
# Pipeline stages, in order: TF-IDF features -> SMOTE oversampling -> decision tree.
# Step names are derived from the class names ('tfidfvectorizer', 'smote',
# 'decisiontreeclassifier'), which is what the grid-search keys refer to.
pipeline_steps = (
    TfidfVectorizer(),
    SMOTE(random_state=0, n_jobs=-1),
    # min_samples_split here is only a starting value; the grid search
    # overrides it with its own candidates.
    DecisionTreeClassifier(criterion='gini', random_state=0, min_samples_split=15),
)
clf = make_pipeline_imb(*pipeline_steps)

In [22]:


# Hyper-parameter grid, keyed as '<pipeline step>__<parameter>'.
# 1 * 1 * 2 * 2 vectorizer settings x 4 * 3 * 2 tree settings = 96 candidates.
tfidf_grid = {
    'tfidfvectorizer__max_features': [20_000],
    'tfidfvectorizer__max_df': [0.5],
    'tfidfvectorizer__sublinear_tf': (True, False),
    'tfidfvectorizer__lowercase': (True, False),
}
tree_grid = {
    'decisiontreeclassifier__min_samples_leaf': [10, 12, 15, 20],
    'decisiontreeclassifier__min_samples_split': [2, 4, 6],
    'decisiontreeclassifier__splitter': ('best', 'random'),
}
parameters = {**tfidf_grid, **tree_grid}


In [23]:
# Exhaustive search over `parameters` using 3-fold cross-validation.
# Every metric in `scorer` is recorded for both the training and validation
# folds; the 'custom' metric decides which candidate is refit as the winner.
search_options = dict(
    cv=3,
    n_jobs=-1,
    verbose=5,
    scoring=scorer,
    refit='custom',
    return_train_score=True,
)
gs_clf = GridSearchCV(clf, parameters, **search_options)

# Left as the last expression so the fitted search object is displayed.
gs_clf.fit(X_train, y_train)



Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 53.7min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 86.0min
[Parallel(n_jobs=-1)]: Done 288 out of 288 | elapsed: 87.7min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidfvectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                               

In [24]:
# Report the winning candidate: its score on the refit metric and its settings.
summary_lines = (
    f'Best cross-validation score: {gs_clf.best_score_:.2f}',
    f'Best params: {gs_clf.best_params_}',
)
print(*summary_lines, sep='\n')

Best cross-validation score: 0.48
Best params: {'decisiontreeclassifier__min_samples_leaf': 15, 'decisiontreeclassifier__min_samples_split': 2, 'decisiontreeclassifier__splitter': 'best', 'tfidfvectorizer__lowercase': True, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__max_features': 20000, 'tfidfvectorizer__sublinear_tf': False}


### Refitting the best estimator found by grid search

In [27]:
# The search was created with refit='custom', so GridSearchCV has already
# retrained the best pipeline on the full (X_train, y_train) set. Calling
# .fit() on it again would only repeat the TF-IDF + SMOTE + tree training
# (SMOTE and the tree both use random_state=0), so the estimator is used as-is.
best_clf = gs_clf.best_estimator_

# Display the fitted pipeline.
best_clf

Pipeline(memory=None,
         steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.5, max_features=20000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 toke...
                       sampling_strategy='auto', svm_estimator='deprecated')),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                

### Saving the results to a dataframe and pickling the model

In [35]:
# Collect the cross-validation statistics of the winning candidate into a
# one-row dataframe, then pickle both that dataframe and the fitted pipeline.
model_name = 'decision_tree'

# GridSearchCV exposes the row of cv_results_ that belongs to the best
# candidate directly; no need to search the params list for it.
best_idx = gs_clf.best_index_

d = {
    'model': model_name,
}

# One column per (statistic, fold type, metric),
# e.g. 'mean_test_f1_macro' or 'std_train_accuracy'.
for metric in scorer:
    for dataset in ['train', 'test']:
        for stat in ['mean', 'std']:
            metric_item = f'{stat}_{dataset}_{metric}'
            d[metric_item] = gs_clf.cv_results_[metric_item][best_idx]

df_models_clf = pd.DataFrame(d, index=[0])

# Written relative to the notebook, like the '../data/training.csv' read,
# instead of a user-specific absolute path.
# NOTE(review): assumes '../data/pickles' is the same directory the previous
# absolute path (.../COMP9417-project/data/pickles) pointed to -- confirm.
pickle_dir = Path('../data/pickles')
pickle_dir.mkdir(parents=True, exist_ok=True)

with open(pickle_dir / f'best_{model_name}.pickle', 'wb') as output:
    pickle.dump(best_clf, output)

with open(pickle_dir / f'df_results_{model_name}.pickle', 'wb') as output:
    pickle.dump(df_models_clf, output)

df_models_clf

Unnamed: 0,model,mean_train_accuracy,std_train_accuracy,mean_test_accuracy,std_test_accuracy,mean_train_precision,std_train_precision,mean_test_precision,std_test_precision,mean_train_recall,...,mean_test_recall_macro,std_test_recall_macro,mean_train_f1_macro,std_train_f1_macro,mean_test_f1_macro,std_test_f1_macro,mean_train_custom,std_train_custom,mean_test_custom,std_test_custom
0,decision_tree,0.779736,0.00795,0.657263,0.003263,0.696194,0.006954,0.531755,0.005636,0.820184,...,0.56114,0.010307,0.694542,0.010679,0.481532,0.00229,0.694542,0.010679,0.481532,0.00229
