In [1]:
# Load Libraries

In [2]:
import os
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
import graphviz
from sklearn.tree import export_graphviz    
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV


#   Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
# Load Modules

In [4]:
os.chdir(r'/home/ccirelli2/Desktop/Docket-Sheet-Classification/Modules')
import Step4_Module_Machine_Learning_Algorithms as stp4

In [5]:
#   Simple Decision Tree

def simple_decision_tree(Features, Targets, Max_Depth , TrainTest = 'Train', Score = 'Precision'):
    X_train, X_test, y_train, y_test = train_test_split(Features, Targets)
    clf = DecisionTreeClassifier(max_depth = Max_Depth, random_state = 50)
    
    # Fit Algorithm
    clf.fit(X_train, y_train)
    
    #   Training Set
    clf_pred = clf.predict(X_train)
    report_train = sklearn.metrics.classification_report(y_train, clf_pred)
    matrix_train = sklearn.metrics.confusion_matrix(y_train, clf_pred)
    accuracy_train = sklearn.metrics.accuracy_score(y_train, clf_pred)
    
    # Test
    clf_pred = clf.predict(X_test)
    report_test = sklearn.metrics.classification_report(y_test, clf_pred)
    matrix_test = sklearn.metrics.confusion_matrix(y_test, clf_pred)
    accuracy_test = sklearn.metrics.accuracy_score(y_test, clf_pred)
        
  
    # Print Results
            
    if Score == 'Accuracy':
        if TrainTest == 'Train':
            Accuracy_score = round(accuracy_train, 2)
            return Accuracy_score
        
        elif TrainTest == 'Test':
            Accuracy_score = round(accuracy_test, 2)
            return Accuracy_score
        
    # Break

In [27]:
def make_predictions_decisionTree(Target_dir, Depth = 8, Ngram = 'Nograms', Write2Excel = False, Destination_location = None):
    '''Documentation
    
    Input   = i.) Target_dir = location where our docketsheet key word frequencies are located. 
     
    '''
    # Dictionary to house values
    Dict = {}
    
    # Change Directory
    os.chdir(Target_dir)
    
    #Loop over files
    for file in os.listdir():
        
        # If Ngram in file
        if Ngram in file:
            # Mark start of process
            print('Generating prediction for =>', '\n', file, '\n')
            # Get Features & Targets
            Features = stp4.get_feature_target_dataframes(file, dataset = 'Features')
            Targets  = stp4.get_feature_target_dataframes(file, dataset = 'Targets')
            # Generate Prediction
            Accuracy_train = simple_decision_tree(Features, Targets, 
                                                  Max_Depth = Depth, TrainTest = 'Train', 
                                                  Score = 'Accuracy')
            
            Accuracy_test = simple_decision_tree(Features, Targets, 
                                                  Max_Depth = Depth, TrainTest = 'Test', 
                                                  Score = 'Accuracy')
            
            # Create Dictionary
            Dict[file] = (Accuracy_train, Accuracy_test)
    
    # Create Dataframe
    df = pd.DataFrame(Dict)
    df_transpose = df.transpose()
    df_rename_cols = df_transpose.rename(index = str, columns = {0: 'Accuracy_train', 
                                                                 1: 'Accuracy_test'}) 
    df_final = df_rename_cols.sort_values(by = 'Accuracy_test', ascending = False)
    
    # Write to Excel
    if Write2Excel == True:
        print('Writing dataframe to Excel')
        os.chdir(Destination_location)
        File_name = 'Decision Tree Results for' + '_' + Ngram
        print('File name => ' + File_name)
        stp4.write_to_excel(df_final, Destination_location, File_name)
        print('Your file has been saved to =>  ', Destination_location, '\n', '\n')
        # Otherwise, return the dataframe to the user.    
    else:
        return df_final

In [30]:
Target_dir = r'/home/ccirelli2/Desktop/Docket-Sheet-Classification/Results_Docketsheet_wordMatches'
Destination = r'/home/ccirelli2/Desktop/Docket-Sheet-Classification/Results_ML_Models'
Ngram_options = ['Nograms', 'Bigrams', 'Trigrams', 'Quadgrams']

for option in Ngram_options:
    Prediction  = make_predictions_decisionTree(Target_dir, 
                                                Depth = 15, 
                                                Ngram = option, 
                                                Write2Excel = True, 
                                                Destination_location = Destination)


Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationII_AVG_not_zero_Top15_highest_STDV.xlsx.xlsx 



  'precision', 'predicted', average, warn_for)


Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationII_AVG_not_zero_Top5_highest_STDV_AVG_below_20prct.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationIII_Correlation_Coefficient_Top15_highest_COCOEF.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationII_AVG_not_zero_Top5_lowest_STDV_highest_AVG.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationII_AVG_not_zero_Top5_highest_STDV_lowest_AVG.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationI_homebrew_STDV_Top5_highest_STDV_AVG_below_20prct.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationI_homebrew_STDV_Top5_highest_STDV_lowest_AVG.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationIII_Correlation_Coefficient_Top5_lowest_COCOEF_highest_AVG.xl

  'recall', 'true', average, warn_for)


Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Bigrams_CalculationII_AVG_not_zero_Top15_highest_STDV.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Bigrams_CalculationII_AVG_not_zero_Top5_lowest_STDV_highest_AVG.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Bigrams_CalculationI_homebrew_STDV_Top15_highest_STDV.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Bigrams_CalculationIII_Correlation_Coefficient_Top5_lowest_COCOEF_highest_AVG.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Bigrams_CalculationI_homebrew_STDV_Top5_highest_STDV_AVG_below_20prct.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Bigrams_CalculationI_homebrew_STDV_Top5_lowest_STDV_highest_AVG.xlsx.xlsx 

Writing dataframe to Excel
File name => Decision Tree Results for_Bigrams
Your file has been saved to =>   /home/ccirelli2/Desktop/Docket-Sheet-Classification/R

In [14]:
'''OBSERVATIONS

Feature_selection:      A depth of 30 appears to provide the best results. 
Precision & Recall:     The best so far as been .89 and .88. 
Stages:                 Stage 4 and 8 continue to perform the worst. 
Stage4:                 Incorrectly predicts stages 7 and 8.
                        Overlap:  only 1 word overlaps Stages 4 and 7 'Civil'. 
Stage8:                 Incorrectly predicts 4 and 5. 
                        Overlap:  There is significant overlap between 8 and 4. Three words overlap in Word Group 2, which 
                        is where the AVG is between 1-2% and CV is the highest. Two words overlap in Word Group 3, which 
                        is our contrarian word group. 

Overall                 VAR is not a good measurement for identifying the top 5 words for our stage 1 and 2 as it removes
                        the sign from our frequencies such that a word with a large negative deviation could get put into
                        this group.  If we were only grabbing the top 15 words using VAT and or STDV we would probably be 
                        ok.  
                        AVG of all other time periods runs into issues as many of the columns may have a 0% but others 
                        could have a frequency at or higher than our target time period.  Therefore, we might think about
                        eliminating the columns with 0% when calculating our average.  This would force the code to 
                        recognize a higher average for the other time periods. 

Thoughts:               1.) Amend the Top5 selection code to calculate the AVG not using any of the Stages with 0%. 
                        2.) Revert back to the old calculation which was ((Target/Avg)*Target) as it is sign sensitive    
'''

"OBSERVATIONS\n\nFeature_selection:      A depth of 30 appears to provide the best results. \nPrecision & Recall:     The best so far as been .89 and .88. \nStages:                 Stage 4 and 8 continue to perform the worst. \nStage4:                 Incorrectly predicts stages 7 and 8.\n                        Overlap:  only 1 word overlaps Stages 4 and 7 'Civil'. \nStage8:                 Incorrectly predicts 4 and 5. \n                        Overlap:  There is significant overlap between 8 and 4. Three words overlap in Word Group 2, which \n                        is where the AVG is between 1-2% and CV is the highest. Two words overlap in Word Group 3, which \n                        is our contrarian word group. \n\nOverall                 VAR is not a good measurement for identifying the top 5 words for our stage 1 and 2 as it removes\n                        the sign from our frequencies such that a word with a large negative deviation could get put into\n                     