In [1]:
# Load Libraries

In [2]:
import os
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
import graphviz
from sklearn.tree import export_graphviz    
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV


#   Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
# Load Modules

In [4]:
# Switch the working directory to the project's Modules folder so the local
# helper module below can be imported by its bare file name.
# NOTE(review): this presumably relies on the kernel having the current working
# directory on sys.path -- confirm, or add the folder to sys.path explicitly.
os.chdir(r'/home/ccirelli2/Desktop/Docket-Sheet-Classification/Modules')
# Project-local machine-learning helpers; referenced as `stp4` by the cells below.
import Step4_Module_Machine_Learning_Algorithms as stp4

In [5]:
# MAKE PREDICTIONS

In [10]:
def make_predictions_decisionTree(Target_dir, Depth = 8, KeyWord = 'Nograms', Write2Excel = False, 
                                  Destination_location = None, Iterable = True, Single_File = None, 
                                  Metric = 'Accuracy', TestTrain = None):
    '''Score a decision tree on one, or on every matching, key-word appearance file.

    Input:      i.)    Target_dir  = location where our docketsheet key word appearance dataframes are
                                     located.  The working directory is changed to this location.
                ii.)   Depth       = the depth that we want to use for our tree (forwarded to
                                     stp4.simple_decision_tree as Max_Depth).  Defaults to 8.
                iii.)  Write2Excel = if we want to write to Excel or work with the results in memory.
                                     This feature is not yet set up for the confusion matrix or class
                                     report.
                iv.)   Destination_location = where we want to write our results to.  Required when
                                     Write2Excel is set.
                v.)    Iterable    = True to process every file in Target_dir whose name contains
                                     KeyWord; False to process only Single_File.
                vi.)   Single_File = if we chose False for Iterable, the file we want to use.
                vii.)  Metric      = the metric that we want to use to gauge the performance of our
                                     model (forwarded to stp4.simple_decision_tree).  Defaults to
                                     'Accuracy'.  Can also choose 'Matrix' to return the confusion
                                     matrix.
                viii.) KeyWord     = the key word used to group the files (approaches) fed to the ML
                                     model.  Examples include the names of the ngrams ('Bigrams') or
                                     it could be STDV vs COCOEF, etc.  It is also used as the suffix
                                     of the Excel file name.
                ix.)   TestTrain   = accepted for backward compatibility; currently unused because
                                     both the train and the test score are always generated.

    Output:     When Write2Excel is not set, a dataframe indexed by file name with the columns
                'Accuracy_train' and 'Accuracy_test', sorted by 'Accuracy_test' in descending order
                (empty, with the same columns, when no file matched).  When Write2Excel is set the
                dataframe is handed to stp4.write_to_excel and None is returned.

    Raises:     ValueError if Iterable is False and no Single_File is given, or if Write2Excel is set
                without a Destination_location.

    Operations  i.)    The main operation here is either to iterate a list of files in a directory to
                       generate the predictions or to work with one file.
    '''
    # Fail early on inconsistent arguments, before any model is trained.
    if not Iterable and Single_File is None:
        raise ValueError('Single_File must be provided when Iterable is False')
    if Write2Excel and Destination_location is None:
        raise ValueError('Destination_location must be provided when Write2Excel is True')

    # Dictionary to house values: file name -> (train score, test score)
    Dict = {}

    # Change Directory.  The files are handed to stp4 by bare name, so this is kept as is.
    os.chdir(Target_dir)

    # If you are looking to iterate over an entire directory of files
    if Iterable:
        for file in os.listdir():
            # Choose the key word to group the files that are chosen by the code.
            if KeyWord in file:
                Dict[file] = _score_single_file(file, Depth, Metric)
    # If the user only wants to work with one file.
    else:
        Dict[Single_File] = _score_single_file(Single_File, Depth, Metric)

    # Create Dataframe from dictionary object.
    score_columns = {0: 'Accuracy_train', 1: 'Accuracy_test'}
    if Dict:
        # One column per file with rows 0/1 -> transpose so that each file becomes a row.
        df_scores = pd.DataFrame(Dict).transpose().rename(index = str, columns = score_columns)
    else:
        # Nothing matched KeyWord: keep the expected columns so the sort below cannot fail.
        df_scores = pd.DataFrame(columns = list(score_columns.values()))
    df_final = df_scores.sort_values(by = 'Accuracy_test', ascending = False)

    # Write to Excel
    if Write2Excel:
        print('Writing dataframe to Excel')
        os.chdir(Destination_location)
        File_name = 'Decision Tree Results for' + '_' + KeyWord
        print('File name => ' + File_name)
        stp4.write_to_excel(df_final, Destination_location, File_name)
        print('Your file has been saved to =>  ', Destination_location, '\n', '\n')
    # If the user does not want to write to Excel return to them the dataframe in memory.
    else:
        print('Results', '\n', df_final)
        return df_final


def _score_single_file(file, Depth, Metric):
    '''Return the (train, test) results of a decision tree built from one key-word file.'''
    # Mark start of process
    print('Generating prediction for =>', '\n', file, '\n')
    # Create the Feature & Target dataframes.
    Features = stp4.get_feature_target_dataframes(file, dataset = 'Features')
    Targets  = stp4.get_feature_target_dataframes(file, dataset = 'Targets')
    # Generate the Training and the Test results with the same settings.
    result_train = stp4.simple_decision_tree(Features, Targets,
                                             Max_Depth = Depth, TrainTest = 'Train',
                                             Metric = Metric)
    result_test = stp4.simple_decision_tree(Features, Targets,
                                            Max_Depth = Depth, TrainTest = 'Test',
                                            Metric = Metric)
    return result_train, result_test

In [None]:
# WORKING WITH SINGLE FILES

In [29]:
# Single-file run: score one specific key-word appearance workbook.
Target_dir = r'/home/ccirelli2/Desktop/Docket-Sheet-Classification/Results_Docketsheet_wordMatches'
Destination = r'/home/ccirelli2/Desktop/Docket-Sheet-Classification/Results_ML_Models'
Ngram_options = ['Nograms', 'Bigrams', 'Trigrams', 'Quadgrams']
MaxDepth = [10,20,30,40,50,60,70,80,90,100]

# `depth` was referenced here without ever being defined in this notebook, which
# fails with NameError on a fresh kernel.
# NOTE(review): 30 is taken from the OBSERVATIONS notes ("A depth of 30 appears to
# provide the best results") -- confirm, or loop over MaxDepth instead.
depth = 30

# The function's parameter is `KeyWord`; the former `Ngram = ...` keyword is not part
# of its signature and raised TypeError.
df_prediction = make_predictions_decisionTree(Target_dir, 
                                            Depth = depth, 
                                            KeyWord = 'Nograms',
                                            Iterable = False, 
                                            Single_File = 'DocketSheet_WordMatches_TopWords_Bigrams_CalculationI_homebrew_STDV_Top15_highest_STDV.xlsx.xlsx',
                                            Metric = 'Accuracy',
                                            Write2Excel = False, 
                                            Destination_location = Destination)
    

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Bigrams_CalculationI_homebrew_STDV_Top15_highest_STDV.xlsx.xlsx 



In [19]:
# WORKING WITH MULTIPLE FILES IN A DIRECTORY

In [11]:
# Directory run: every file under Target_dir whose name contains the key word is
# scored and the per-file results come back as one dataframe.
Target_dir  = r'/home/ccirelli2/Desktop/DocketsheetDistResults'
Destination = r'/home/ccirelli2/Desktop/Docket-Sheet-Classification/Results_ML_Models'

df_prediction = make_predictions_decisionTree(
    Target_dir,
    Iterable = True,
    KeyWord = 'Nograms',
    Depth = 35,
    Metric = 'Accuracy',
    TestTrain = 'Test',
    Write2Excel = False,
    Destination_location = Destination,
)



Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationII_AVG_not_zero_Top15_highest_STDV.xlsx.xlsx 



  'recall', 'true', average, warn_for)


Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationIII_Correlation_Coefficient_Top15_highest_COCOEF.xlsx.xlsx 

Generating prediction for => 
 DocketSheet_WordMatches_TopWords_Nograms_CalculationI_homebrew_STDV_Top15_highest_STDV.xlsx.xlsx 

Results 
                                                     Accuracy_train  \
DocketSheet_WordMatches_TopWords_Nograms_Calcul...            0.99   
DocketSheet_WordMatches_TopWords_Nograms_Calcul...            0.98   
DocketSheet_WordMatches_TopWords_Nograms_Calcul...            0.98   

                                                    Accuracy_test  
DocketSheet_WordMatches_TopWords_Nograms_Calcul...           0.86  
DocketSheet_WordMatches_TopWords_Nograms_Calcul...           0.85  
DocketSheet_WordMatches_TopWords_Nograms_Calcul...           0.83  


  'precision', 'predicted', average, warn_for)


In [14]:
'''OBSERVATIONS

Feature_selection:      A depth of 30 appears to provide the best results. 
Precision & Recall:     The best so far has been .89 and .88. 
Stages:                 Stage 4 and 8 continue to perform the worst. 
Stage4:                 Incorrectly predicts stages 7 and 8.
                        Overlap:  only 1 word overlaps Stages 4 and 7 'Civil'. 
Stage8:                 Incorrectly predicts 4 and 5. 
                        Overlap:  There is significant overlap between 8 and 4. Three words overlap in Word Group 2, which 
                        is where the AVG is between 1-2% and CV is the highest. Two words overlap in Word Group 3, which 
                        is our contrarian word group. 

Overall                 VAR is not a good measurement for identifying the top 5 words for our stage 1 and 2 as it removes
                        the sign from our frequencies such that a word with a large negative deviation could get put into
                        this group.  If we were only grabbing the top 15 words using VAR and/or STDV we would probably be 
                        ok.  
                        AVG of all other time periods runs into issues as many of the columns may have a 0% but others 
                        could have a frequency at or higher than our target time period.  Therefore, we might think about
                        eliminating the columns with 0% when calculating our average.  This would force the code to 
                        recognize a higher average for the other time periods. 

Thoughts:               1.) Amend the Top5 selection code to calculate the AVG not using any of the Stages with 0%. 
                        2.) Revert back to the old calculation which was ((Target/Avg)*Target) as it is sign sensitive    
'''

"OBSERVATIONS\n\nFeature_selection:      A depth of 30 appears to provide the best results. \nPrecision & Recall:     The best so far as been .89 and .88. \nStages:                 Stage 4 and 8 continue to perform the worst. \nStage4:                 Incorrectly predicts stages 7 and 8.\n                        Overlap:  only 1 word overlaps Stages 4 and 7 'Civil'. \nStage8:                 Incorrectly predicts 4 and 5. \n                        Overlap:  There is significant overlap between 8 and 4. Three words overlap in Word Group 2, which \n                        is where the AVG is between 1-2% and CV is the highest. Two words overlap in Word Group 3, which \n                        is our contrarian word group. \n\nOverall                 VAR is not a good measurement for identifying the top 5 words for our stage 1 and 2 as it removes\n                        the sign from our frequencies such that a word with a large negative deviation could get put into\n                     