In [1]:
'''DOCUMENTATION

Purpose:        The purpose of the get_Ngram_freq_dist_by_stage function is to obtain the Ngram frequency distribution
                for each of the predefined stages of the docket sheet document.
           
User Options    Ngrams:       The user may choose from 4 types of Ngrams: 'Nograms', 'Bigrams', 'Trigrams', and 'Quadgrams'.
                Calculation:  In addition, the user may choose to calculate either the absolute frequency of the Ngram,
                              which is the number of times that Ngram appeared in the stage, or the frequency of the
                              Ngram divided by the number of rows in that stage, which gives how often, on average,
                              the Ngram appears in each row of the stage.

Output:         A dataframe whose index is comprised of the Ngrams, whose columns are the stages, and whose content is
                determined by the two aforementioned user options.
'''

"DOCUMENTATION\n\nPurpose:        The purpose of the get_Ngram function is to obtain the Ngram frequency distribution for each of the predefined \n                stages of the docketseet document. \n           \nUser Options    Ngrams:       The user may chose from 4 types of Ngrams 'Nograms', 'Bigrams', 'Trigrams', and 'Quadgrams'. \n                Calculation:  In addition, the user may chose to calculate the absolute frequency of the Ngram, which is \n                              the number of times that word appeared in the stage, or, the frequency of the Ngram divided \n                              by the number of rows in that stage, providing a calculation for how often on average \n                              the Ngram appears in each row of the stage. \n\nOutput:         A dataframe whos index is comprised of the Ngrams, columns the ll stages and content a combination of either\n                of the two aforementioned user options. \n"

In [2]:
# IMPORT LIBRARIES

In [3]:
import os
import re
import nltk
import pandas as pd
import string

In [4]:
# IMPORT MODULES

In [5]:
# Switch the working directory to the folder holding the project helper module,
# presumably so the bare import below can resolve it -- TODO confirm.
# NOTE(review): the path is machine-specific, and the working directory stays
# changed for every later cell.
os.chdir('/home/ccirelli2/Desktop/Docket-Sheet-Classification-v2/Modules')
import Step1_Module_Ngrams_FreqDist_version4_Ngrams as stp1_Ngrams

In [6]:
# IMPORT DOCKET SHEET WITH PRE-CLASSIFIED TIME PERIODS

In [7]:
# Absolute (machine-specific) path of the pre-classified docket sheet workbook.
Docket_sheet_file = r'/home/ccirelli2/Desktop/Docket-Sheet-Classification-DataFiles/Docketsheet_Data_04_11_18/Preclassified Data/Pre-Classified_TimePeriods.xlsx'
# Load the workbook here only to inspect its column names; the frequency
# distribution call further down is handed the path, not this dataframe.
df = pd.read_excel(Docket_sheet_file)
print(df.columns)

Index(['Case Number', 'Activity Date', 'Activity Number', 'docket_text',
       'Index ', 'Relevant', 'Time_Period', 'Filler'],
      dtype='object')


In [8]:
def get_Ngram_freq_dist_by_stage(Docket_sheet, Ngram_type = 'Nograms', 
                                 Calculation_type = 'Average_appearance', 
                                 To_excel = False, Location = None):
    '''Build an Ngram frequency table with one column per docket sheet stage.

    The docket sheet is loaded through stp1_Ngrams.import_docket_sheet_file and
    split by the distinct values of its 'Time_Period' column. For every stage
    the text of each row is cleaned/tokenized and converted to Ngrams by the
    stp1_Ngrams helpers, the Ngrams are counted, and the per-stage counts are
    outer-joined on the Ngram so that each stage becomes one column.

    Parameters
    ----------
    Docket_sheet : passed unchanged to stp1_Ngrams.import_docket_sheet_file
        (the notebook passes the path of the Excel workbook).
    Ngram_type : str, default 'Nograms'
        Passed unchanged to stp1_Ngrams.get_Ngrams and
        stp1_Ngrams.create_Ngram_column.
    Calculation_type : str, default 'Average_appearance'
        'Frequency_distribution' keeps the raw count of each Ngram per stage;
        'Average_appearance' divides that count by the number of rows in the
        stage.
    To_excel : bool, default False
        When true, the result is also handed to stp1_Ngrams.write_to_excel.
    Location : str or None
        Output directory; required when To_excel is true.

    Returns
    -------
    The object returned by stp1_Ngrams.create_Ngram_column after its missing
    values have been replaced with 0 (Ngrams absent from a stage count as 0).

    Raises
    ------
    ValueError
        If Calculation_type is not one of the two supported options, or if
        To_excel is requested without a Location.
    '''

    # Fail fast on unusable options, before any expensive work is done.
    valid_calculations = ('Average_appearance', 'Frequency_distribution')
    if Calculation_type not in valid_calculations:
        raise ValueError('Calculation_type must be one of ' + str(valid_calculations)
                         + ', got ' + repr(Calculation_type))
    if To_excel and Location is None:
        raise ValueError('Location must be provided when To_excel is True')

    # Import the docketsheet as a dataframe and reshape.
    df_Master_DocketSheet_File = stp1_Ngrams.import_docket_sheet_file(Docket_sheet)

    # Distinct time periods (stages) of the lawsuits. A set has no defined
    # order, so sort the labels to get a deterministic column order; labels
    # that cannot be compared with each other are used in arbitrary order.
    stage_labels = set(df_Master_DocketSheet_File['Time_Period'])
    try:
        Stages = sorted(stage_labels)
    except TypeError:
        Stages = list(stage_labels)

    # Accumulator for the merged table; None until the first stage is done.
    # (Testing the accumulator, rather than the stage label, keeps this
    # correct whatever the labels are and whichever stage comes first.)
    df_Freq_Dist = None

    # Iterate over each stage in the Docket_sheet
    for stage in Stages:

        # Print Progress
        print('Creating the frequency distribution for stage =>', stage, '\n')

        # Keep only the rows of the docketsheet that belong to this stage.
        delimiter = df_Master_DocketSheet_File['Time_Period'] == stage
        df_limited = df_Master_DocketSheet_File[delimiter]

        # Count every Ngram found in the rows of this stage.
        Ngram_dictionary = {}
        for row in df_limited.itertuples():

            # Position 0 of the tuple is the index, so 4 is the fourth column.
            # NOTE(review): assumes the fourth column of the imported frame
            # holds the docket text -- confirm against import_docket_sheet_file.
            text = row[4]

            # Clean & Tokenize the text
            clean_tokenized_text = stp1_Ngrams.clean_andTokenize_text(text)

            # Get Ngrams and tally them
            for ngram in stp1_Ngrams.get_Ngrams(clean_tokenized_text, Ngram_type):
                Ngram_dictionary[ngram] = Ngram_dictionary.get(ngram, 0) + 1

        # One-column frame: index = Ngram, column = stage, value = raw count.
        df_stage = pd.DataFrame(Ngram_dictionary, index = [stage]).transpose()

        # For the average appearance, divide by the number of rows in the stage.
        if Calculation_type == 'Average_appearance':
            df_stage = df_stage / len(df_limited)

        # The first stage starts the table; later stages are outer-joined on
        # the Ngram so that Ngrams missing from either side are kept.
        if df_Freq_Dist is None:
            df_Freq_Dist = df_stage
        else:
            df_Freq_Dist = df_Freq_Dist.merge(df_stage, how = 'outer',
                                              left_index = True, right_index = True)

    # No stages at all: hand an empty frame to the helper.
    # NOTE(review): how create_Ngram_column treats an empty frame is not
    # visible from here.
    if df_Freq_Dist is None:
        df_Freq_Dist = pd.DataFrame()

    # Transform Dataframe - Create Ngram Column
    df_transform = stp1_Ngrams.create_Ngram_column(df_Freq_Dist, Ngram_type)

    # Replace Nan values with 0
    df_final = df_transform.fillna(value = 0)

    # Write to Excel
    if To_excel:
        print('Writing dataframe to Excel')
        # NOTE(review): changes the process working directory as a side effect;
        # kept because write_to_excel may rely on it.
        os.chdir(Location)
        File_name = '_'.join(['Docketsheet_FreqDist', Ngram_type, Calculation_type])
        stp1_Ngrams.write_to_excel(df_final, Location, File_name)
        print('Your file has been saved to:  ', Location)

    return df_final
            
    

In [9]:
# Output folder handed to the frequency distribution call below as `Location`
# (machine-specific absolute path).
location = r'/home/ccirelli2/Desktop/Docket-Sheet-Classification-DataFiles/Docketsheet_Data_04_11_18/Ngram_FreqDist_DataFiles'


In [17]:
# Build the Quadgram table for every stage and export it to `location`.
# Calculation_type is left at its default ('Average_appearance'); uncomment the
# argument below to get raw counts instead.
Ngram_freq_dist = get_Ngram_freq_dist_by_stage(Docket_sheet_file, 
                                               Ngram_type = 'Quadgrams', 
                                               #Calculation_type = 'Frequency_distribution', 
                                               To_excel = True, 
                                               Location = location)

Creating the frequency distribution for stage => 1 

Creating the frequency distribution for stage => 2 

Creating the frequency distribution for stage => 3 

Creating the frequency distribution for stage => 4 

Creating the frequency distribution for stage => 5 

Creating the frequency distribution for stage => 6 

Creating the frequency distribution for stage => 7 

Creating the frequency distribution for stage => 8 

Creating the frequency distribution for stage => 9 

Creating the frequency distribution for stage => 10 

Creating the frequency distribution for stage => 11 

Writing dataframe to Excel
Your file has been saved to:   /home/ccirelli2/Desktop/Docket-Sheet-Classification-DataFiles/Docketsheet_Data_04_11_18/Ngram_FreqDist_DataFiles
