In [1]:
'''DOCUMENTATION

Purpose:        The purpose of the get_Ngram_freq_dist_by_stage function is to obtain the Ngram frequency distribution for 
                each of the predefined stages of the docket sheet document. 
           
User Options:   The user may choose from 4 types of Ngrams: 'Nograms', 'Bigrams', 'Trigrams', and 'Quadgrams'. 
                In addition, the user may choose to calculate either the absolute frequency of the Ngram, which is the 
                number of times that Ngram appeared in the stage, or the frequency of the Ngram divided by the number of 
                rows in that stage, i.e. how often on average the Ngram appears in each row of the stage. 

Output:         A dataframe whose index is comprised of the Ngrams, whose columns are the stages, and whose values are 
                computed according to the two aforementioned user options. 
'''

"DOCUMENTATION\n\nPurpose:        The purpose of the get_Ngram function is to obtain the Ngram frequency distribution for each of the predefined \n                stages of the docketseet document. \n           \nUser Options    The user may chose from 4 types of Ngrams 'Nograms', 'Bigrams', 'Trigrams', and 'Quadgrams'. \n                In addition, the user may chose to calculate the absolute frequency of the Ngram, which is the number of \n                times that word appeared in the stage, or, the frequency of the Ngram divided by the number of rows in that\n                stage, providing a calculation for how often on average the Ngram appears in each row of the stage. \n\nOutput:         A dataframe whos index is comprised of the Ngrams, columns the ll stages and content a combination of either\n                of the two aforementioned user options. \n"

In [2]:
# IMPORT LIBRARIES

In [3]:
import os
import re
import nltk
import pandas as pd
import string

In [4]:
# IMPORT MODULES

In [5]:
# Switch the working directory to the project's Modules folder so that the
# project-local module on the next line can be imported by bare name.
# NOTE(review): this is a raw string, so each `\\` is kept as two literal
# backslashes (the other paths in this notebook use single backslashes).
# NOTE(review): the path is an absolute, machine-specific location; the notebook
# will not run elsewhere without editing it.
os.chdir(r'C:\\Users\\Chris.Cirelli\\Desktop\\Python Programming Docs\\GSU\\Sprint Project\\Docket-Sheet-Classification\\Modules')
# Project helper module; the function below uses its import, tokenize, Ngram, and Excel helpers.
import Step1_Module_Ngrams_FreqDist as stp1_Ngrams

In [6]:
# IMPORT DOCKET SHEET WITH PRE-CLASSIFIED TIME PERIODS

In [7]:
# Absolute path of the Excel workbook that is later passed as the first argument
# to get_Ngram_freq_dist_by_stage.
# NOTE(review): machine-specific path; adjust before running on another computer.
Docket_sheet_file = r'C:\Users\Chris.Cirelli\Desktop\Python Programming Docs\GSU\Sprint Project\Docket-Sheet-Classification\Data_Files_applicable_all_code\DocketSheet Classification_70_02.22.2018.xlsx'

In [8]:
def get_Ngram_freq_dist_by_stage(Docket_sheet, Ngram_type = 'Nograms', Calculation_type = 'Frequency_distribution', 
                                To_excel = False, Location = None):
    '''Build the Ngram frequency distribution for every stage of a docket sheet.

    Parameters
    ----------
    Docket_sheet :      Passed unchanged to stp1_Ngrams.import_docket_sheet_file, which must
                        return a dataframe with a 'Time Period' column (the stage label).
    Ngram_type :        Forwarded to stp1_Ngrams.get_Ngrams and stp1_Ngrams.create_Ngram_column
                        (the notebook documentation lists 'Nograms', 'Bigrams', 'Trigrams'
                        and 'Quadgrams').
    Calculation_type :  'Average_appearance' divides each stage's Ngram counts by the number
                        of rows in that stage; any other value keeps the raw counts.
    To_excel :          When truthy, the result is also handed to stp1_Ngrams.write_to_excel.
    Location :          Target directory for the Excel file; required when To_excel is set.
                        The working directory is changed to it before writing.

    Returns
    -------
    Whatever stp1_Ngrams.create_Ngram_column returns for the merged frame
    (index = Ngrams, one column per stage, NaN where an Ngram is absent from a stage).

    Notes
    -----
    Stages are processed in ascending order and rows whose 'Time Period' is missing are
    ignored.  The first stage processed seeds the result regardless of its label, so a
    sheet that contains both stage 0 and stage 1 (or that does not start at stage 1)
    is handled correctly.
    '''
    # Import the docketsheet as a dataframe and reshape.
    df_Master_DocketSheet_File = stp1_Ngrams.import_docket_sheet_file(Docket_sheet)

    # Distinct stage labels in a deterministic order.  Missing labels are dropped:
    # NaN never compares equal to itself, so such rows cannot be selected by stage anyway.
    Stages = sorted(set(df_Master_DocketSheet_File['Time Period'].dropna()))

    # Accumulator for the merged frequency distribution; None until the first stage is done.
    df_Freq_Dist = None

    for stage in Stages:

        # Print Progress
        print('Creating the frequency distribution for stage =>', stage, '\n')

        # Keep only the rows of the docket sheet that belong to the current stage.
        delimiter = df_Master_DocketSheet_File['Time Period'] == stage
        df_limited = df_Master_DocketSheet_File[delimiter]

        # Ngram -> number of occurrences within this stage.
        Ngram_dictionary = {}

        for row in df_limited.itertuples():
            # itertuples() puts the index at position 0, so row[4] is the 4th column.
            # NOTE(review): assumes the 4th column holds the docket text - confirm
            # against stp1_Ngrams.import_docket_sheet_file.
            text = row[4]

            clean_tokenized_text = stp1_Ngrams.clean_andTokenize_text(text)
            Ngrams = stp1_Ngrams.get_Ngrams(clean_tokenized_text, Ngram_type)

            for ngram in Ngrams:
                Ngram_dictionary[ngram] = Ngram_dictionary.get(ngram, 0) + 1

        # One column (named after the stage) indexed by Ngram.
        df_stage = pd.DataFrame(Ngram_dictionary, index = [stage]).transpose()

        # Average appearance = count / number of rows in the stage.
        if Calculation_type == 'Average_appearance':
            df_stage = df_stage / len(df_limited)

        # The first stage seeds the result; later stages are outer-joined on the Ngram index.
        if df_Freq_Dist is None:
            df_Freq_Dist = df_stage
        else:
            df_Freq_Dist = df_Freq_Dist.merge(df_stage, how = 'outer', left_index = True, right_index = True)

    # No usable stage at all: hand an empty frame (rather than a placeholder string) downstream.
    if df_Freq_Dist is None:
        df_Freq_Dist = pd.DataFrame()

    # Transform Dataframe - Create Ngram Column
    df_final = stp1_Ngrams.create_Ngram_column(df_Freq_Dist, Ngram_type)

    # Write to Excel
    if To_excel:
        print('Writing dataframe to Excel')
        os.chdir(Location)
        File_name = 'Docketsheet_FreqDist' + '_' + Ngram_type + '_' + Calculation_type
        stp1_Ngrams.write_to_excel(df_final, Location, File_name)
        print('Your file has been saved to:  ', Location)

    return df_final
            
    

In [9]:
# Directory handed to get_Ngram_freq_dist_by_stage as `Location` when exporting to Excel.
# NOTE(review): machine-specific absolute path.
Location = r'C:\Users\Chris.Cirelli\Desktop\Python Programming Docs\GSU\Sprint Project\Docket-Sheet-Classification\Result_Ngrams'


In [14]:
# Quadgram distribution per stage, expressed as average appearances per row,
# and also exported to Excel in `Location`.
Ngram_freq_dist = get_Ngram_freq_dist_by_stage(Docket_sheet_file, 'Quadgrams', 'Average_appearance', True, Location)

Creating the frequency distribution for stage => 1.0 

Creating the frequency distribution for stage => 2.0 

Creating the frequency distribution for stage => 3.0 

Creating the frequency distribution for stage => 4.0 

Creating the frequency distribution for stage => 5.0 

Creating the frequency distribution for stage => 6.0 

Creating the frequency distribution for stage => 7.0 

Creating the frequency distribution for stage => 8.0 

Creating the frequency distribution for stage => 9.0 

Creating the frequency distribution for stage => 10.0 

Creating the frequency distribution for stage => 11.0 

entered loop
Writing dataframe to Excel
Your file has been saved to:   C:\Users\Chris.Cirelli\Desktop\Python Programming Docs\GSU\Sprint Project\Docket-Sheet-Classification\Result_Ngrams


In [15]:
# Preview the first rows of the result.
Ngram_freq_dist.head()

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0
0,"(abacu, corpor, attach, text)",,,,,0.001684,,,,,,
1,"(abacu, corpor, discoveri, end)",,0.008403,,,,,,,,,
2,"(abacu, corpor, first, interrogatori)",,,,,0.003367,,,,,,
3,"(abacu, corpor, lasai, brown)",,,,,0.001684,,,,,,
4,"(accord, frap, usca, mandat)",,,,,,,,,,0.054054,
