## N-gram model to form a probability distribution of words occuring within the different categories 

In [0]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk 
import pickle
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:

def clean_data(df):
    # Get rid of documents where the text is null
    indices_notNull = df.index[pd.notnull(df['document_text'])]
    df = df.loc[indices_notNull]  # Pandas return 1 indexing, so use loc instead of iloc

    # Delete rows with no incident date
    indices_notNull = df.index[pd.notnull(df['incident_date'])]
    df = df.loc[indices_notNull]

    # Delete rows with null entries in categories

    # Divide document into two sections, before 2015 and 2015 onwards
    df['incident_date'] = pd.to_datetime(df['incident_date'])
    df['incident_date'] = pd.DatetimeIndex(df['incident_date']).year
    df_b2015 = df[df['incident_date'] < 2015]
    df_a2015 = df[df['incident_date'] >= 2015]

    return df_b2015, df_a2015



In [0]:

def divide_by_category(df):
    '''
    This function will divide the data into 11 categories by storing their indices
    :param df: The document tags
    :return:
    '''
    columns_of_interest = [
        'nudity_penetration', 'sexual_harassment_remarks',
       'sexual_humiliation_extortion_or_sex_work', 'tasers', 'trespass',
       'racial_slurs', 'planting_drugs_guns', 'neglect_of_duty',
       'refuse_medical_assistance', 'irrational_aggressive_unstable',
       'searching_arresting_minors']


    category_map = {}
    # Initialize the dictionary with column names
    for col in df.columns:
        if col in columns_of_interest:
            indices = df.index[pd.notnull(df[col])]
            indices_true = df.index[df[col] == True]
            print("Number of labelled documents for {} are {}".format(col, len(indices)))
            print("Number of true labels for {} are {}".format(col, len(indices_true)))
            category_map[col] = indices_true

    return category_map


In [0]:

def generate_words(doc_text):
    stop_words = set(stopwords.words('english'))
    tokenized_words = word_tokenize(doc_text)

    key_words = ['narrative', 'alleged', 'allege ', 'allogos']

    words = [word.lower() for word in tokenized_words if word.lower() not in stop_words and word.isalpha()]
    total_words = len(words)
    # Perform a sliding window across words near the key words
    final_words = []
    window_size = 50
    for word in key_words:
        if word in words:
            start_index = words.index(word)+1  # We need the word after the key word, hence +1

            # Add next n words to final_words
            if start_index + window_size < total_words:
                temp = words[start_index: start_index + window_size]
            else:
                temp = words[start_index: ]

            final_words.extend(temp)

    if final_words:
        # Take only the unique words
        final_words = list(set(final_words))
        adjacentWords = zip(final_words[0:], final_words[1:], final_words[2:])

    return final_words


In [0]:

def generate_prob_dict(df):
    '''
    This function will be called again and again for every category
    So we will be generating 11 dictionaries in the end.
    :param df:
    :return:
    '''
    word_dict = {}
    prob_dict = {}
    counter = 0
    for text in df['document_text']:
        if counter % 5 == 0:
            print("Processed ", counter, "records")
        final_words = generate_words(doc_text=text)
        if final_words:
            adjacentWords = zip(final_words[0: ], final_words[1: ])

            for cur, next_word in adjacentWords:
                # print(cur, next_word)
                # The cartesian product results in tuples having same words. So, ignore those.
                if cur != next_word:
                    # Format current_word = {next_word_word: number_of_occurrences}

                    # If the current word(first) doesn't exist in word dist, then create an entry
                    if cur not in word_dict:
                        word_dict[cur] = {next_word: 1}

                    # If the current word exists but not the adjacent, then create an entry for it
                    elif next_word not in word_dict[cur]:
                        word_dict[cur][next_word] = 1

                    # If word and next_word word exists, increase the count
                    else:
                        word_dict[cur][next_word] += 1

        counter += 1

    # Create the probability distribution from the count
    for cur, next_words in word_dict.items():
        prob_dict[cur] = {}
        total_count = sum(next_words.values())
        for w in next_words:
            prob = next_words[w] / total_count
            prob_dict[cur][w] = prob

    print("Length of probability dictionary", len(prob_dict))

    return prob_dict



In [0]:
# Save the document data in a csv file
# load_all_tags()
# Load the saved data
df_doc = pd.read_csv('https://raw.githubusercontent.com/Omkar-Ranadive/CS496-DSS/master/data/document_tags.csv')
df_b2015, df_a2015 = clean_data(df_doc)
print("Before 2015")
cmap_b2015 = divide_by_category(df_b2015)
# print(cmap_b2015)
print("After 2015")
cmap_a2015 = divide_by_category(df_a2015)
models_b2015 = []
models_a2015 = []
model_names = [
      'nudity_penetration', 'sexual_harassment_remarks',
      'sexual_humiliation_extortion_or_sex_work', 'tasers', 'trespass',
      'racial_slurs', 'planting_drugs_guns', 'neglect_of_duty',
      'refuse_medical_assistance', 'irrational_aggressive_unstable',
      'searching_arresting_minors']
# probd = generate_prob_dict(df_b2015.loc[cmap_b2015['nudity_penetration']])
# Generate the probability distributions for each class before 2015
print("--Before 2015--")
for key in model_names:
    print("Generating distribution for: ", key)
    prob_dict = generate_prob_dict(df_b2015.loc[cmap_b2015[key]])
    models_b2015.append(prob_dict)

print("---After 2015---")
# Do the same for data after 2015
for key in model_names:
    print("Generating distribution for: ", key)
    prob_dict = generate_prob_dict(df_a2015.loc[cmap_a2015[key]])
    models_a2015.append(prob_dict)


Before 2015
Number of labelled documents for nudity_penetration are 296
Number of true labels for nudity_penetration are 3
Number of labelled documents for sexual_harassment_remarks are 296
Number of true labels for sexual_harassment_remarks are 4
Number of labelled documents for sexual_humiliation_extortion_or_sex_work are 296
Number of true labels for sexual_humiliation_extortion_or_sex_work are 4
Number of labelled documents for tasers are 297
Number of true labels for tasers are 114
Number of labelled documents for trespass are 296
Number of true labels for trespass are 57
Number of labelled documents for racial_slurs are 296
Number of true labels for racial_slurs are 25
Number of labelled documents for planting_drugs_guns are 296
Number of true labels for planting_drugs_guns are 24
Number of labelled documents for neglect_of_duty are 297
Number of true labels for neglect_of_duty are 14
Number of labelled documents for refuse_medical_assistance are 296
Number of true labels for ref

Now, the results of the different probability distributions can be viewed below. Type the numbers as asked in the prompt. 

In [0]:
 while True:
        print("Select years: ")
        print("1. Before 2015")
        print("2. From 2015")
        print("3. Press 3 to terminate")
        year = int(input())
        if year == 3: 
          break 
        print("Select model: ")
        for index, name in enumerate(model_names):
            print('{}. {}'.format(index, name))

        model_num = int(input())
        if year == 1:
            prob_dist = models_b2015[model_num]
        else:
            prob_dist = models_a2015[model_num]
        
        for key, value in prob_dist.items():
          print(key, value)


Select years: 
1. Before 2015
2. From 2015
3. Press 3 to terminate
1
Select model: 
0. nudity_penetration
1. sexual_harassment_remarks
2. sexual_humiliation_extortion_or_sex_work
3. tasers
4. trespass
5. racial_slurs
6. planting_drugs_guns
7. neglect_of_duty
8. refuse_medical_assistance
9. irrational_aggressive_unstable
10. searching_arresting_minors
0
cer {'approximately': 1.0}
approximately {'observed': 0.5, 'drugs': 0.5}
observed {'page': 1.0}
page {'three': 1.0}
three {'police': 1.0}
police {'ground': 0.5, 'utreras': 0.5}
ground {'around': 1.0}
around {'standards': 1.0}
standards {'summary': 1.0}
summary {'custody': 1.0}
custody {'iwas': 1.0}
iwas {'station': 1.0}
station {'rizzi': 1.0}
rizzi {'statement': 1.0}
statement {'raymond': 1.0}
raymond {'report': 1.0}
report {'beat': 1.0}
beat {'unit': 0.5, 'tified': 0.5}
unit {'powder': 1.0}
powder {'arrest': 1.0}
arrest {'taken': 1.0}
taken {'november': 1.0}
november {'leached': 1.0}
leached {'suspect': 1.0}
suspect {'searched': 1.0}
se