## NLP on Flowers for Algernon


Information
<ul>
    <li>https://en.wikipedia.org/wiki/Flowers_for_Algernon</li>
</ul>

<hr>

### Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import datetime

from spellchecker import SpellChecker
spell = SpellChecker()

import textstat


# pd.set_option('display.max_rows', 500)
# nltk.download()

### Manually created list of valid words

In [2]:
# corresponds to the progress report x
valid_words_manual = dict()

valid_words_manual[1] = []
valid_words_manual[2] = ['inkblot']
valid_words_manual[3] = ['hollers']
valid_words_manual[4] = ['apperception']
valid_words_manual[5] = []
valid_words_manual[6] = []
valid_words_manual[7] = []
valid_words_manual[8] = []
valid_words_manual[9]= ['backfired', 'commas', 'photostated', 'ditched', 'halloran', 'scarves', 'twitches', 'whiter', 'valentines', 'valentine', 'dimples', 'bunches', 'teases', 'scribbles', 'caretakers', 'wf', 'ddf', 'sf', 'obj', 'inkblots']
valid_words_manual[10] = ['rubs','cringes', 'clumping', 'sta', 'bakers', 'kneads', 'smooths', 'snorts', 'rechecked', 'hallston', 'neurosurgeons', 'excites', 'dostoevski', 'flaubert', 'teardrops', 'pecking', 'untwists', 'shrieks', 'cowers', 'overwhelms', 'spanked', 'dirties']
valid_words_manual[11] = ['toyed', 'highlighted', 'swirls', 'frightens', 'outstretched', 'rubs', 'amuses', 'spanked', 'éclairs', 'éclairs', 'undercharging', 'natured', 'deliveries', 'wavered', 'shhhh', 'mazes', 'undercharging', 'etymologies', 'blacklisting', 'ii', 'wessey', 'ii', 'trobriand', 'debussy', 'browned', 'dopes', 'cheeked', 'howells', 'swishes', 'writhes', 'forgets', 'cheeked']
valid_words_manual[12] = ['cupids', 'blurts', 'shrieks', 'glares', 'shrieks', 'fades', 'dorni', 'wheelchair', 'smartest', 'wheelchair', 'francine', 'trainees', 'variants', 'boulean', 'dawned', 'widening', 'splintering', 'simmering', 'zagging', 'outcropping', 'crawlers']
valid_words_manual[13] = ['strato', 'uhm', 'vibrations', 'vibrations', 'twitches', 'ushers', 'glares', 'pursing', 'ushering', 'psychosubstantiation', 'pats', 'cringes', 'cringes', 'shivers', 'dirties', 'shortwave', 'encephalo', 'smartest', 'flushes', 'pats', 'squeezes', 'donkeys', 'forgets', 'frightens', 'pats', 'whimpers', 'quarrelled', 'phenylketonuria', 'injections', 'revitalized', 'tanida', 'psychopathology', 'tanida', 'propounded', 'outshines', 'dumbfounded', 'tanida', 'petrology', 'banach', 'manifolds', 'motorized', 'mugged', 'jung', 'phoniness', 'euphemisms', 'handshakers', 'nodders', 'smilers', 'standees', 'zellerman', 'worfel', 'resenting', 'gawking', 'scurrying', 'minted', 'scampered', 'experimentalists', 'xiv', 'washbasins', 'midtown', 'shoebox']
valid_words_manual[14] = ['legwork', 'penetrates', 'sinks', 'surrounds', 'baghdad', 'taping', 'mealtimes', 'gateleg', 'pretzels', 'shambles', 'oozing', 'overstuffed', 'underthings', 'flinging', 'burbled', 'burglaries', 'tresses', 'uptilted', 'dilettantes', 'dilettantes', 'dolled', 'stardust', 'coffins', 'junkmobiles', 'cliché', 'mazes', 'curtsied', 'gordons', 'phinney', 'meiner', 'cheeked', 'barbershops', 'apologizing', 'wakens', 'shaves', 'stardust', 'stardust', 'cooed', 'yawned', 'weirdest', 'weirdest', 'dimwits', 'immersing', 'westerns', 'whistles', 'catcalls', 'mazel', 'tov', 'dishware', 'cowered', 'rowdier', 'abusing', 'naïveté', 'cheeked', 'crosstown', 'outstretched', 'toyed', 'brassière', 'underthings', 'landsdoff', 'photoeffect', 'befriended', 'stardust', 'recorders', 'undermines']
valid_words_manual[15] = ['disposing', 'pieced', 'injections', 'psycho']
valid_words_manual[16] = ['morons', 'cuddling', 'jangled', 'thursdays', 'lautrec', 'retardates', 'wheelchair', 'cuddling', 'mutes', 'laundries', 'bakeries', 'untidies', 'stardust', 'waltzing', 'sunbathing', 'cabarets', 'cheee', 'cheeeee', 'urinating', 'defecating', 'cheeee', 'cheeeee', 'cheeeee', 'stardust', 'panicking', 'fades', 'hyram', 'hyram', 'raynor', 'raynor', 'raynor', 'raynor', 'retardates', 'raynor', 'raynors', 'raynor', 'wiggles', 'shhhh', 'shhhh', 'spluttered', 'likeable', 'gordons', 'effacing', 'slurred', 'novocaine', 'regressed', 'syndromes', 'rechecked', 'hallston', 'undrapes', 'radioisotopes', 'convolutions', 'fissures', 'couldn', 'shhhh', 'gashed', 'fingermarks', 'coolies', 'lampshades', 'fussed', 'spanked', 'duncecap', 'morons', 'raskin', 'lampshades', 'platitudes', 'passageways', 'burdening', 'barked']
valid_words_manual[17] = ['abusing', 'encompassing', 'multipetaled', 'rainbows', 'multipetaled', 'retyped', 'mazes', 'fugues', 'humoring', 'humoring', 'kiddie', 'unclosing', 'reclaiming', 'über', 'psychische', 'ganzheit', 'mooney', 'windmills', 'sorcerers', 'windowshade', 'nov']

more_valid_words = []
for k,v in valid_words_manual.items():
    for word in v:
        more_valid_words.append(word)



In [3]:
words_words = nltk.corpus.words.words()
words_words_lower = set([i.lower() for i in words_words])

brown_words = nltk.corpus.brown.words()
brown_words_lower = set([i.lower() for i in brown_words])

valid_words = ['', 've', 'll','charlie', 'harold', 'nemur', 'alice', 'kinnian', 'algernon', 'guarino', 'beekman', 
               'welberg', 'hymie', 'rahajamati', 'jayson', 'strauss', 'burt', 'selden', 'fay', 'lillman',
              'herman', 'donner', 'frank,', 'reilly','joe', 'carp', 'gimpy', 'fanny', 'birden',
              'hilda', 'minnie', 'meyer', 'klaus', 'norma', 'matt', 'gordon', 'krueger',
              ]

valid_words.extend(more_valid_words)

all_lower_words = set.union(*[words_words_lower, brown_words_lower, valid_words])

In [4]:
print(len(words_words_lower))
print(len(brown_words_lower))
print(len(set(valid_words)))
print(len(all_lower_words))

234377
49815
294
261822


In [5]:
# nltk.word_tokenize("I couldn't stay at hadn't doesn't Dr. Neuman the party.".lower())

# print('don\'t' in all_lower_words)
# print('wasn\'t' in all_lower_words)
# print('shouldn\'t' in all_lower_words)
# print('rite' in words_words_lower)
# print('faled' in words_words_lower)
# print('smartest' in words_words_lower)
# print("prof" in words_words_lower)

### Read file

In [6]:
with open(r'Archived Files/flowers-for-algernon-2005.txt', encoding='utf-8') as reader:
    data = reader.read()

### Check if all chapters are in the text file

In [7]:
# List of the names of the chapters in the book; some chapter names are truncated
chapters = ['Progris riport 1', 'Progris riport 2', '3d progris riport',
            'Progris riport 4', 'Progris riport 5', 'Progris riport 6',
            'PROGRESS REPORT 7', 'PROGRESS REPORT 8', 'PROGRESS REPORT 9',
            'PROGRESS REPORT 10', 'PROGRESS REPORT 11', 'PROGRESS REPORT 12',
            'PROGRESS REPORT 13', 'PROGRESS REPORT 14', 'PROGRESS REPORT 15',
            'PROGRESS REPORT 16', 'PROGRESS REPORT 17']
chapters = [i.lower() for i in chapters]

# Note this is not 0-indexed (nor should it be, it's a dict)
chapters_dict = dict()
for chapter_index in range(0, len(chapters)):
    chapters_dict[chapter_index+1]= chapters[chapter_index]

# Check if all chapters are in the book
for chapter in chapters:
    current_count = (data.lower()).count(chapter)
    assert current_count == 1

### Place each chapter's text (and relevant info) into a list

In [8]:
list_of_chapters = []

for chapter in chapters_dict:
    
    start_chapter_index = (data.lower()).find(chapters_dict[chapter])
    
    if(chapter==len(chapters_dict)):
        end_chapter_index = len(data)
    else:
        end_chapter_index = (data.lower()).find(chapters_dict[chapter+1])
    
    chapter_number = chapter    
    chapter_name = chapters_dict[chapter]
    chapter_text = data[start_chapter_index:end_chapter_index]
    
    list_of_chapters.append((chapter_number, chapter_name, chapter_text))

### Define a custom tokenization function

In [9]:
## Make this a class and use inheritence/polymorphism

pattern = r'''[.|,|\-|\'|—|_|*|?|\\"|:|;|^|$|#|@|!|(|)|+|=|%|&|(\d)+]+'''
def custom_token(some_list):
    """
    Further tokenization of the text
    """
    new_list = []
    correct_abbreviations = ['dr.', 'mr.', 'mrs.', 'miss.', 'prof.']
    for word in some_list:
        match = re.search(pattern, word)
        if(match) and (word not in correct_abbreviations):
            split_list = re.sub(pattern, " ", word).strip().split(' ')
            new_list.extend(split_list)
        else:
            new_list.append(word)
    #remove empty tokens
    new_list = list(filter(None, new_list))
    
    return new_list

def custom_token_2(some_token):
    """
    Further tokenization of the a single token
    """
    correct_abbreviations = ['dr.', 'mr.', 'mrs.', 'miss.', 'prof.']

    match = re.search(pattern, some_token)
    if(match) and (some_token not in correct_abbreviations):
        split_list = re.sub(pattern, " ", some_token).strip().split(' ')
        split_list = list(filter(None, split_list))
    else:
        split_list = [some_token]
    
    return split_list

### Tokenize the words in each chapter

In [10]:
tokenized_list_of_chapters = []

for chapter in list_of_chapters:
    chapter_number = chapter[0]
    chapter_name = chapter[1]
    chapter_tokens_original = nltk.word_tokenize(chapter[2])
    chapter_tokens = nltk.word_tokenize(chapter[2].lower())
    chapter_tokens = custom_token(chapter_tokens)
    
    tokenized_list_of_chapters.append((chapter_number, chapter_name, chapter_tokens, chapter_tokens_original))

### Function to get sentence surrounding a word

In [11]:
def get_sentence(token_index: int, chapter_tokens_original: list, offset: int=5)-> str:
    # get 11 words ahead
    if(token_index-offset<0):
        start_index=0
    else:
        start_index = token_index-offset+1
        
    return " ".join(chapter_tokens_original[start_index:token_index+5])

### Get the surrounding sentence to each word; Tokenize all words (each row is a token)

In [12]:
all_words_tokenized = []
offset = 5
word_pos = 0

for chapter_info in tokenized_list_of_chapters:
    chapter_num = chapter_info[0]
    chapter_name = chapter_info[1]
    chapter_tokens = chapter_info[2]
    chapter_tokens_original = chapter_info[3]
    
    for token_index in range(0, len(chapter_tokens_original)):
        if(token_index > len(chapter_tokens_original) + offset):
            break

        sentence = get_sentence(token_index, chapter_tokens_original, offset)

        all_words_tokenized.append([chapter_num, chapter_name, word_pos, chapter_tokens_original[token_index], sentence])
        
        word_pos += 1



### Tokenize the words again (using the custom function)

In [13]:
final_list_tokenized = list()

for token_info in all_words_tokenized:
    
    chapter_number = token_info[0]
    chapter_name = token_info[1]
    word_pos = token_info[2]
    original_token = token_info[3]
    sentence = token_info[4]
    
    custom_tokens = custom_token_2(original_token.lower())
    
    for new_token in custom_tokens:
        final_list_tokenized.append((chapter_number, chapter_name, word_pos, new_token, sentence))

### Filter the final list of all words tokenized

In [14]:
# Sort based on word position
# sorted(final_list_tokenized, key=lambda x: x[2])

final_list_tokenized_with_filters = list()

for token_info in final_list_tokenized:
    word = token_info[3]
    if word not in all_lower_words:
        word_misspelled = True
    else:
        word_misspelled = False
    
    final_list_tokenized_with_filters.append((*token_info, word_misspelled))
    
final_list_tokenized_with_filters.sort(key=lambda x: x[2])


### Place into DataFrame

In [15]:
df_final = pd.DataFrame(final_list_tokenized_with_filters,
                       columns=['Chapter Number', 'Chapter Name', 'Word Position', 'Token', 'Sentence', 'Misspelled'])
df_final['Word Position'] = np.arange(1, len(df_final)+1)
df_final['Word Position in Chapter'] = df_final.groupby('Chapter Number')['Chapter Number'].rank(axis=1, method='first').astype(int)
df_final

Unnamed: 0,Chapter Number,Chapter Name,Word Position,Token,Sentence,Misspelled,Word Position in Chapter
0,1,progris riport 1,1,progris,Progris riport 1 martch 3,True,1
1,1,progris riport 1,2,riport,Progris riport 1 martch 3 Dr,True,2
2,1,progris riport 1,3,martch,Progris riport 1 martch 3 Dr Strauss says,True,3
3,1,progris riport 1,4,dr,riport 1 martch 3 Dr Strauss says I shoud,True,4
4,1,progris riport 1,5,strauss,1 martch 3 Dr Strauss says I shoud rite,False,5
...,...,...,...,...,...,...,...
89615,17,progress report 17,89616,grave,some flowrs on Algernons grave in the bak yard,False,9757
89616,17,progress report 17,89617,in,flowrs on Algernons grave in the bak yard .,False,9758
89617,17,progress report 17,89618,the,on Algernons grave in the bak yard .,False,9759
89618,17,progress report 17,89619,bak,Algernons grave in the bak yard .,True,9760


In [16]:
def get_num_of_words_chapter(chapter_number: int) -> int:
    """
    
    """
    return len(df_final[df_final['Chapter Number']==chapter_number])


def get_num_of_misspellings(chapter_number: int) -> int:
    """
    
    """
    return len(df_final[(df_final['Chapter Number']==chapter_number) & (df_final['Misspelled']==True)])
        

### Generator function to manually create valid words list

In [17]:
def get_chapter_generator():
    """
    Generator function that yields chapter number
    and mispellings in that chapter
    
    Created to manually check mispellings
    """
    for chapter_number, chapter_name in chapters_dict.items():
        misspellings = list(set(df_final[(df_final['Chapter Number']==chapter_number) & (df_final['Misspelled']==True)]['Token']))
        
        yield ((chapter_number, chapter_name), misspellings)

In [18]:
my_gen = get_chapter_generator()

In [19]:
x = next(my_gen)
print(x[0])
print(x[1])

(1, 'progris riport 1')
['perfesser', 'yrs', 'tolld', 'membir', 'evrey', 'shoud', 'compushishens', 'importint', 'beekmin', 'collidge', 'dr', 'riport', 'evrything', 'dollers', 'martch', 'yeres', 'lern', 'becaus', 'munth', 'werk', 'donners', 'kinnians', 'mabye', 'shud', 'happins', 'progris', 'anymor', 'brithday', 'retarted']


### Do a sentiment analysis on each chapter

In [20]:
def get_nrc_data():
    nrc = 'Data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt'
    count=0
    emotion_dict=dict()
    with open(nrc,'r') as f:
        all_lines = list()
        for line in f:
            if count < 46:
                count+=1
                continue
            line = line.strip().split('\t')
            if int(line[2]) == 1:
                if emotion_dict.get(line[0]):
                    emotion_dict[line[0]].append(line[1])
                else:
                    emotion_dict[line[0]] = [line[1]]
    return emotion_dict

emotion_dict = get_nrc_data()

def emotion_analyzer(text, emotion_dict=emotion_dict):
    #Set up the result dictionary
    emotions = {x for y in emotion_dict.values() for x in y}
    emotion_count = dict()
    for emotion in emotions:
        emotion_count[emotion] = 0

    #Analyze the text and normalize by total number of words
    total_words = len(text.split())
    for word in text.split():
        if emotion_dict.get(word):
            for emotion in emotion_dict.get(word):
                emotion_count[emotion] += 1/len(text.split())
    return emotion_count

### Get chapter level summary

In [21]:
data_chapter_level = []

for chapter in list_of_chapters:
    chapter_number = chapter[0]
    text = str(chapter[2].replace('\n', ''))
    fre = textstat.flesch_reading_ease(text)
    fkg = textstat.flesch_kincaid_grade(text)
    si = textstat.smog_index(text)
    cli = textstat.coleman_liau_index(text)
    ari = textstat.automated_readability_index(text)
    dcrs = textstat.dale_chall_readability_score(text)    

    lwf = textstat.linsear_write_formula(text)
    gf = textstat.gunning_fog(text)
    ts = textstat.text_standard(text)

    sentence_count = textstat.sentence_count(text)
    
    num_of_words = get_num_of_words_chapter(chapter_number)
    misspellings = get_num_of_misspellings(chapter_number)
    
    data_chapter_level.append((chapter_number, num_of_words, sentence_count, misspellings,
                              fre, fkg, si, cli, ari, dcrs, lwf, gf, ts))
    
df_chapters = pd.DataFrame(data_chapter_level, columns=['Chapter Number', 'Number of Words', 'Number of Sentences', 'Misspelled Words',
                                                        'Flesch Reading Ease', 'Flesch Grade', 'Smog Index', 
                                                        'Coleman Liau Readability', 'Automated Readability Index',
                                                        'Dale-Chall Readability', 'Linsear Write Formula',
                                                        'Gunning Fog', 'Text Standard'])

In [22]:
def apply_spell_correct(token: str, misspelled: bool) -> str:
    """
    
    """
    if(misspelled):
        return spell.correction(token)
    else:
        return token

In [None]:
df_final['Correction'] = df_final.apply(lambda x: apply_spell_correct(x['Token'], x['Misspelled']), axis=1)

In [None]:
# correct spellings
list_of_chapters_text = []
for chapter_number, chapter_name in chapters_dict.items():
    chapter_text = " ".join(list(df_final[(df_final['Chapter Number']==chapter_number)]['Correction']))
    list_of_chapters_text.append((chapter_number, chapter_name, chapter_text))


list_to_df_emotions = []
for chapter_info in list_of_chapters_text:
    list_to_df_emotions.append(emotion_analyzer(chapter_info[2]))
    
df_emotions = pd.DataFrame(list_to_df_emotions)
# add anger, surprise
df_emotions['all_pos']=(df_emotions['trust']+df_emotions['positive']+ df_emotions['joy']+ df_emotions['anticipation'])
df_emotions['all_neg']=(df_emotions['fear']+df_emotions['negative']+ df_emotions['disgust']+ df_emotions['sadness'])
df_emotions['net']=(df_emotions['all_pos']-df_emotions['all_neg'])

In [None]:
df_chapters = pd.concat([df_chapters, df_emotions], axis=1, sort=False)
df_chapters

### Export to csv

In [None]:
df_final.to_csv('Data/Flowers for Algernon - All word tokens.csv')
df_chapters.to_csv('Data/Flowers for Algernon - Chapter summaries.csv')

In [None]:
# .generate
# generate random text using a trigram model
# https://stackoverflow.com/questions/1150144/generating-random-sentences-from-custom-text-in-pythons-nltk