## NLP on Flowers for Algernon

<hr>

### Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import datetime
import csv 

from spellchecker import SpellChecker
spell = SpellChecker()

import textstat

# pd.set_option('display.max_rows', 500)
# nltk.download()

### Manually created list of valid words
Read a text file with a list of (additional) valid words

In [4]:
def return_words(file_name):
    """
    Returns a list of words by reading
    a text file.  
    
    Each word should be on a new row in the text
    file.
    
    Parameters
    ----------
    file_name : str
        The file name of the text file to read.
        
    Returns
    -------
    list
        A list of the words.  
    """
    words = []
    
    with open(file_name, 'r') as fp:
    # Read each row of the file
        reader = csv.reader(fp)
        for row in reader:
            words.extend(row)    
    
    return words

In [5]:
# The valid-words'manual.txt contains words that are valid.
more_valid_words = return_words('../Data/valid-words-manual.txt')

### Create list of valid (lowercase) words
From the following:
* Brown corpus
* Words corpus
* The valid words before
* And another manually created list (Nouns)

In [6]:
words_words = nltk.corpus.words.words()
words_words_lower = set([i.lower() for i in words_words])

brown_words = nltk.corpus.brown.words()
brown_words_lower = set([i.lower() for i in brown_words])

# Other valid words - Nouns
valid_words = ['', 've', 'll','charlie', 'harold', 'nemur', 'alice', 'kinnian', 'algernon', 'guarino', 'beekman', 
               'welberg', 'hymie', 'rahajamati', 'jayson', 'strauss', 'burt', 'selden', 'fay', 'lillman',
              'herman', 'donner', 'frank,', 'reilly','joe', 'carp', 'gimpy', 'fanny', 'birden',
              'hilda', 'minnie', 'meyer', 'klaus', 'norma', 'matt', 'gordon', 'krueger',
              ]

valid_words.extend(more_valid_words)

all_lower_words = set.union(*[words_words_lower, brown_words_lower, valid_words])

In [7]:
print(len(words_words_lower))
print(len(brown_words_lower))
print(len(set(valid_words)))
print(len(all_lower_words))

234377
49815
294
261822


### Read text of the book

In [8]:
with open(r'../Archived Files/flowers-for-algernon-2005.txt', encoding='utf-8') as reader:
    data = reader.read()

### Check if all chapters are in the text file

In [9]:
# List of the names of the chapters in the book; some chapter names are truncated
chapters = ['Progris riport 1', 'Progris riport 2', '3d progris riport',
            'Progris riport 4', 'Progris riport 5', 'Progris riport 6',
            'PROGRESS REPORT 7', 'PROGRESS REPORT 8', 'PROGRESS REPORT 9',
            'PROGRESS REPORT 10', 'PROGRESS REPORT 11', 'PROGRESS REPORT 12',
            'PROGRESS REPORT 13', 'PROGRESS REPORT 14', 'PROGRESS REPORT 15',
            'PROGRESS REPORT 16', 'PROGRESS REPORT 17']
chapters = [i.lower() for i in chapters]

# Note this is not 0-indexed (nor should it be, it's a dict)
chapters_dict = dict()
for chapter_index in range(0, len(chapters)):
    chapters_dict[chapter_index+1]= chapters[chapter_index]

# Check if all chapters are in the book
for chapter in chapters:
    current_count = (data.lower()).count(chapter)
    assert current_count == 1

### Place each chapter's text (and relevant info) into a list

In [10]:
list_of_chapters = []

for chapter in chapters_dict:
    
    start_chapter_index = (data.lower()).find(chapters_dict[chapter])
    
    if(chapter==len(chapters_dict)):
        end_chapter_index = len(data)
    else:
        end_chapter_index = (data.lower()).find(chapters_dict[chapter+1])
    
    chapter_number = chapter    
    chapter_name = chapters_dict[chapter]
    chapter_text = data[start_chapter_index:end_chapter_index]
    
    list_of_chapters.append((chapter_number, chapter_name, chapter_text))

### Define a custom tokenization function

In [11]:
## Make this a class and use inheritence/polymorphism

pattern = r'''[.|,|\-|\'|—|_|*|?|\\"|:|;|^|$|#|@|!|(|)|+|=|%|&|(\d)+]+'''
def custom_token(some_list):
    """
    Further tokenization of the text
    """
    new_list = []
    correct_abbreviations = ['dr.', 'mr.', 'mrs.', 'miss.', 'prof.']
    for word in some_list:
        match = re.search(pattern, word)
        if(match) and (word not in correct_abbreviations):
            split_list = re.sub(pattern, " ", word).strip().split(' ')
            new_list.extend(split_list)
        else:
            new_list.append(word)
    #remove empty tokens
    new_list = list(filter(None, new_list))
    
    return new_list

def custom_token_2(some_token):
    """
    Further tokenization of the a single token
    """
    correct_abbreviations = ['dr.', 'mr.', 'mrs.', 'miss.', 'prof.']

    match = re.search(pattern, some_token)
    if(match) and (some_token not in correct_abbreviations):
        split_list = re.sub(pattern, " ", some_token).strip().split(' ')
        split_list = list(filter(None, split_list))
    else:
        split_list = [some_token]
    
    return split_list

### Tokenize the words in each chapter

In [12]:
tokenized_list_of_chapters = []

for chapter in list_of_chapters:
    chapter_number = chapter[0]
    chapter_name = chapter[1]
    chapter_tokens_original = nltk.word_tokenize(chapter[2])
    chapter_tokens = nltk.word_tokenize(chapter[2].lower())
    chapter_tokens = custom_token(chapter_tokens)
    
    tokenized_list_of_chapters.append((chapter_number, chapter_name, chapter_tokens, chapter_tokens_original))

### Function to get sentence surrounding a word

In [None]:
def get_sentence(token_index: int, chapter_tokens_original: list, offset: int=5) -> str:
    """
    Returns the The surrounding words 
    before and after the token.
    
    Parameters
    ----------
    token_index : int
        The index of the token.

    chapter_tokens_original : list
        The tokens in the chapter.

    offset : int, default=5
        The number of words to include before and after the token_index.

    Returns
    -------
    str
        The surrounding words before and after the token. 
    """
    # get 11 words ahead
    if (token_index - offset < 0):
        start_index = 0
    else:
        start_index = token_index - offset + 1
        
    return " ".join(chapter_tokens_original[start_index:token_index+5])

### Get the surrounding sentence to each word; Tokenize all words (each row is a token)

In [None]:
all_words_tokenized = []
offset = 5
word_pos = 0

for chapter_info in tokenized_list_of_chapters:
    chapter_num = chapter_info[0]
    chapter_name = chapter_info[1]
    chapter_tokens = chapter_info[2]
    chapter_tokens_original = chapter_info[3]
    
    for token_index in range(0, len(chapter_tokens_original)):
        if(token_index > len(chapter_tokens_original) + offset):
            break

        sentence = get_sentence(token_index, chapter_tokens_original, offset)

        all_words_tokenized.append([chapter_num, chapter_name, word_pos, chapter_tokens_original[token_index], sentence])
        
        word_pos += 1



### Tokenize the words again (using the custom function)

In [None]:
final_list_tokenized = list()

for token_info in all_words_tokenized:
    
    chapter_number = token_info[0]
    chapter_name = token_info[1]
    word_pos = token_info[2]
    original_token = token_info[3]
    sentence = token_info[4]
    
    custom_tokens = custom_token_2(original_token.lower())
    
    for new_token in custom_tokens:
        final_list_tokenized.append((chapter_number, chapter_name, word_pos, new_token, sentence))

### Filter the final list of all words tokenized

In [None]:
# Sort based on word position
# sorted(final_list_tokenized, key=lambda x: x[2])

final_list_tokenized_with_filters = list()

for token_info in final_list_tokenized:
    word = token_info[3]
    if word not in all_lower_words:
        word_misspelled = True
    else:
        word_misspelled = False
    
    final_list_tokenized_with_filters.append((*token_info, word_misspelled))
    
final_list_tokenized_with_filters.sort(key=lambda x: x[2])


### Place into DataFrame

In [None]:
df_final = pd.DataFrame(final_list_tokenized_with_filters,
                       columns=['Chapter Number', 'Chapter Name', 'Word Position', 'Token', 'Sentence', 'Misspelled'])
df_final['Word Position'] = np.arange(1, len(df_final)+1)
df_final['Word Position in Chapter'] = df_final.groupby('Chapter Number')['Chapter Number'].rank(axis=1, method='first').astype(int)
df_final

In [None]:
def get_num_of_words_chapter(chapter_number: int) -> int:
    """
    Returns the number of words in a chapter.
    """
    return len(df_final[df_final['Chapter Number']==chapter_number])


def get_num_of_misspellings(chapter_number: int) -> int:
    """
    Returns the number of misspelled words in a chapter.
    """
    return len(df_final[(df_final['Chapter Number']==chapter_number) & (df_final['Misspelled']==True)])
        

### Generator function to manually create valid words list

In [None]:
def get_chapter_generator():
    """
    Generator function that yields chapter number
    and mispellings in that chapter
    
    Created to manually check mispellings
    """
    for chapter_number, chapter_name in chapters_dict.items():
        misspellings = list(set(df_final[(df_final['Chapter Number']==chapter_number) & (df_final['Misspelled']==True)]['Token']))
        
        yield ((chapter_number, chapter_name), misspellings)

In [None]:
my_gen = get_chapter_generator()

In [None]:
# x = next(my_gen)
# print(x[0])
# print(x[1])

### Do a sentiment analysis on each chapter

In [None]:
def get_nrc_data():
    """
    
    """
    nrc = '../Data/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt'
    count=0
    emotion_dict=dict()
    with open(nrc,'r') as f:
        all_lines = list()
        for line in f:
            if count < 46:
                count+=1
                continue
            line = line.strip().split('\t')
            if int(line[2]) == 1:
                if emotion_dict.get(line[0]):
                    emotion_dict[line[0]].append(line[1])
                else:
                    emotion_dict[line[0]] = [line[1]]
    return emotion_dict

emotion_dict = get_nrc_data()

def emotion_analyzer(text, emotion_dict=emotion_dict):
    """
    """
    #Set up the result dictionary
    emotions = {x for y in emotion_dict.values() for x in y}
    emotion_count = dict()
    for emotion in emotions:
        emotion_count[emotion] = 0

    #Analyze the text and normalize by total number of words
    total_words = len(text.split())
    for word in text.split():
        if emotion_dict.get(word):
            for emotion in emotion_dict.get(word):
                emotion_count[emotion] += 1/len(text.split())
    return emotion_count

### Get chapter level summary

In [None]:
data_chapter_level = []

for chapter in list_of_chapters:
    chapter_number = chapter[0]
    text = str(chapter[2].replace('\n', ''))
    fre = textstat.flesch_reading_ease(text)
    fkg = textstat.flesch_kincaid_grade(text)
    si = textstat.smog_index(text)
    cli = textstat.coleman_liau_index(text)
    ari = textstat.automated_readability_index(text)
    dcrs = textstat.dale_chall_readability_score(text)    

    lwf = textstat.linsear_write_formula(text)
    gf = textstat.gunning_fog(text)
    ts = textstat.text_standard(text)

    sentence_count = textstat.sentence_count(text)
    
    num_of_words = get_num_of_words_chapter(chapter_number)
    misspellings = get_num_of_misspellings(chapter_number)
    
    data_chapter_level.append((chapter_number, num_of_words, sentence_count, misspellings,
                              fre, fkg, si, cli, ari, dcrs, lwf, gf, ts))
    
df_chapters = pd.DataFrame(data_chapter_level, columns=['Chapter Number', 'Number of Words', 'Number of Sentences', 'Misspelled Words',
                                                        'Flesch Reading Ease', 'Flesch Grade', 'Smog Index', 
                                                        'Coleman Liau Readability', 'Automated Readability Index',
                                                        'Dale-Chall Readability', 'Linsear Write Formula',
                                                        'Gunning Fog', 'Text Standard'])

In [None]:
def apply_spell_correct(token: str, misspelled: bool) -> str:
    """
    Returns the best guess at the corrected word given a word.
    """
    if(misspelled):
        return spell.correction(token)
    else:
        return token

In [None]:
df_final['Correction'] = df_final.apply(lambda x: apply_spell_correct(x['Token'], x['Misspelled']), axis=1)

In [None]:
# correct spellings
list_of_chapters_text = []
for chapter_number, chapter_name in chapters_dict.items():
    chapter_text = " ".join(list(df_final[(df_final['Chapter Number']==chapter_number)]['Correction']))
    list_of_chapters_text.append((chapter_number, chapter_name, chapter_text))

list_to_df_emotions = []
for chapter_info in list_of_chapters_text:
    list_to_df_emotions.append(emotion_analyzer(chapter_info[2]))
    
df_emotions = pd.DataFrame(list_to_df_emotions)
# add anger, surprise
df_emotions['all_pos']=(df_emotions['trust']+df_emotions['positive']+ df_emotions['joy']+ df_emotions['anticipation'])
df_emotions['all_neg']=(df_emotions['fear']+df_emotions['negative']+ df_emotions['disgust']+ df_emotions['sadness'])
df_emotions['net']=(df_emotions['all_pos']-df_emotions['all_neg'])

In [None]:
df_chapters = pd.concat([df_chapters, df_emotions], axis=1, sort=False)
df_chapters

### Get overall readability of entire book.

In [None]:
whole_book_str = ''

for chapter in list_of_chapters:
    whole_book_str += str(chapter[2].replace('\n', ''))

print(textstat.text_standard(whole_book_str))

### Export to csv

In [None]:
df_final.to_csv('Data/Flowers for Algernon - All word tokens.csv')
df_chapters.to_csv('Data/Flowers for Algernon - Chapter summaries.csv')

In [None]:
# .generate
# generate random text using a trigram model
# https://stackoverflow.com/questions/1150144/generating-random-sentences-from-custom-text-in-pythons-nltk