In [2]:
import re, warnings, pickle
from pathlib import Path
import numpy as np

from nltk import pos_tag, word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import words

### Read in books

In [3]:
def read_in_books(path:'str',lines_to_skip:int)->str:
    """Used to read in books

    Args:
        path (str): path to book location
        lines_to_skip (int): number of lines to skip

    Returns:
        str: raw book text
    """
    with open(path, 'r') as file:
        for _ in range(lines_to_skip):
            next(file)
        return file.read()

In [4]:
## Defoe books ##
# Robinson Crusoe
rc_text = read_in_books('../books/daniel_defoe/robinson_crusoe.txt',12)
# Buccaneers and Marooners
bm_text = read_in_books('../books/daniel_defoe/buccaneers_and_marooners.txt',16)
# Captain Singleton
cs_text = read_in_books('../books/daniel_defoe/captain_singleton.txt',4)


## Swift books ##
# Gulliver's Travels
gt_text = read_in_books('../books/jonathan_swift/gullivers_travels.txt', 28)
# Tale of a tub
tot_text = read_in_books('../books/jonathan_swift/tale_of_a_tub.txt', 249)


## Who is this ##
# General History of Pyrates
pyrates_text = read_in_books('../books/gen_history_of_the_pyrates.txt', 52)


### Helper functions

In [5]:
def mist_text_data_prep(text:str, cutoff:int = 50)->list:
    """Used to read in, clean and keep Mist Weekly Journal Sentences

    Args:
        text (str): original text
        cutoff (int, optional): Cutoff length. Defaults to 50.

    Returns:
        list: List of sentences to keep
    """

    cleaned_text = re.sub(r'[^a-zA-Z0-9\s[:punct:]]', '', text)
    sentences = sent_tokenize(cleaned_text)
    
    cleaned_sentences = []
    valid_words = set(words.words())

    for s in sentences:
        last_item = s[-1:]
        if last_item not in ['.','!',';','?']:
            last_item = ''

        tokens = s.split()
        
        filtered_tokens = [word for word in tokens if word.lower() in valid_words]
        
        filtered_text = ' '.join(filtered_tokens)

        filtered_text+=last_item

        if len(filtered_tokens)>=cutoff:

            cleaned_sentences.append(filtered_text)
    
    return cleaned_sentences, len(sentences)

In [6]:
def check_proper_nouns(text: str)->list:
    """Checks for proper nouns in a given text using NLTK.

    Args:
        text (str): _description_

    Returns:
        list: _description_
    """

    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    proper_nouns = [word for word, pos in tagged if pos == 'NNP']

    return proper_nouns

In [7]:
def fix_capitalization(text:str)-> str:
    """Some books have very poor capitalization so this looks and decides
    if words are proper nouns and should be capitalized

    Args:
        text (str): text too look at

    Returns:
        str: Properly capitalized text
    """
    
    proper_nouns = check_proper_nouns(text)

    text = text.lower()
    text = text[0].upper() + text[1:]
    
    for noun in proper_nouns:
        pattern = re.compile(r'\b' + re.escape(noun.lower()) + r'\b', re.IGNORECASE)
        text = pattern.sub(noun, text)

    text = text.replace(' i ',' I ')
    text = text.replace(" i'm "," I'm ")

    return text

In [8]:
def find_words_starting_with_capital(text:str)->list:
    """Used for EDA to find words with capital letters

    Args:
        text (str): text to look at

    Returns:
        list: list of capital words
    """
    return re.findall(r'\b[A-Z][a-z]*\b', text)

In [9]:
def text_data_prep(text:str, book:str)->list:
    """This function preps the text for the RNN model. Each text has its own process as they are
    very different

    Args:
        text (str): the text of the book
        book (str): the book

    Returns:
        list: list of setences
    """
    sentences = sent_tokenize(text)

    cleaned_sentences = [s.replace('\n',' ') for s in sentences]
    cleaned_sentences = [s.replace('_','') for s in cleaned_sentences]

    if book == 'rc':
        pass

    elif book == 'gt':
        pass

    elif book == 'bm':
        pass

    elif book == 'cs':
        pass

    elif book == 'tot':
        pass

    elif book == 'pyrates':

        cleaned_sentences = [fix_capitalization(s) for s in cleaned_sentences]

    else:
        warnings.warn('The book name provided - {} - has not been reviewed.'.format(book), UserWarning)


    return cleaned_sentences

In [10]:
def keep_sentence(sentence_list:list, cutoff: int=50)->list:
    """Determines sentences to keep based off their length

    Args:
        sentence_list (list): list of sentences from your text
        cutoff (int, optional): How many tokens the sentence must have to keep. Defaults to 50.

    Returns:
        list: list of sentences to keep
    """
    return_list = []
    for s in sentence_list:
        tokens = word_tokenize(s)
        if len(tokens)>cutoff:
            return_list.append(s)

    return return_list

### Execution

In [11]:
rc_sentence_list = text_data_prep(rc_text, book = 'rc')
pyrates_sentence_list = text_data_prep(pyrates_text, book = 'pyrates')
gt_sentence_list = text_data_prep(gt_text, book = 'gt')
tot_sentence_list = text_data_prep(tot_text, book = 'tot')
bm_sentence_list = text_data_prep(bm_text, book = 'bm')
cs_sentence_list = text_data_prep(cs_text, book = 'cs')

In [12]:
rc_50 = keep_sentence(rc_sentence_list)
cs_50 = keep_sentence(cs_sentence_list)
bm_50 = keep_sentence(bm_sentence_list)
tot_50 = keep_sentence(tot_sentence_list)
gt_50 = keep_sentence(gt_sentence_list)
pyrates_50 = keep_sentence(pyrates_sentence_list)

In [13]:
print(len(rc_50),len(rc_sentence_list))
print(len(cs_50),len(cs_sentence_list))
print(len(bm_50),len(bm_sentence_list))
print(len(tot_50),len(tot_sentence_list))
print(len(gt_50),len(gt_sentence_list))
print(len(pyrates_50),len(pyrates_sentence_list))

2178 3703
1087 2346
982 2874
375 1087
809 2603
1126 2874


In [14]:
mist_text_paths = Path('../books/mist_weekly_journal_text').glob('*')
mist_text_files = [f for f in mist_text_paths]

In [15]:
mist_50 = []
num_sentences = 0
for f in mist_text_files:

    with open(f, 'r') as file:
        mist_text = file.read()

    cleaned_50, temp_num = mist_text_data_prep(mist_text)

    mist_50.extend(cleaned_50)
    num_sentences+=temp_num

In [16]:
len(mist_50), num_sentences

(615, 19363)

### Save dataset for model

In [21]:
with open('model_data/pyrates.pkl','wb') as file:
    pickle.dump(pyrates_50,file)

In [40]:
sentences = rc_50 + cs_50 + bm_50  + mist_50 + tot_50 + gt_50
labels = [0]*(len(rc_50) + len(cs_50) + len(bm_50)) + [1]*len(mist_50) +[2]*(len(tot_50) + len(gt_50))

In [55]:
sentences = rc_50 + bm_50  + mist_50 + tot_50 + gt_50
labels = [0]*(len(rc_50) + len(bm_50)) + [1]*len(mist_50) +[2]*(len(tot_50) + len(gt_50))

In [56]:
data_set_dict = {'sentences':sentences,'labels':labels}
with open('model_data/dataset.pkl', 'wb') as file:
    pickle.dump(data_set_dict,file)

### Save a more balanced dataset for model

In [57]:
len(rc_50 + bm_50)

3160

In [58]:
len(mist_50)

615

In [59]:
len(tot_50+gt_50)

1184

In [17]:
all_df = rc_50+bm_50
sample_df = np.random.choice(all_df, size=615, replace=False)

In [18]:
all_js = tot_50+gt_50
sample_js = np.random.choice(all_js, size=615, replace=False)

In [68]:
sample_sentences = sample_df.tolist()+mist_50+sample_js.tolist()
sample_lables = [0]*(len(sample_df.tolist())) + [1]*len(mist_50) +[2]*(len(sample_js.tolist()))

sample_data_set_dict = {'sentences':sample_sentences,'labels':sample_lables}
with open('model_data/sample_dataset.pkl', 'wb') as file:
    pickle.dump(sample_data_set_dict,file)

In [19]:
sample_sentences = sample_df.tolist()+mist_50
sample_lables = [0]*(len(sample_df.tolist())) + [1]*len(mist_50)

sample_data_set_dict = {'sentences':sample_sentences,'labels':sample_lables}

with open('model_data/sample_dataset_focused.pkl', 'wb') as file:
    pickle.dump(sample_data_set_dict,file)