# NLP and the Pipeline

```yaml
Course:   DS 5001
Module:   04 Lab
Topic:    NLP and the Pipeline
Author:   R.C. Alvarado
Date:     5 February 2023
```

**Purpose**: We import a collection of texts and convert them to F2. Then we annotate the collection to create an F3-level model.

# Set Up

## Config

To install plotly_express with conda:

```bash
conda install plotly::plotly_express 
```

In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import plotly_express as px

In [2]:
import configparser

In [3]:
# Read machine-specific locations from the shared env.ini file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list and fail with a clear message
# instead of an opaque KeyError on the first lookup below.
config_path = "../../../env.ini"
config = configparser.ConfigParser()
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read config file: {config_path}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [4]:
source_files = f'{data_home}/gutenberg/austen-melville-set'
data_prefix = 'austen-melville'

In [5]:
# NOTE(review): this list is not referenced elsewhere in the visible notebook;
# tokenize_collection() builds its index from the parser's own `text.OHCO`.
# The parser log and later cells use `chap_id` where this list says `chap_num`
# -- confirm which name is intended.
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

In [6]:
import sys
sys.path.append(local_lib)

In [7]:
from textparser import TextParser

# Inspect

Since Project Gutenberg texts vary widely in their markup, we define our chunking patterns by hand.

In [8]:
# Patterns that appear to mark the start and end of the body text in a
# Project Gutenberg file.
# NOTE(review): tokenize_collection() defines its own identical copy, so this
# module-level list is not used elsewhere in the visible notebook.
clip_pats = [
    r"\*\*\*\s*START OF",
    r"\*\*\*\s*END OF"
]

# Chapter-heading patterns, one per book. Every pattern is used as an OHCO
# level named 'chap' with the flag 'm' (presumably "milestone", going by the
# parser log) -- see tokenize_collection().
roman = '[IVXLCM]+'
# NOTE(review): `caps` is not referenced anywhere in the visible notebook.
caps = "[A-Z';, -]+"
# Each entry is (book_id, regex). The list is turned into a book_id -> regex
# lookup when the `chap_regex` column of LIB is filled in.
ohco_pat_list = [
    (158,   rf"^\s*CHAPTER\s+{roman}\s*$"),
    (946,   rf"^\s*{roman}\s*$"),
    (1212,  rf"^\s*LETTER .* to .*$"),
    (141,   rf"^CHAPTER\s+{roman}$"),
    (121,   rf"^CHAPTER\s+\d+$"),
    (105,   rf"^Chapter\s+\d+$"),
    (1342,  rf"^Chapter\s+\d+$"),
    (161,   rf"^CHAPTER\s+\d+$"),    
    (15422, rf"^\s*CHAPTER\s+{roman}\."),
    (13720, rf"^\s*CHAPTER\s+{roman}\s*$"),
    (13721, rf"^\s*CHAPTER\s+{roman}\s*$"),
    (2701,  rf"^(?:ETYMOLOGY|EXTRACTS|CHAPTER)"),
    (4045,  rf"^\s*CHAPTER\s+{roman}\.\s*$"),
    (34970, rf"^\s*{roman}\.\s*$"),
    (8118,  rf"^\s*{roman}\. .*$"),
    (21816, rf"^CHAPTER\s+{roman}\.?$"),
    (15859, rf"^\s*[A-Z,;-]+\.\s*$"),
    (1900,  rf"^CHAPTER "),
    (10712, rf"^CHAPTER\s+{roman}\.\s*$")
]

# Register

We collect each source file and add it to a library table, `LIB`.

In [9]:
source_file_list = sorted(glob(f"{source_files}/*.*"))

In [10]:
# Build one (book_id, source_file_path, raw_title) record per source file.
book_data = []
for source_file_path in source_file_list:
    # Keep only the file name. glob can hand back mixed separators on Windows
    # (forward slashes from the configured directory plus a backslash before
    # the file name), so split on either separator rather than guessing which
    # one is in use.
    file_name = re.split(r'[\\/]', source_file_path)[-1]

    # File names look like "AUTHOR_FIRST_TITLE_WORDS-pg105.txt": everything
    # before the LAST hyphen is the raw title, everything after it carries the
    # book id. Splitting once from the right keeps hyphenated titles intact.
    pieces = file_name.rsplit('-', 1)

    # "pg105.txt" -> 105
    book_id = int(pieces[-1].split('.')[0].replace('pg', ''))

    # "AUSTEN_JANE_PERSUASION" -> "AUSTEN JANE PERSUASION"
    book_title = pieces[0].replace('_', ' ')

    book_data.append((book_id, source_file_path, book_title))

In [11]:
# Library table: one row per registered book, keyed and ordered by book_id.
LIB = (
    pd.DataFrame(book_data, columns=['book_id', 'source_file_path', 'raw_title'])
    .set_index('book_id')
    .sort_index()
)


In [12]:
LIB

Unnamed: 0_level_0,source_file_path,raw_title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
105,C:/Users/Andre/OneDrive - University of Virgin...,AUSTEN JANE PERSUASION
121,C:/Users/Andre/OneDrive - University of Virgin...,AUSTEN JANE NORTHANGER ABBEY
141,C:/Users/Andre/OneDrive - University of Virgin...,AUSTEN JANE MANSFIELD PARK
158,C:/Users/Andre/OneDrive - University of Virgin...,AUSTEN JANE EMMA
161,C:/Users/Andre/OneDrive - University of Virgin...,AUSTEN JANE SENSE AND SENSIBILITY
946,C:/Users/Andre/OneDrive - University of Virgin...,AUSTEN JANE LADY SUSAN
1212,C:/Users/Andre/OneDrive - University of Virgin...,AUSTEN JANE LOVE AND FREINDSHIP SIC
1342,C:/Users/Andre/OneDrive - University of Virgin...,AUSTEN JANE PRIDE AND PREJUDICE
1900,C:/Users/Andre/OneDrive - University of Virgin...,MELVILLE HERMAN TYPEE A ROMANCE OF THE SOUTH SEAS
2701,C:/Users/Andre/OneDrive - University of Virgin...,MELVILLE HERMAN MOBY DICK OR THE WHALE


In [13]:
# Scratch check of the id parsing. `source_file_path` is the loop variable left
# over from the registration loop, so this shows the id of the last file it visited.
id_part = source_file_path.rsplit('-', 1)[-1]
book_id = int(id_part.split('.')[0].replace('pg', ''))
book_id

10712

In [14]:
# Split raw_title ("SURNAME FORENAME TITLE WORDS ...") into author and title.
# The explicit column check keeps the cell safe to re-run (raw_title is dropped
# on the first run) without swallowing unrelated AttributeErrors.
if 'raw_title' in LIB.columns:
    title_words = LIB['raw_title'].str.split()
    # First two words -> "SURNAME, FORENAME"
    LIB['author'] = title_words.str[:2].str.join(', ')
    # Remaining words -> title
    LIB['title'] = title_words.str[2:].str.join(' ')
    LIB = LIB.drop(columns='raw_title')

In [15]:
LIB

Unnamed: 0_level_0,source_file_path,author,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
105,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",PERSUASION
121,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",NORTHANGER ABBEY
141,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",MANSFIELD PARK
158,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",EMMA
161,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",SENSE AND SENSIBILITY
946,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",LADY SUSAN
1212,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",LOVE AND FREINDSHIP SIC
1342,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",PRIDE AND PREJUDICE
1900,C:/Users/Andre/OneDrive - University of Virgin...,"MELVILLE, HERMAN",TYPEE A ROMANCE OF THE SOUTH SEAS
2701,C:/Users/Andre/OneDrive - University of Virgin...,"MELVILLE, HERMAN",MOBY DICK OR THE WHALE


## Save Chapter regexes

In [16]:
# Look up each registered book's hand-written chapter regex by book_id.
chap_regex_by_book = dict(ohco_pat_list)
LIB['chap_regex'] = LIB.index.map(chap_regex_by_book)

# A book without a pattern would otherwise carry NaN into the parser and fail
# there with an unrelated-looking error, so report it here instead.
books_without_regex = LIB.index[LIB['chap_regex'].isna()].tolist()
if books_without_regex:
    raise ValueError(
        f"No chapter regex defined in ohco_pat_list for book_id(s): {books_without_regex}"
    )

In [17]:
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
105,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",PERSUASION,^Chapter\s+\d+$
121,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",NORTHANGER ABBEY,^CHAPTER\s+\d+$
141,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",MANSFIELD PARK,^CHAPTER\s+[IVXLCM]+$
158,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",EMMA,^\s*CHAPTER\s+[IVXLCM]+\s*$
161,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",SENSE AND SENSIBILITY,^CHAPTER\s+\d+$
946,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",LADY SUSAN,^\s*[IVXLCM]+\s*$
1212,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",LOVE AND FREINDSHIP SIC,^\s*LETTER .* to .*$
1342,C:/Users/Andre/OneDrive - University of Virgin...,"AUSTEN, JANE",PRIDE AND PREJUDICE,^Chapter\s+\d+$
1900,C:/Users/Andre/OneDrive - University of Virgin...,"MELVILLE, HERMAN",TYPEE A ROMANCE OF THE SOUTH SEAS,^CHAPTER
2701,C:/Users/Andre/OneDrive - University of Virgin...,"MELVILLE, HERMAN",MOBY DICK OR THE WHALE,^(?:ETYMOLOGY|EXTRACTS|CHAPTER)


## Tokenize Corpus

We tokenize each book and add each `TOKENS` table to a list to be concatenated into a single `CORPUS`.

In [18]:
def tokenize_collection(LIB, clip_pats=None):
    """Tokenize every book registered in LIB and stack the results into one table.

    Parameters
    ----------
    LIB : pandas.DataFrame
        Library table indexed by book_id. Each row must provide
        `source_file_path`, `title` and `chap_regex`.
    clip_pats : list of str, optional
        Start and end regexes handed to TextParser as `clip_pats`. Defaults to
        the `*** START OF` / `*** END OF` marker patterns.

    Returns
    -------
    pandas.DataFrame
        The concatenated TOKENS tables of all books, indexed by
        ['book_id'] + the parser's OHCO levels and sorted by that index.

    Raises
    ------
    ValueError
        If LIB has no rows.
    """
    if clip_pats is None:
        clip_pats = [
            r"\*\*\*\s*START OF",
            r"\*\*\*\s*END OF"
        ]

    # Fail early with a clear message; otherwise pd.concat() below would raise
    # the much less helpful "No objects to concatenate".
    if len(LIB.index) == 0:
        raise ValueError("LIB contains no books to tokenize")

    books = []
    for book_id in LIB.index:

        # Fetch the row once instead of repeating the .loc lookup per field
        book = LIB.loc[book_id]

        # Announce
        print("Tokenizing", book_id, book['title'])

        # The chapter regex is the only book-specific OHCO pattern
        ohco_pats = [('chap', book['chap_regex'], 'm')]

        # Create object
        text = TextParser(book['source_file_path'], ohco_pats=ohco_pats,
                          clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens()

        # Prefix the parser's own OHCO index with the book id
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)

    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    print("Done")

    return CORPUS

In [19]:
LIB.loc[15859].chap_regex

'^\\s*[A-Z,;-]+\\.\\s*$'

In [20]:
CORPUS = tokenize_collection(LIB)

Tokenizing 105 PERSUASION
Importing  C:/Users/Andre/OneDrive - University of Virginia/Course Materials/Spring 2024/DS5001/data/gutenberg/austen-melville-set\AUSTEN_JANE_PERSUASION-pg105.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^Chapter\s+\d+$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 121 NORTHANGER ABBEY
Importing  C:/Users/Andre/OneDrive - University of Virginia/Course Materials/Spring 2024/DS5001/data/gutenberg/austen-melville-set\AUSTEN_JANE_NORTHANGER_ABBEY-pg121.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^CHAPTER\s+\d+$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 141 MANSFIELD PARK
Importing  C:/Users/Andre/OneDrive - University of


KeyboardInterrupt



## Extract some features for `LIB`

In [None]:
LIB['book_len'] = CORPUS.groupby('book_id').term_str.count()

In [None]:
LIB.sort_values('book_len')

In [None]:
# Number of distinct chapters per book.
LIB['n_chaps'] = (
    CORPUS.reset_index()
    .groupby('book_id')['chap_id']
    .nunique()
)

# Explore

In [None]:
px.scatter(LIB, 'n_chaps', 'book_len', 
           color='author', text='n_chaps', size='book_len', 
           hover_name='title', width=800, height=500) 

In [None]:
LIB.groupby('author')[['book_len', 'n_chaps']].agg(('mean','sum'))

# Extract VOCAB

Extract a vocabulary from the CORPUS as a whole

## Handle Anomalies

NLTK's POS tagger is not perfect -- note that it classifies punctuation as nouns, verbs, etc. We remove these tokens from our corpus.

In [None]:
CORPUS[CORPUS.term_str == '']

In [None]:
CORPUS[CORPUS.term_str == ''].token_str.value_counts()

In [None]:
# Drop tokens whose normalized term is empty. Take an explicit copy so that the
# column added to CORPUS afterwards is written to an independent frame rather
# than to a filtered slice (avoids pandas' SettingWithCopyWarning).
CORPUS = CORPUS[CORPUS.term_str != ''].copy()

In [None]:
CORPUS['pos_group'] = CORPUS.pos.str[:2]

In [None]:
CORPUS

In [None]:
# Vocabulary table: one row per distinct term, ordered alphabetically.
VOCAB = (
    CORPUS['term_str']
    .value_counts()
    .to_frame('n')
    .sort_index()
    .rename_axis('term_str')
)
# Term length, relative frequency, and self-information in bits.
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB['n'] / VOCAB['n'].sum()
VOCAB['i'] = -np.log2(VOCAB['p'])

In [None]:
VOCAB

# Annotate VOCAB

## Get Max POS

Get the most frequently associated part-of-speech category for each word.

In [None]:
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [None]:
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

## Compute POS ambiguity

How many POS categories are associated with each word?

In [None]:
# Tally every (term, POS group) pair once, then derive both measures from it:
# how many groups each term appears with, and which ones.
_pos_group_pairs = CORPUS[['term_str', 'pos_group']].value_counts()
VOCAB['n_pos_group'] = _pos_group_pairs.unstack().count(axis=1)
VOCAB['cat_pos_group'] = (
    _pos_group_pairs.to_frame('n').reset_index()
    .groupby('term_str')['pos_group']
    .apply(set)
)

In [None]:
# Same two measures at the level of the full POS tag.
_pos_pairs = CORPUS[['term_str', 'pos']].value_counts()
VOCAB['n_pos'] = _pos_pairs.unstack().count(axis=1)
VOCAB['cat_pos'] = (
    _pos_pairs.to_frame('n').reset_index()
    .groupby('term_str')['pos']
    .apply(set)
)

In [None]:
VOCAB

In [None]:
# nltk.help.upenn_tagset()

## Add Stopwords

We use NLTK's built-in stopword list for English. Note that we can add to or subtract from this list, or just create our own list and keep it in our data model.

In [None]:
# Stopword lookup table: one row per NLTK English stopword, indexed by
# term_str, with a constant flag column `dummy` equal to 1.
sw = pd.DataFrame(
    {'dummy': 1},
    index=pd.Index(nltk.corpus.stopwords.words('english'), name='term_str'),
)

In [None]:
# sw.sample(10)

In [None]:
# Flag each vocabulary term: 1 if it is in the stopword table, else 0.
VOCAB['stop'] = VOCAB.index.isin(sw.index).astype('int')

In [None]:
VOCAB[VOCAB.stop == 1].sample(10)

## Interlude: Stopword Stats

In [None]:
a = VOCAB.groupby('stop').n_chars.mean()
b = VOCAB.groupby('stop').n_pos.mean().sort_values(ascending=False)

In [None]:
pd.concat([a,b], axis=1)

In [None]:
VOCAB.groupby('n_chars').n_pos.mean()\
    .sort_index().plot.bar(rot=0);

Curious that stopwords would have such variability.

In [None]:
VOCAB[VOCAB.stop == True].sort_values('n_pos', ascending=False)[['n_pos','cat_pos']].head(20)

Anyway, let's compare stopword usage across authors.

In [None]:
# Attach each token's author (from LIB) and stopword flag (from VOCAB), then
# aggregate per (author, stop) pair and pivot `stop` into the columns.
# NOTE(review): agg('sum', numeric_only=True) sums whatever numeric columns
# survive the merges rather than counting tokens. If a token count per
# author/stop is intended, confirm this yields it (e.g. compare with .size()).
X = CORPUS.merge(LIB.reset_index()[['book_id','author']], on='book_id')\
    .merge(VOCAB.reset_index()[['term_str', 'stop']], on='term_str')\
    .groupby(['author','stop']).agg('sum', numeric_only=True).unstack()
# Drop the outer column level so that only the `stop` values label the columns.
X.columns = X.columns.droplevel(0)

In [None]:
(X.T / X.T.sum()).T.style.background_gradient(axis=None)

## Add Stems

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

# The terms live in the VOCAB index, so stem the index directly instead of
# building a row Series for every term with DataFrame.apply(axis=1) just to
# read its name.
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.index.map(stemmer1.stem)

stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.index.map(stemmer2.stem)

stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.index.map(stemmer3.stem)

In [None]:
VOCAB.sample(10)

In [None]:
VOCAB[VOCAB.stem_porter != VOCAB.stem_snowball]

# Save

In [None]:
out_path = f'{output_dir}/{data_prefix}'

In [None]:
LIB.to_csv(f'{out_path}-LIB.csv')
VOCAB.to_csv(f'{out_path}-VOCAB.csv')
CORPUS.to_csv(f'{out_path}-CORPUS.csv')