# Homework 4

```yaml
Course:   DS 5001
Module:   04 Lab
Topic:    Homework 4
Author:   Andrew Avitabile
Date:     09 February 2024 (revised and improved)
```

**Purpose**: Build a tokenized corpus from the George Eliot novel set, derive a vocabulary annotated with part-of-speech, stopword, and stemming information, and answer the Homework 4 questions.

# Set Up

In [None]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import plotly_express as px

In [None]:
import configparser

In [None]:
# Load machine-specific locations from the shared environment file.
config = configparser.ConfigParser()
config.read("../../../env.ini")

# All three settings are read from the DEFAULT section.
defaults = config['DEFAULT']
data_home = defaults['data_home']
output_dir = defaults['output_dir']
local_lib = defaults['local_lib']

In [None]:
# Prefix used for output file names, and the folder holding the source texts.
data_prefix = 'eliot'
source_files = f'{data_home}/gutenberg/eliot-set'

In [None]:
# Hierarchy levels from book down to token.
# NOTE(review): later cells group on `chap_id`, not `chap_num`, and index by
# the parser's own OHCO -- confirm whether this list is still needed.
OHCO = [
    'book_id',
    'chap_num',
    'para_num',
    'sent_num',
    'token_num',
]

In [None]:
import sys
sys.path.append(local_lib)

In [None]:
from textparser import TextParser

# Inspect

In [None]:
# Patterns matching the "*** START OF" / "*** END OF" marker lines.
clip_pats = [
    r"\*\*\*\s*START OF",
    r"\*\*\*\s*END OF"
]

# Building blocks for chapter-heading patterns.
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"

# (book_id, chapter regex) pairs. Every entry is later used as a 'chap'
# level with match type 'm' when the books are tokenized.
# Defined here so the pattern list exists before the cell that maps it onto
# LIB; `roman` already carries its own "+" quantifier.
ohco_pat_list = [
    (145,   rf"(PRELUDE|CHAPTER\s+{roman})"),  # Middlemarch
    (507,   rf'Chapter\s{roman}'),
    (6688,  rf'Chapter\s{roman}')
]

# Register

We get each file and add to a library `LIB`.

In [None]:
# Every file with an extension in the source folder, in sorted order.
source_file_list = glob(f"{source_files}/*.*")
source_file_list.sort()

In [None]:
# Collect one (book_id, path, title) record per source file.
# The file name is expected to end in "-pg<ID>.<ext>", with the raw title
# (underscores standing in for spaces) before the first dash.
book_data = []
for source_file_path in source_file_list:
    # Use "\" as the separator when the path contains one (Windows paths),
    # otherwise "/"; the file name is whatever follows the last separator.
    sep = '\\' if '\\' in source_file_path else '/'
    file_name = source_file_path.rpartition(sep)[2]
    dash_parts = file_name.split('-')

    # Book id: last dash-separated piece, minus extension and "pg" prefix.
    book_id = int(dash_parts[-1].split('.')[0].replace('pg', ''))

    # Raw title: first dash-separated piece with underscores turned to spaces.
    book_title = dash_parts[0].replace('_', ' ')

    book_data.append((book_id, source_file_path, book_title))

In [None]:
# Library table: one row per book, keyed and ordered by book_id.
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','raw_title'])
LIB = LIB.set_index('book_id').sort_index()

In [None]:
LIB

In [None]:
# Re-derive the book id from the last path handled by the loop above:
# everything after the final "-", minus the extension and the "pg" prefix.
file_tail = source_file_path.rsplit('-', 1)[-1]
book_id = int(file_tail.partition('.')[0].replace('pg', ''))
book_id

In [None]:
# Split raw_title into author (first two words, comma-joined) and title
# (the remaining words), then drop raw_title.
# The explicit column check keeps the cell safe to re-run after raw_title
# has been dropped, without hiding unrelated AttributeErrors.
if 'raw_title' in LIB.columns:
    LIB['author'] = LIB.raw_title.apply(lambda x: ', '.join(x.split()[:2]))
    LIB['title'] = LIB.raw_title.apply(lambda x: ' '.join(x.split()[2:]))
    LIB = LIB.drop('raw_title', axis=1)

In [None]:
LIB

## Save Chapters

In [None]:
# Attach each book's chapter regex by looking up its book_id.
# Requires ohco_pat_list ((book_id, regex) pairs) to be defined before this
# cell runs.
chap_regex_by_book = pd.Series({pair[0]: pair[1] for pair in ohco_pat_list})
LIB['chap_regex'] = LIB.index.map(chap_regex_by_book)

In [None]:
LIB

## Tokenize Corpus

We tokenize each book and add each `TOKENS` table to a list to be concatenated into a single `CORPUS`.

In [None]:
def tokenize_collection(LIB):
    """Tokenize every book listed in LIB and return one combined token table.

    For each book_id in ``LIB.index`` a TextParser is built from that row's
    ``source_file_path`` and ``chap_regex``, the source is imported and
    parsed, and the parser's TOKENS table is re-indexed by ``book_id`` plus
    the parser's own OHCO levels. Progress is printed per book.

    Parameters:
        LIB: DataFrame indexed by book_id with ``title``, ``chap_regex`` and
            ``source_file_path`` columns.

    Returns:
        The per-book TOKENS tables concatenated and sorted by index.
    """
    clip_pats = [
        r"\*\*\*\s*START OF",
        r"\*\*\*\s*END OF"
    ]

    token_tables = []
    for book_id in LIB.index:
        book = LIB.loc[book_id]

        # Announce
        print("Tokenizing", book_id, book.title)

        # The chapter regex is the only OHCO pattern, with match type 'm'.
        chap_regex = book.chap_regex
        src_file_path = book.source_file_path
        parser = TextParser(
            src_file_path,
            ohco_pats=[('chap', chap_regex, 'm')],
            clip_pats=clip_pats,
            use_nltk=True,
        )

        # Parser options are set as attributes before parsing.
        parser.verbose = True
        parser.strip_hyphens = True
        parser.strip_whitespace = True

        parser.import_source().parse_tokens()

        # Prefix the parser's OHCO index with the book id.
        parser.TOKENS['book_id'] = book_id
        parser.TOKENS = parser.TOKENS.reset_index().set_index(['book_id'] + parser.OHCO)

        token_tables.append(parser.TOKENS)

    # Combine into a single dataframe
    CORPUS = pd.concat(token_tables).sort_index()

    print("Done")

    return CORPUS

In [None]:
# Tokenize every book registered in LIB into a single token table.
CORPUS = tokenize_collection(LIB)

## Extract some features for `LIB`

In [None]:
# Number of non-null terms per book.
LIB['book_len'] = CORPUS.term_str.groupby(level='book_id').count()

In [None]:
LIB.sort_values('book_len')

In [None]:
# Number of distinct (book_id, chap_id) pairs per book.
book_chap_pairs = CORPUS.reset_index()[['book_id','chap_id']].drop_duplicates()
LIB['n_chaps'] = book_chap_pairs.groupby('book_id').chap_id.count()

# Extract VOCAB

Extract a vocabulary from the CORPUS as a whole

## Handle Anomalies

NLTK's POS tagger is not perfect -- note the classification of punctuation as nouns, verbs, etc. We remove these from our corpus.

In [None]:
CORPUS[CORPUS.term_str == '']

In [None]:
CORPUS[CORPUS.term_str == ''].token_str.value_counts()

In [None]:
# Drop tokens whose normalized term is empty. Take an explicit copy so that
# columns added to CORPUS afterwards are written to an independent frame
# rather than to a slice of the original (avoids SettingWithCopyWarning).
CORPUS = CORPUS[CORPUS.term_str != ''].copy()

In [None]:
# Coarse POS category: the first two characters of the tag.
CORPUS['pos_group'] = CORPUS.pos.str.slice(0, 2)

In [None]:
CORPUS

In [None]:
# One row per distinct term, ordered by term, with its corpus frequency.
term_freqs = CORPUS.term_str.value_counts()
VOCAB = term_freqs.to_frame('n').sort_index()
VOCAB.index.name = 'term_str'

# Term length, relative frequency, and self-information in bits.
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)

In [None]:
VOCAB

# Annotate VOCAB

## Get Max POS

Get the most frequently associated part-of-speech category for each word.

In [None]:
# Term-by-tag count table; the most frequent tag is the column with the
# largest count in each row.
pos_counts = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0)
VOCAB['max_pos'] = pos_counts.idxmax(axis=1)

In [None]:
# Same as max_pos, but over the two-character POS groups.
pos_group_counts = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0)
VOCAB['max_pos_group'] = pos_group_counts.idxmax(axis=1)

## Compute POS ambiguity

How many POS categories are associated with each word?

In [None]:
# Counts of each (term, POS group) pair, computed once and used twice.
term_group_counts = CORPUS[['term_str','pos_group']].value_counts()

# How many POS groups each term occurs with, and which ones.
VOCAB['n_pos_group'] = term_group_counts.unstack().count(axis=1)
VOCAB['cat_pos_group'] = term_group_counts.to_frame('n').reset_index()\
    .groupby('term_str').pos_group.apply(set)

In [None]:
# Counts of each (term, POS tag) pair, computed once and used twice.
term_tag_counts = CORPUS[['term_str','pos']].value_counts()

# How many POS tags each term occurs with, and which ones.
VOCAB['n_pos'] = term_tag_counts.unstack().count(axis=1)
VOCAB['cat_pos'] = term_tag_counts.to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(set)

In [None]:
VOCAB

## Add Stopwords

In [None]:
# Stopword lookup table: one row per NLTK English stopword, indexed by
# term_str, with a constant `dummy` column set to 1.
stop_terms = pd.Index(nltk.corpus.stopwords.words('english'), name='term_str')
sw = pd.DataFrame({'dummy': 1}, index=stop_terms)

In [None]:
# Flag stopwords: 1 when the term is found in the stopword table, else 0.
stop_flags = pd.Series(VOCAB.index.map(sw.dummy), index=VOCAB.index)
VOCAB['stop'] = stop_flags.fillna(0).astype('int')

In [None]:
VOCAB[VOCAB.stop == 1].sample(10)

## Interlude: Stopword Stats

In [None]:
# Mean term length and mean number of POS tags, split by stopword flag.
by_stop = VOCAB.groupby('stop')
a = by_stop.n_chars.mean()
b = by_stop.n_pos.mean().sort_values(ascending=False)

In [None]:
pd.concat([a,b], axis=1)

In [None]:
VOCAB.groupby('n_chars').n_pos.mean()\
    .sort_index().plot.bar(rot=0);

In [None]:
# The 20 stopwords that occur with the most distinct POS tags.
stopword_rows = VOCAB[VOCAB.stop == 1]
stopword_rows.sort_values('n_pos', ascending=False)[['n_pos','cat_pos']].head(20)

In [None]:
# Count tokens per (author, stop flag): one row per author, one column per
# stop value (0 = non-stopword tokens, 1 = stopword tokens).
# size() counts rows; summing the numeric columns that survive the merges
# (presumably only book_id -- verify) would not give a token count.
X = CORPUS.merge(LIB.reset_index()[['book_id','author']], on='book_id')\
    .merge(VOCAB.reset_index()[['term_str', 'stop']], on='term_str')\
    .groupby(['author','stop']).size().unstack()

In [None]:
(X.T / X.T.sum()).T.style.background_gradient(axis=None)

## Add Stems

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

# Each stemmer only needs the term itself, which is the VOCAB index, so the
# stem function is mapped over the index directly instead of applying a
# lambda to every row of the frame.
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.index.map(stemmer1.stem)

stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.index.map(stemmer2.stem)

stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.index.map(stemmer3.stem)

In [None]:
VOCAB.sample(10)

In [None]:
VOCAB[VOCAB.stem_porter != VOCAB.stem_snowball]

# Save

In [None]:
# Common path prefix for all saved tables: <output_dir>/<data_prefix>.
out_path = f'{output_dir}/{data_prefix}'

In [None]:
# Write the three tables as <out_path>-<NAME>-eliot-set.csv, in the order
# LIB, VOCAB, CORPUS.
for table_name, table in (('LIB', LIB), ('VOCAB', VOCAB), ('CORPUS', CORPUS)):
    table.to_csv(f'{out_path}-{table_name}-eliot-set.csv')

# Questions

## 1. What regular expression did you use to chunk _Middlemarch_ into chapters?

In [None]:
# Chapter-heading patterns as (book_id, regex) pairs.
# `roman` already ends in "+", so it is interpolated as-is: a second "+"
# would produce "[IVXLCM]++", which is rejected as "multiple repeat" before
# Python 3.11 and read as a possessive quantifier from 3.11 on.
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
ohco_pat_list = [
    (145,   rf"(PRELUDE|CHAPTER\s+{roman})"), #This one is for Middlemarch
    (507,   rf'Chapter\s{roman}'),
    (6688,  rf'Chapter\s{roman}')
]

## 2. What is the title of the book that has the most tokens? 

In [None]:
# Count the tokens (rows of CORPUS) in each book, largest first.
# The largest `token_num` index value is not a token count -- presumably
# token_num is a position within a sentence; verify against TextParser.
# NOTE(review): re-check the answer recorded below against this table.
max_token_num_by_book = CORPUS.groupby('book_id').size()
max_token_num_by_book_df = max_token_num_by_book.reset_index(name='n_tokens')\
    .sort_values('n_tokens', ascending=False)
max_token_num_by_book_df

ADAM BEDE

## 3. How many chapter level chunks are there in this novel?

In [None]:
# Number of distinct chapter ids in each book.
chap_ids_by_book = CORPUS.reset_index().groupby('book_id')['chap_id']
chap_id_count_by_book = chap_ids_by_book.nunique().reset_index(name='unique_chap_id_count')
chap_id_count_by_book

61

## 4. Among the three stemming algorithms -- Porter, Lancaster, and Snowball --  which is the most aggressive, in terms of the number of words associated with each stem?

Lancaster is the most aggressive stemmer

In [None]:
# Number of vocabulary terms sharing each Porter stem, then summary stats.
porter_group_sizes = VOCAB.groupby(['stem_porter']).size()
term_counts_porter = porter_group_sizes.reset_index(name='term_count')
term_counts_porter['term_count'].describe()

In [None]:
# Number of vocabulary terms sharing each Lancaster stem, then summary stats.
lancaster_group_sizes = VOCAB.groupby(['stem_lancaster']).size()
term_counts_lancaster = lancaster_group_sizes.reset_index(name='term_count')
term_counts_lancaster['term_count'].describe()

In [None]:
# Number of vocabulary terms sharing each Snowball stem, then summary stats.
snowball_group_sizes = VOCAB.groupby(['stem_snowball']).size()
term_counts_snowball = snowball_group_sizes.reset_index(name='term_count')
term_counts_snowball['term_count'].describe()

## 5. Using the most aggressive stemmer from the previous question, what is the stem with the most associated terms?

"cont"

In [None]:
# Show the stem(s) with the most associated terms. The maximum is computed
# from the table rather than hard-coded, so the cell stays correct if the
# corpus or the stemmer output changes.
most_terms = term_counts_lancaster['term_count'].max()
term_counts_lancaster[term_counts_lancaster['term_count'] == most_terms]