# Homework 5

```yaml
Course:    DS 5001
Module:    M05 Homework
Author:    Andrew Avitabile
Date:      18 February 2024
```

# Set Up

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
import configparser

# Read the shared env.ini to find where the data lives and where outputs go.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means env.ini was not found.
if not config.read("../../../env.ini"):
    raise FileNotFoundError("Could not read config file: ../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
# File-name prefix shared by the LIB / TOKEN / VOCAB tables loaded below.
data_prefix = 'austen-melville'

In [3]:
# Index levels of the token table, ordered from coarsest (book) to finest (token).
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

# Each bag name maps to the leading OHCO levels that identify one "document"
# at that granularity.
bags = {
    'SENTS': OHCO[:4],
    'PARAS': OHCO[:3],
    'CHAPS': OHCO[:2],
    'BOOKS': OHCO[:1],
}

# Prepare the data

## Import tables

In [4]:
# Library table, indexed by book_id.
LIB = (
    pd.read_csv(f"{output_dir}/{data_prefix}-LIB.csv")
    .set_index('book_id')
)

# Token table, indexed by the full OHCO; rows with any missing value are dropped.
# NOTE(review): read_csv's default NA parsing turns strings such as "null" or
# "nan" into NaN, so dropna() would also remove those words if they occur as
# tokens/terms -- confirm this is intended.
CORPUS = (
    pd.read_csv(f'{output_dir}/{data_prefix}-TOKEN.csv')
    .set_index(OHCO)
    .dropna()
)

# Vocabulary table, indexed by term_str; rows with any missing value are dropped.
VOCAB = (
    pd.read_csv(f'{output_dir}/{data_prefix}-VOCAB.csv')
    .set_index('term_str')
    .dropna()
)

## Function to compute TFIDF

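The function takes the following inputs:
- `CORPUS`: the token table (a DataFrame with a `term_str` column)
- `bag`: the OHCO level to use as the document, one of `SENTS`, `PARAS`, `CHAPS`, `BOOKS`
- `tf_method`: one of `sum`, `max`, `log`, `raw`, `double_norm`, `binary`
- `idf_method`: one of `standard`, `max`, `smooth`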

### 1. Show the function you created.

In [5]:
def compute_tfidf(CORPUS, bag, tf_method, idf_method, tf_norm_k=0.5):
    """Compute a TF-IDF matrix for a token table at a chosen bag level.

    Parameters
    ----------
    CORPUS : pandas.DataFrame
        Token table with a `term_str` column. The bag's grouping keys must be
        available as index levels (or columns) of this frame.
    bag : str or list of str
        Either a key of the notebook-level `bags` dict ('SENTS', 'PARAS',
        'CHAPS', 'BOOKS') or an explicit list of grouping keys.
    tf_method : str
        One of 'sum', 'max', 'log', 'raw', 'double_norm', 'binary'.
    idf_method : str
        One of 'standard', 'max', 'smooth'.
    tf_norm_k : float, default 0.5
        Smoothing constant K for 'double_norm':
        ``K + (1 - K) * count / max_count_in_bag``. Ignored by other methods.

    Returns
    -------
    pandas.DataFrame
        One row per bag and one column per term, holding TF * IDF.

    Raises
    ------
    ValueError
        If `tf_method` or `idf_method` is not one of the supported names.
    KeyError
        If `bag` is a string that is not a key of `bags`.
    """
    tf_methods = ('sum', 'max', 'log', 'raw', 'double_norm', 'binary')
    idf_methods = ('standard', 'max', 'smooth')
    # Validate up front so a typo fails clearly before the expensive groupby,
    # instead of surfacing later as an unbound-variable error.
    if tf_method not in tf_methods:
        raise ValueError(f"Unknown tf_method {tf_method!r}; expected one of {tf_methods}")
    if idf_method not in idf_methods:
        raise ValueError(f"Unknown idf_method {idf_method!r}; expected one of {idf_methods}")

    # A string selects a predefined bag; anything else is taken as the keys themselves.
    if isinstance(bag, str):
        bag_keys = list(bags[bag])
    else:
        bag_keys = list(bag)

    # Bag-of-words: number of occurrences of each term within each bag.
    BOW = CORPUS.groupby(bag_keys + ['term_str']).term_str.count().to_frame('n')

    # Document-term count matrix (rows = bags, columns = terms).
    DTCM = BOW.n.unstack(fill_value=0)

    # Term frequency, computed row-wise (per bag).
    print('TF method:', tf_method)
    if tf_method == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)
    elif tf_method == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)
    elif tf_method == 'log':
        TF = np.log2(1 + DTCM)
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'double_norm':
        max_scaled = DTCM.div(DTCM.max(axis=1), axis=0)
        # Apply the K offset only to terms that occur in the bag; otherwise
        # every absent term would receive a non-zero weight of K.
        TF = (tf_norm_k + (1 - tf_norm_k) * max_scaled).where(DTCM > 0, 0.0)
    else:  # 'binary'
        TF = DTCM.astype('bool').astype('int')

    # Document frequency: number of bags in which each term occurs.
    DF = DTCM.astype('bool').sum()

    # Total number of bags.
    N = DTCM.shape[0]

    # Inverse document frequency.
    print('IDF method:', idf_method)
    if idf_method == 'standard':
        IDF = np.log2(N / DF)
    elif idf_method == 'max':
        IDF = np.log2(DF.max() / DF)
    else:  # 'smooth'
        IDF = np.log2((1 + N) / (1 + DF)) + 1

    # IDF is indexed by term, so it aligns with (and scales) TF's columns.
    TFIDF = TF * IDF

    return TFIDF

### 2. What are the top 20 words in the corpus by TFIDF mean using the `max` count method and `book` as the bag?

In [6]:
# Score every term per book, scaling counts by each book's most frequent term.
TFIDF = compute_tfidf(
    CORPUS,
    bag='BOOKS',
    tf_method='max',
    idf_method='standard',
)

# Average each term's TF-IDF over all books.
mean_tfidf = TFIDF.mean(axis=0)

# Rank terms from highest to lowest mean score and keep the first 20.
top_20_words = mean_tfidf.sort_values(ascending=False).iloc[:20]
top_20_words

TF method: max
IDF method: standard


term_str
elinor        0.035065
pierre        0.031845
vernon        0.026919
marianne      0.021992
emma          0.021686
darcy         0.020000
reginald      0.019154
babbalanja    0.018803
frederica     0.018637
catherine     0.018472
crawford      0.018391
elliot        0.017670
fanny         0.017492
weston        0.017191
media         0.016469
israel        0.015808
knightley     0.015733
tilney        0.014315
elton         0.014142
bingley       0.013744
dtype: float64

### 3. What are the top 20 words in the corpus by TFIDF mean, if you use the `sum` count method and `chapter` as the bag? Note that, because of the greater number of bags, this will take longer to compute.

In [7]:
# Score every term per chapter, using each term's share of the chapter's tokens.
TFIDF = compute_tfidf(
    CORPUS,
    bag='CHAPS',
    tf_method='sum',
    idf_method='standard',
)

# Average each term's TF-IDF over all chapters.
mean_tfidf = TFIDF.mean(axis=0)

# Rank terms from highest to lowest mean score and keep the first 20.
top_20_words = mean_tfidf.sort_values(ascending=False).iloc[:20]
top_20_words

TF method: sum
IDF method: standard


term_str
her             0.004280
she             0.004266
cosmopolitan    0.003664
pierre          0.003448
you             0.002706
i               0.002623
hypothetical    0.002579
mr              0.002132
boon            0.001957
whale           0.001791
mrs             0.001780
charming        0.001767
thou            0.001758
and             0.001666
my              0.001638
lady            0.001638
me              0.001617
disciple        0.001602
charitable      0.001556
your            0.001508
dtype: float64

### 4. Characterize the general difference between the words in Question 3 and those in Question 2 in terms of part-of-speech.

The words in Question 2 are all proper nouns (mostly character names), whereas those in Question 3 are mostly other parts of speech, chiefly pronouns (e.g., her, she, you, i), together with a few titles, common nouns, and adjectives.

### 5. Compute mean `TFIDF` for vocabularies conditioned on individual author, using *chapter* as the bag and `max` as the `TF` count method. Among the two authors, whose work has the most significant adjective?

In [8]:
# Chapter-level TF-IDF with max-scaled term frequency, used for the adjective ranking.
TFIDF = compute_tfidf(
    CORPUS,
    bag='CHAPS',
    tf_method='max',
    idf_method='standard',
)

TF method: max
IDF method: standard


In [13]:
# Attach each term's mean TF-IDF (averaged over every row of the current TFIDF
# matrix) to the vocabulary table. This depends on which compute_tfidf cell ran
# last, because TFIDF is reassigned by several cells.
# NOTE(review): the mean is taken over the whole corpus, not separately per
# author -- confirm this satisfies "conditioned on individual author".
VOCAB['mean_tfidf'] = TFIDF.mean()
# Keep only terms whose max_pos_group is JJ (presumably the term's most
# frequent part-of-speech group -- verify against how VOCAB was built).
VOCAB_JJ = VOCAB[VOCAB['max_pos_group'] == "JJ"]
# Highest-scoring adjective.
# NOTE(review): this expression yields a one-element Series, while the saved
# output shows a full VOCAB row -- re-run the cell to refresh the output.
VOCAB_JJ.mean_tfidf.sort_values(ascending=False).head(1)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,n_pos_group,cat_pos_group,n_pos,cat_pos,stop,stem_porter,stem_snowball,stem_lancaster,mean_tfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ugh,127,3,6.5e-05,13.898738,JJ,JJ,5,"{'VB', 'JJ', 'RB', 'NN', 'IN'}",8,"{'VB', 'JJ', 'NNP', 'VBP', 'RB', 'NNS', 'NN', ...",0,ugh,ugh,ugh,0.012657


"ugh" is the most significant adjective. Who wrote "ugh"?

In [10]:
# Select every token whose term is "ugh" so the book_id index shows where it occurs.
filtered_df = CORPUS.loc[CORPUS['term_str'].eq("ugh")]
filtered_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
21816,60,9,0,0,"('""Ugh,', 'JJ')",JJ,"""Ugh,",ugh,JJ
21816,60,9,0,1,"('ugh', 'IN')",IN,ugh,ugh,IN
21816,60,17,3,16,"('ugh,', 'JJ')",JJ,"ugh,",ugh,JJ
21816,60,17,3,17,"('ugh,', 'NN')",NN,"ugh,",ugh,NN
21816,60,17,3,18,"('ugh!""', 'NN')",NN,"ugh!""",ugh,NN
21816,...,...,...,...,...,...,...,...,...
21816,66,68,2,6,"('ugh,', 'JJ')",JJ,"ugh,",ugh,JJ
21816,66,68,2,7,"('ugh!', 'NN')",NN,ugh!,ugh,NN
21816,66,68,3,5,"('ugh,', 'JJ')",JJ,"ugh,",ugh,JJ
21816,66,68,3,6,"('ugh,', 'JJ')",JJ,"ugh,",ugh,JJ


HERMAN MELVILLE