## Final Project: Tokenizing and Annotating Gothic Texts
    Course:   DS 5001
    Author:   Elizabeth Burrell
    Date:     April 2024

In [None]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import plotly_express as px

In [2]:
source_files = r"C:\Users\Student\Desktop\DS5001\data\gothic"

In [3]:
OHCO = ['book_id','chap_num','para_num', 'sent_num', 'token_num']

In [4]:
import pandas as pd
import numpy as np
import nltk

class TextParser():
    """
    A class to parse a single Gutenberg-type text file into a TOKENS dataframe with
    an OHCO index. Also has methods to extract a VOCAB table, although vocabulary
    tables ought to be generated at the corpus level.
    
    Sample parameter values:

    ohco_pats = [
        ('chapter', r"^\s*(chapter|letter)\s+(\d+)", 'm')    
    ]

    clip_pats = [
        r'START OF GUTENBERG PROJECT', 
        r'^\s*THE END'
    ]

    """

    # TODO: Make these private
    src_imported:bool = False       
    src_clipped:bool = False
    src_col_suffix:str ='_str'

    join_pat:str = r'\n'
    strip_hyphens:bool = False
    strip_whitespace:bool = False
    verbose:bool = False

    stanford_pos_model:str = "english-bidirectional-distsim.tagger"
    stanford_pos_model_path = None
        
    # We assume all OHCOs have sentences and tokens
    # and that there are terminal in the list.
    ohco_pats:[] = [
        ('para', r"\n\n", 'd'),
        ('sent', r"[.?!;:]+", 'd'),
        ('token', r"[\s',-]+", 'd')
    ]
        
    _ohco_type:{} = {
        'd': '_num',
        'm': '_id'
    }
        
    def __init__(self, src_file:str, ohco_pats:[], clip_pats:[], use_nltk=True):
        """Initialize the object and extract config info. If using NLTK, download resources."""
        self.src_file = src_file            
        self.clip_pats = clip_pats # TODO: Validate
        self.ohco_pats = ohco_pats + self.ohco_pats # TODO: Validate
        self.OHCO = [item[0]+self._ohco_type[item[2]] for item in self.ohco_pats]
        self.ohco_names = [item[0] for item in self.ohco_pats]
        self.use_nltk = use_nltk

        if self.use_nltk:
            # Override the last two OHCO items
            self.ohco_pats[-2] = ('sent', None, 'nltk')
            self.ohco_pats[-1] = ('token', None, 'nltk')
            # Make sure you have the NLTK stuff
            for package in [
                'tokenizers/punkt', 
                'taggers/averaged_perceptron_tagger', 
                'corpora/stopwords', 
                'help/tagsets'
            ]:
                if self.verbose: print("Checking", package)
                try:
                    nltk.data.find(package)
                except IndexError:
                    nltk.download(package)
            
    def import_source(self, strip:bool=True, char_encoding:str="utf-8-sig"):
        """Convert a raw text file into a dataframe of lines."""
        if self.verbose: print("Importing ", self.src_file)
        text_lines = open(self.src_file,'r', encoding=char_encoding).readlines()
        self.LINES = pd.DataFrame({'line_str':text_lines})
        self.LINES.index.name = 'line_id'
        if strip:
            self.LINES.line_str = self.LINES.line_str.str.strip()
        self.src_imported = True
        if self.verbose: print("Clipping text")
        self._clip_lines()
        return self        

    def _clip_lines(self):
        """Remove cruft lines from beginning and/or end of file."""
        start_pat = self.clip_pats[0]
        end_pat = self.clip_pats[1]
        start = self.LINES.line_str.str.contains(start_pat, regex=True)
        end = self.LINES.line_str.str.contains(end_pat, regex=True)
        try:
            start_line_num = self.LINES.loc[start].index[0]
        except IndexError:
            raise ValueError("Clip start pattern not found.")            
        try:
            end_line_num = self.LINES.loc[end].index[0]
        except IndexError:
            raise ValueError("Clip end pattern not found.")
        self.LINES = self.LINES.loc[start_line_num + 1 : end_line_num - 2]
        self.src_clipped == True
        
    def parse_tokens(self):
        """Convert lines to tokens based on OHCO."""
        if self.src_imported:

            # Start with the LINES df
            self.TOKENS = self.LINES.copy()

            # Walk through each level of the OHCO to build out TOKENS
            for i, level in enumerate(self.OHCO):

                if self.verbose: print(f"Parsing OHCO level {i} {level}", end=' ')

                # Define level-specific variables
                parse_type = self.ohco_pats[i][2]
                div_name = self.ohco_pats[i][0]
                div_pat = self.ohco_pats[i][1]
                if i == 0:
                    src_div_name = 'line'
                else:
                    src_div_name = self.ohco_names[i - 1] 
                src_col = f"{src_div_name}{self.src_col_suffix}"
                dst_col = f"{div_name}{self.src_col_suffix}"

                # By Milestone
                if parse_type == 'm':
                    if self.verbose: print(f"by milestone {div_pat}")
                    div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True) # TODO: Parametize case
                    self.TOKENS.loc[div_lines, div_name] = [i+1 for i in range(self.TOKENS.loc[div_lines].shape[0])]
                    self.TOKENS[div_name] = self.TOKENS[div_name].ffill()
                    self.TOKENS = self.TOKENS.loc[~self.TOKENS[div_name].isna()] 
                    self.TOKENS = self.TOKENS.loc[~div_lines] 
                    self.TOKENS[div_name] = self.TOKENS[div_name].astype('int')
                    self.TOKENS = self.TOKENS.groupby(self.ohco_names[:i+1], group_keys=True)[src_col]\
                        .apply(lambda x: '\n'.join(x)).to_frame(dst_col)

                    # print(self.TOKENS[dst_col].str.count(r'\n\n'))
                    print(src_col, dst_col)
                    print(self.TOKENS.columns)


                # By Delimitter
                elif parse_type == 'd':
                    if self.verbose: print(f"by delimitter {div_pat}")
                    self.TOKENS = self.TOKENS[src_col].str.split(div_pat, expand=True).stack().to_frame(dst_col)
                
                # By NLTK 
                elif parse_type == 'nltk':
                    if self.verbose: print(f"by NLTK model")

                    if level == 'sent_num':
                        self.TOKENS = self.TOKENS.para_str\
                                .apply(lambda x: pd.Series(nltk.sent_tokenize(x), dtype=str))\
                                .stack()\
                                .to_frame('sent_str')
                    
                    if level == 'token_num':
                        if self.strip_hyphens == True:
                            self.TOKENS.sent_str = self.TOKENS.sent_str.str.replace(r"-", ' ')
                        if self.strip_whitespace == True:
                            self.TOKENS = self.TOKENS.sent_str\
                                    .apply(lambda x: pd.Series(
                                            nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)),
                                            dtype='object'
                                        )
                                    )
                        else:
                            self.TOKENS = self.TOKENS.sent_str\
                                    .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))
                        self.TOKENS = self.TOKENS.stack().to_frame('pos_tuple')
                        self.TOKENS['pos'] = self.TOKENS.pos_tuple.apply(lambda x: x[1])
                        self.TOKENS['token_str'] = self.TOKENS.pos_tuple.apply(lambda x: x[0])
                        self.TOKENS['term_str'] = self.TOKENS.token_str.str.lower()   
        
                else:
                    raise ValueError(f"Invalid parse option: {parse_type}.")

                # After creating the current OHCO level
                self.TOKENS.index.names = self.OHCO[:i+1]

            # After iterating through the OHCO

            if not self.use_nltk:
                self.TOKENS['term_str'] = self.TOKENS.token_str.str.replace(r'[\W_]+', '', regex=True).str.lower()  
            else:
                punc_pos = ['$', "''", '(', ')', ',', '--', '.', ':', '``']
                self.TOKENS['term_str'] = self.TOKENS[~self.TOKENS.pos.isin(punc_pos)].token_str\
                    .str.replace(r'[\W_]+', '', regex=True).str.lower()  
            
        else:
            raise RuntimeError("Source not imported. Please run .import_source()")

    def extract_vocab(self):
        """This should also be done at the corpus level."""
        self.VOCAB = self.TOKENS.term_str.value_counts().to_frame('n')
        self.VOCAB.index.name = 'term_str'
        self.VOCAB['n_chars'] = self.VOCAB.index.str.len()
        self.VOCAB['p'] = self.VOCAB['n'] / self.VOCAB['n'].sum()
        self.VOCAB['s'] = 1 / self.VOCAB['p']
        self.VOCAB['i'] = np.log2(self.VOCAB['s']) # Same as negative log probability (i.e. log likelihood)
        self.VOCAB['h'] = self.VOCAB['p'] * self.VOCAB['i']
        self.H = self.VOCAB['h'].sum()
        return self

    def annotate_vocab(self):
        """This should be done at the corpus level."""
        # Stopwords
        # Max POS
        # POS variability
        # Porter Stems
        pass

    def extract_pos_data(self):
        # TODO: Create dataframe for POS info, including Penn Treebank info
        pass

    def extract_named_entities(self):
        # TODO: Create dataframe of named entities
        pass

    def gather_tokens(self, level=0, grouping_col='term_str', cat_sep=' '):
        """Gather tokens into strings for arbitrary OHCO level."""
        max_level = len(self.OHCO) - 2 # Can't gather tokens at the token level :)
        if level > max_level:
            raise ValueError(f"Level {level} too high. Try between 0 and {max_level}")
        else:
            level_name = self.OHCO[level].split('_')[0]
            idx = self.TOKENS.index.names[:level+1]
            return self.TOKENS.groupby(idx)[grouping_col].apply(lambda x: x.str.cat(sep=cat_sep))\
                .to_frame(f'{level_name}_str')


if __name__ == '__main__':
    pass

In [5]:
clip_pats = [
    r"\*\*\*\s*START OF",
    r"\*\*\*\s*END OF"
]

# All are 'chap'and 'm'
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
ohco_pat_list = [
    (768,   rf"(?i)^\s*CHAPTER\s+{roman}\.?\s*$"),
    (3070,   rf"^Chapter\s+\d+$"),
    (1661,  rf"^\s*{roman}\.\s*$"),
    (345,   rf"(?i)^\s*CHAPTER\s+{roman}\.?\s*$"),
    (4078,   rf"(?i)^\s*CHAPTER\s+{roman}\.?\s*$")
]

## Register

In [6]:
source_file_list = sorted(glob(f"{source_files}/*.*"))

In [7]:
source_file_list

['C:\\Users\\Student\\Desktop\\DS5001\\data\\gothic\\BRONTE_EMILY_WURTHERING_HEIGHTS-pg768.txt',
 'C:\\Users\\Student\\Desktop\\DS5001\\data\\gothic\\DOYLE_ARTHURCONAN_THE_ADVENTURES_OF_SHERLOCK_HOLMES-pg1661.txt',
 'C:\\Users\\Student\\Desktop\\DS5001\\data\\gothic\\DOYLE_ARTHURCONAN_THE_HOUND_OF_BASKERVILLES-pg3070.txt',
 'C:\\Users\\Student\\Desktop\\DS5001\\data\\gothic\\STOKER_BRAM_DRACULA-pg345.txt',
 'C:\\Users\\Student\\Desktop\\DS5001\\data\\gothic\\WILDE_OSCAR_THE_PICTURE_OF_DORIAN_GRAY-pg4078.txt']

In [8]:
book_data = []
for source_file_path in source_file_list:
    book_id = int(source_file_path.split('-')[-1].split('.')[0].replace('pg',''))
    book_title = source_file_path.split('\\')[-1].split('-')[0].replace('_', ' ')
    book_data.append((book_id, source_file_path, book_title))

In [9]:
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','raw_title'])\
    .set_index('book_id').sort_index()

In [10]:
LIB

Unnamed: 0_level_0,source_file_path,raw_title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
345,C:\Users\Student\Desktop\DS5001\data\gothic\ST...,STOKER BRAM DRACULA
768,C:\Users\Student\Desktop\DS5001\data\gothic\BR...,BRONTE EMILY WURTHERING HEIGHTS
1661,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,DOYLE ARTHURCONAN THE ADVENTURES OF SHERLOCK H...
3070,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,DOYLE ARTHURCONAN THE HOUND OF BASKERVILLES
4078,C:\Users\Student\Desktop\DS5001\data\gothic\WI...,WILDE OSCAR THE PICTURE OF DORIAN GRAY


In [11]:
try:
    LIB['author'] = LIB.raw_title.apply(lambda x: ', '.join(x.split()[:2]))
    LIB['title'] = LIB.raw_title.apply(lambda x: ' '.join(x.split()[2:]))
    LIB = LIB.drop('raw_title', axis=1)
except AttributeError:
    pass

In [12]:
LIB

Unnamed: 0_level_0,source_file_path,author,title
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
345,C:\Users\Student\Desktop\DS5001\data\gothic\ST...,"STOKER, BRAM",DRACULA
768,C:\Users\Student\Desktop\DS5001\data\gothic\BR...,"BRONTE, EMILY",WURTHERING HEIGHTS
1661,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,"DOYLE, ARTHURCONAN",THE ADVENTURES OF SHERLOCK HOLMES
3070,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,"DOYLE, ARTHURCONAN",THE HOUND OF BASKERVILLES
4078,C:\Users\Student\Desktop\DS5001\data\gothic\WI...,"WILDE, OSCAR",THE PICTURE OF DORIAN GRAY


In [13]:
LIB['chap_regex'] = LIB.index.map(pd.Series({x[0]:x[1] for x in ohco_pat_list}))

In [14]:
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
345,C:\Users\Student\Desktop\DS5001\data\gothic\ST...,"STOKER, BRAM",DRACULA,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$
768,C:\Users\Student\Desktop\DS5001\data\gothic\BR...,"BRONTE, EMILY",WURTHERING HEIGHTS,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$
1661,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,"DOYLE, ARTHURCONAN",THE ADVENTURES OF SHERLOCK HOLMES,^\s*[IVXLCM]+\.\s*$
3070,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,"DOYLE, ARTHURCONAN",THE HOUND OF BASKERVILLES,^Chapter\s+\d+$
4078,C:\Users\Student\Desktop\DS5001\data\gothic\WI...,"WILDE, OSCAR",THE PICTURE OF DORIAN GRAY,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$


## Tokenize Corpus

In [15]:
def tokenize_collection(LIB):

    clip_pats = [
        r"\*\*\*\s*START OF",
        r"\*\*\*\s*END OF"
    ]

    books = []
    for book_id in LIB.index:

        # Announce
        print("Tokenizing", book_id, LIB.loc[book_id].title)

        # Define vars
        chap_regex = LIB.loc[book_id].chap_regex
        ohco_pats = [('chap', chap_regex, 'm')]
        src_file_path = LIB.loc[book_id].source_file_path

        # Create object
        text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens();
       
        # Name things
        text.TOKENS['book_id'] = book_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)
        
    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    # Clean up
    del(books)
    del(text)
        
    print("Done")
        
    return CORPUS

In [16]:
LIB.loc[345].chap_regex

'(?i)^\\s*CHAPTER\\s+[IVXLCM]+\\.?\\s*$'

In [17]:
CORPUS = tokenize_collection(LIB)

Tokenizing 345 DRACULA
Importing  C:\Users\Student\Desktop\DS5001\data\gothic\STOKER_BRAM_DRACULA-pg345.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone (?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 768 WURTHERING HEIGHTS
Importing  C:\Users\Student\Desktop\DS5001\data\gothic\BRONTE_EMILY_WURTHERING_HEIGHTS-pg768.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone (?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$
line_str chap_str
Index(['chap_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 1661 THE ADVENTURES OF SHERLOCK HOLMES
Importing  C:\Users\Student\Desktop\DS5001\data\gothic\DOYLE_ARTHURCONAN_THE_ADVENTURES_OF_SHERLOCK_HOLMES-pg1661.txt
Clipping text
Parsing OHCO 

In [18]:
LIB['book_len'] = CORPUS.groupby('book_id').term_str.count()

In [19]:
LIB.sort_values('book_len')

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3070,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,"DOYLE, ARTHURCONAN",THE HOUND OF BASKERVILLES,^Chapter\s+\d+$,59561
4078,C:\Users\Student\Desktop\DS5001\data\gothic\WI...,"WILDE, OSCAR",THE PICTURE OF DORIAN GRAY,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$,79446
1661,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,"DOYLE, ARTHURCONAN",THE ADVENTURES OF SHERLOCK HOLMES,^\s*[IVXLCM]+\.\s*$,105136
768,C:\Users\Student\Desktop\DS5001\data\gothic\BR...,"BRONTE, EMILY",WURTHERING HEIGHTS,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$,116441
345,C:\Users\Student\Desktop\DS5001\data\gothic\ST...,"STOKER, BRAM",DRACULA,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$,162859


In [20]:
LIB['n_chaps'] = CORPUS.reset_index()[['book_id','chap_id']]\
    .drop_duplicates()\
    .groupby('book_id').chap_id.count()

In [21]:
LIB

Unnamed: 0_level_0,source_file_path,author,title,chap_regex,book_len,n_chaps
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
345,C:\Users\Student\Desktop\DS5001\data\gothic\ST...,"STOKER, BRAM",DRACULA,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$,162859,27
768,C:\Users\Student\Desktop\DS5001\data\gothic\BR...,"BRONTE, EMILY",WURTHERING HEIGHTS,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$,116441,34
1661,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,"DOYLE, ARTHURCONAN",THE ADVENTURES OF SHERLOCK HOLMES,^\s*[IVXLCM]+\.\s*$,105136,4
3070,C:\Users\Student\Desktop\DS5001\data\gothic\DO...,"DOYLE, ARTHURCONAN",THE HOUND OF BASKERVILLES,^Chapter\s+\d+$,59561,15
4078,C:\Users\Student\Desktop\DS5001\data\gothic\WI...,"WILDE, OSCAR",THE PICTURE OF DORIAN GRAY,(?i)^\s*CHAPTER\s+[IVXLCM]+\.?\s*$,79446,21


In [22]:
## handling anomalies
LIB[['book_len', 'n_chaps']].agg(('mean','sum'))

Unnamed: 0,book_len,n_chaps
mean,104688.6,20.2
sum,523443.0,101.0


In [23]:
CORPUS[CORPUS.term_str == ''].token_str.value_counts()

*        707
&         25
£         25
"         19
”         15
...?”      2
——         2
...”       1
”;         1
?"         1
!”         1
?”         1
!"         1
Name: token_str, dtype: int64

In [24]:
CORPUS = CORPUS[CORPUS.term_str != '']

In [25]:
CORPUS['pos_group'] = CORPUS.pos.str[:2]

In [26]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
345,1,0,0,0,"(JONATHAN, NNP)",NNP,JONATHAN,jonathan,NN
345,1,0,0,1,"(HARKER’S, NNP)",NNP,HARKER’S,harkers,NN
345,1,0,0,2,"(JOURNAL, NNP)",NNP,JOURNAL,journal,NN
345,1,1,0,0,"((_Kept, NN)",NN,(_Kept,kept,NN
345,1,1,0,1,"(in, IN)",IN,in,in,IN
...,...,...,...,...,...,...,...,...,...
4078,40,19,3,12,"(who, WP)",WP,who,who,WP
4078,40,19,3,13,"(it, PRP)",PRP,it,it,PR
4078,40,19,3,14,"(was., VBD)",VBD,was.,was,VB
4078,40,20,0,0,"(THE, DT)",DT,THE,the,DT


## Extracting Vocab

In [27]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)

In [28]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16,1,0.000031,14.995464
10,7,2,0.000013,16.188109
100,3,3,0.000006,17.410501
1000,8,4,0.000015,15.995464
1018,1,4,0.000002,18.995464
...,...,...,...,...
à,4,1,0.000008,16.995464
æt,1,2,0.000002,18.995464
ætat,1,4,0.000002,18.995464
édition,1,7,0.000002,18.995464


## Annotate Vocab

In [29]:
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [30]:
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

In [31]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,16,1,0.000031,14.995464,JJ,JJ
10,7,2,0.000013,16.188109,JJ,JJ
100,3,3,0.000006,17.410501,CD,CD
1000,8,4,0.000015,15.995464,CD,CD
1018,1,4,0.000002,18.995464,CD,CD
...,...,...,...,...,...,...
à,4,1,0.000008,16.995464,NN,NN
æt,1,2,0.000002,18.995464,NN,NN
ætat,1,4,0.000002,18.995464,NNP,NN
édition,1,7,0.000002,18.995464,NN,NN


In [32]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [33]:
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [34]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

In [35]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stop,stem_porter
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16,1,0.000031,14.995464,JJ,JJ,0,1
10,7,2,0.000013,16.188109,JJ,JJ,0,10
100,3,3,0.000006,17.410501,CD,CD,0,100
1000,8,4,0.000015,15.995464,CD,CD,0,1000
1018,1,4,0.000002,18.995464,CD,CD,0,1018
...,...,...,...,...,...,...,...,...
à,4,1,0.000008,16.995464,NN,NN,0,à
æt,1,2,0.000002,18.995464,NN,NN,0,æt
ætat,1,4,0.000002,18.995464,NNP,NN,0,ætat
édition,1,7,0.000002,18.995464,NN,NN,0,édition


## Adding DFIDF

In [36]:
def generate_BOW(CORPUS, bag):
    
    BOW = CORPUS.groupby(bags[bag]+['term_str']).term_str.count().to_frame('n')

    return BOW

In [40]:
OHCO = ['book_id','chap_id','para_num', 'sent_num', 'token_num']
bags = dict(
    SENTS = OHCO[:4],
    PARAS = OHCO[:3],
    CHAPS = OHCO[:2],
    BOOKS = OHCO[:1]
)

In [41]:
chaps = generate_BOW(CORPUS, 'CHAPS')

In [42]:
chaps

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
345,1,13000,1
345,1,1st,1
345,1,3,1
345,1,4,1
345,1,5,1
...,...,...,...
4078,40,yielded,1
4078,40,you,1
4078,40,young,2
4078,40,your,1


In [43]:
def calculate_dfidf(BOW):
   
    DTCM = BOW.n.unstack(fill_value=0)
    
    DF = DTCM.astype('bool').sum() 
    
    N = DTCM.shape[0]
    
    IDF = np.log2(N / DF)
    
    DFIDF = DF * IDF
    dfidf = pd.DataFrame(DFIDF).T
    
    return dfidf

In [55]:
df = calculate_dfidf(chaps)

In [56]:
df = df.transpose()
df

Unnamed: 0_level_0,0
term_str,Unnamed: 1_level_1
1,31.394578
10,24.439494
100,6.658211
1000,6.658211
1018,6.658211
...,...
à,18.632846
æt,6.658211
ætat,6.658211
édition,6.658211


In [57]:
VOCAB = VOCAB.join(df)

ValueError: columns overlap but no suffix specified: Index([0], dtype='object')

In [58]:
VOCAB.rename(columns={0: 'dfidf'}, inplace=True)

In [59]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stop,stem_porter,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,16,1,0.000031,14.995464,JJ,JJ,0,1,31.394578
10,7,2,0.000013,16.188109,JJ,JJ,0,10,24.439494
100,3,3,0.000006,17.410501,CD,CD,0,100,6.658211
1000,8,4,0.000015,15.995464,CD,CD,0,1000,6.658211
1018,1,4,0.000002,18.995464,CD,CD,0,1018,6.658211
...,...,...,...,...,...,...,...,...,...
à,4,1,0.000008,16.995464,NN,NN,0,à,18.632846
æt,1,2,0.000002,18.995464,NN,NN,0,æt,6.658211
ætat,1,4,0.000002,18.995464,NNP,NN,0,ætat,6.658211
édition,1,7,0.000002,18.995464,NN,NN,0,édition,6.658211


## 20 Most Significant Words by DFIDF

In [60]:
VOCAB.sort_values('dfidf', ascending=False).head(20)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stop,stem_porter,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
wonderful,95,9,0.000182,12.425608,JJ,JJ,0,wonder,53.60405
reach,56,5,0.000107,13.188109,VB,VB,0,reach,53.60405
promise,81,7,0.000155,12.655614,NN,NN,0,promis,53.60405
serious,66,7,0.000126,12.951069,JJ,JJ,0,seriou,53.60405
brain,93,5,0.000178,12.456305,NN,NN,0,brain,53.60405
street,150,6,0.000287,11.766645,NNP,NN,0,street,53.60405
telling,58,7,0.000111,13.137483,VBG,VB,0,tell,53.60405
creature,66,8,0.000126,12.951069,NN,NN,0,creatur,53.60405
seized,54,6,0.000103,13.240576,VBD,VB,0,seiz,53.60405
son,123,3,0.000235,12.052949,NN,NN,0,son,53.60405


## exporting corpus, vocab and lib

In [61]:
import configparser
config = configparser.ConfigParser()

In [62]:
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

In [63]:
data_prefix = 'gothic-texts'

In [64]:
out_path = f'{output_dir}/{data_prefix}'

In [67]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
345,1,0,0,0,"(JONATHAN, NNP)",NNP,JONATHAN,jonathan,NN
345,1,0,0,1,"(HARKER’S, NNP)",NNP,HARKER’S,harkers,NN
345,1,0,0,2,"(JOURNAL, NNP)",NNP,JOURNAL,journal,NN
345,1,1,0,0,"((_Kept, NN)",NN,(_Kept,kept,NN
345,1,1,0,1,"(in, IN)",IN,in,in,IN
...,...,...,...,...,...,...,...,...,...
4078,40,19,3,12,"(who, WP)",WP,who,who,WP
4078,40,19,3,13,"(it, PRP)",PRP,it,it,PR
4078,40,19,3,14,"(was., VBD)",VBD,was.,was,VB
4078,40,20,0,0,"(THE, DT)",DT,THE,the,DT


In [65]:
LIB.to_csv(f'{out_path}-LIB.csv')
VOCAB.to_csv(f'{out_path}-VOCAB.csv')
CORPUS.to_csv(f'{out_path}-CORPUS.csv')