# Preliminaries

In [364]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
import re
from collections import defaultdict

# Data Model

## Generate Tables for Each Book

Unfortunately, due to the imparsibility of Mr. Dunphy's book, I'm omitting it from the analysis. Fantastic read, however.

In [365]:
from glob import glob
source_file_list = sorted(glob("Books/Text Files/**"))
LIB = pd.read_csv('Busby Example Notebooks/CSV Dataframes/LIB.csv').set_index("book_id")
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

LIB.iloc[3]["chap_regex"] = "nan"
LIB.iloc[4]["chap_regex"] = '^(1[0-7]|[1-9])\.|(Prologue|Epilogue)$' # Charlton's first
LIB.iloc[5]["chap_regex"] = '^(?:[1-9]|1[0-9]|2[0-9]|PROLOGUE|EPILOGUE)$' # Charlton's second
LIB.iloc[6]["chap_regex"] = '^\d{1,2}\.\d{1,2}\.\d{2}$' # Ferguson's first
LIB.iloc[8]["chap_regex"] = '^(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twenty[- ]one|twenty[- ]two|twenty[- ]three|twenty[- ]four|twenty[- ]five)' # Ferguson's third
LIB.iloc[10]["chap_regex"] = '^(1[0-7]|[1-9])\\.|(INTRODUCTION)$' # Robbo
LIB.iloc[11]["chap_regex"] = '(?<!\\d)(?:1[0-8]|[1-9])(?!\\d)' # Scholesy

#### CHAPS and PARAS

This incredibly long-winded piece of code effectively works in 6 parts:
1. Turn the .txt file into a Dataframe.
2. Filter that Dataframe to get rid of blank rows, \n's, and excess whitespace.
3. Use the predefined LIB regexes to find where chapters start (and end).
4. Use these markers to find the contents of each chapter.
5. Split the chapters into paragraphs by splitting via new lines and find the contents of each paragraph.
6. Store these paragraph Dataframes into a dictionary for further use.

In [366]:
dfs = dict()

iteration_count = 0
# Eamon Dunphy's booked is fucked.
import warnings
for file in source_file_list:
    with open(file) as f:
        lines = f.readlines()

    # These lines below simply break up the text into a Dataframe with an unnamed index equivalent to the line number.
    # The other column is the text contained in that line.
    # It's just a bunch of formatting here.
    df = pd.DataFrame({'text': lines})
    df = df.replace('\n', '', regex = True) # just formatting it a bit bud
    df['text'] = df['text'].str.strip()
    df = df[~df.apply(lambda row: row.str.contains('^\s*$', regex=True)).all(axis=1)]
    df = df.reset_index()
    df = df.iloc[: , 1:]
    
    # Get the regex I manually compiled and use it to split up the text into chapters.
    regex = LIB[LIB["source_file_path"] == file]["chap_regex"].get(key = iteration_count)
    
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # Get the rows matching the start of each chapter.
        chapter_rows = [index for index, text in df.iloc[:, 0].iteritems() if re.match(regex, text)]
                
    if file != "Books/Text Files/Busby2.txt": # Skip Dunphy's book. Fantastic book but basically illegible .txt.
        i = 1
        row_dicts = dict()
        for row_num in chapter_rows: # row_dicts is a dictionary that stores chapter number to row.
            row_dicts[i] = row_num
            i += 1
            
        chapter_contents = {}
        current_chapter = None
        current_chapter_start = None
        for x, row in df.iterrows():
            # Check if this row starts a new chapter
            if x - 1 in row_dicts.values():
                # If this row starts a new chapter, update the current chapter and its start line
                current_chapter = list(row_dicts.keys())[list(row_dicts.values()).index(x - 1)]
                current_chapter_start = x - 1
                chapter_contents[current_chapter] = ""
            # If we're in the middle of a chapter, add the row contents to the current chapter's contents
            if current_chapter is not None:
                chapter_contents[current_chapter] += row['text'] + " "
             
        # Turning the dict into a DataFrame.
        CHAPS = pd.Series(chapter_contents).to_frame()
        CHAPS = CHAPS.reset_index().rename(columns = {"index": "chap_num"}).set_index("chap_num")
        CHAPS.rename(columns = {0: "chap_str"}, inplace = 1)
        
        para_pat = r'\n\n+'

        # Split the chapters based on whitespace into its constitituent paragraphs.
        PARAS = CHAPS['chap_str'].str.split(para_pat, expand=True).stack()\
                      .to_frame('para_str').sort_index()
        PARAS.index.names = OHCO[:2]
        PARAS['para_str'] = PARAS['para_str'].str.strip() # Update the index names and fix the formatting a bit.
        
        # Do the same thing as above.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            chapter_paragraphs = {}
            current_chapter = None
            current_chapter_start = None
            paragraph_number = 0
            df_paragraphs = pd.DataFrame(columns=['chapter', 'paragraph', 'text'])
            i = 0
            for i, row in df.iterrows():
                # check if this row starts a new chapter
                if i - 1 in row_dicts.values():
                    # if this row starts a new chapter, update the current chapter and its start line
                    current_chapter = list(row_dicts.keys())[list(row_dicts.values()).index(i - 1)]
                    current_chapter_start = i - 1
                    chapter_paragraphs[current_chapter] = ""
                    paragraph_number = 0
                # if we're in the middle of a chapter, add the row contents to the current chapter's paragraphs
                if current_chapter is not None:
                    paragraph_number += 1
                    paragraph_text = row['text']
                    chapter_paragraphs[current_chapter] += paragraph_text + "\n"
                    df_paragraphs = df_paragraphs.append({'chapter': current_chapter, 'paragraph': paragraph_number, 
                                                  'text': paragraph_text}, ignore_index = True)
                    
        PARAS = df_paragraphs
        PARAS.rename(columns = {"chapter": "chap_num", "paragraph": "para_num", "text": "para_str"}, inplace = 1)
        PARAS.set_index(["chap_num", "para_num"], inplace = True)
        
        SENTS = PARAS.para_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x))).stack().to_frame('sent_str')
        SENTS.index.names = OHCO[:3]
        
        dfs[file] = SENTS
        
    iteration_count += 1

#### Examples

In [372]:
LIB["source_file_path"].to_list()

['Books/Text Files/Atkinson1.txt',
 'Books/Text Files/Atkinson2.txt',
 'Books/Text Files/Busby1.txt',
 'Books/Text Files/Busby2.txt',
 'Books/Text Files/Charlton1.txt',
 'Books/Text Files/Charlton2.txt',
 'Books/Text Files/Ferguson1.txt',
 'Books/Text Files/Ferguson2.txt',
 'Books/Text Files/Ferguson3.txt',
 'Books/Text Files/Keane.txt',
 'Books/Text Files/Robson.txt',
 'Books/Text Files/Scholes.txt']

In [373]:
dfs["Books/Text Files/Ferguson1.txt"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,Press conferences are usually held at Carringt...
1,1,1,If it is a Champions League week his briefings...
1,2,0,Today is the first time we have seen him this ...
1,2,1,United play Debrecen of Hungary in a Champions...
1,2,2,Everyone is happy to be back.
...,...,...,...
97,36,3,Of course he can be infuriating but he would s...
97,37,0,An educated guess is that Ferguson has two mor...
97,37,1,But it would be no surprise if he were still i...
97,37,2,Or maybe Neville’s correct and Ferguson will s...


In [379]:
dfs["Books/Text Files/Busby1.txt"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,"In 1993, the 84th year of Sir Matt Busby’s lif..."
1,1,1,At the behest of a television company he was t...
1,2,0,"United, now managed by Alex Ferguson with Busb..."
1,2,1,"With Bobby Charlton, Pat Crerand, Alex Stepney..."
1,2,2,Collins fondly recalled Busby ‘puffing his pip...
...,...,...,...
15,97,0,The Busby Babes
15,98,0,"Nearly 60 years after Munich, with José Mourin..."
15,98,1,"Paul Scholes, one of United’s most revered pla..."
15,98,2,But had clearly got to know the old man and un...


## Combining TOKENS Tables

In [380]:
paths = LIB["source_file_path"].to_list()
for i in range(12): # This adds the book_id to the outer index.
    if i == 3:
        continue
    df = dfs[paths[i]]
    
    new_levels = pd.Index([i] * len(df.index), name='book_id')
    new_index = pd.MultiIndex.from_arrays([new_levels] + [df.index.get_level_values(level) 
                                                          for level in range(df.index.nlevels)], names=['book_id'] + df.index.names)
    df.index = new_index

In [386]:
# Stack these Dataframes on top of one another to get a CORPUS.
combined = pd.DataFrame([]) # Combined sentences.
for path in paths:
    if path == "Books/Text Files/Busby2.txt": # Sorry Eamon Dunphy.
        continue
    combined = pd.concat([combined, dfs[path]])

In [387]:
combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sent_str
book_id,chap_num,para_num,sent_num,Unnamed: 4_level_1
0,1,1,0,THEY WERE ALIENS.
0,1,1,1,NOT THE ALIENS WE WOULD READ ABOUT in the comi...
0,1,1,2,They looked like gods.
0,1,2,0,"It was December 1954; I was fifteen, a ground-..."
0,1,2,1,I can still see Ferenc Puskas walking through ...
...,...,...,...,...
11,18,3,0,But for all the benefits that professional foo...
11,18,3,1,We are happy where we live – on the edge of Sa...
11,18,3,2,I’d like to echo the words used by Sir Bobby C...
11,18,3,3,"He said, ‘I’ve been a lucky, lucky lad.’ And t..."


In [389]:
combined_tokens = pd.DataFrame([])
for path in paths:
    if path == "Books/Text Files/Busby2.txt":
        continue
        
    SENTS = dfs[path]
        
    # M04
    # Turn the sentences in the current DFs dictionary into tokens, something I can actually use.
    TOKENS = SENTS.sent_str.apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)))).stack().to_frame('pos_tuple')
    TOKENS['pos'] = TOKENS.pos_tuple.apply(lambda x: x[1])
    TOKENS['token_str'] = TOKENS.pos_tuple.apply(lambda x: x[0])
    TOKENS['term_str'] = TOKENS.token_str.str.lower()

    # https://www.techiedelight.com/remove-non-alphanumeric-characters-string-python/
    import string
    def remove_nonalphanumeric(text):
        return ''.join(char for char in text if char.isalnum())

    term_str = TOKENS["term_str"].apply(remove_nonalphanumeric) # Remove non-alphanumeric characters in the Dataframe.
    TOKENS["term_str"] = term_str
    
    combined_tokens = pd.concat([combined_tokens, TOKENS])
    
combined_tokens.index.set_names(['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num'], inplace = True)

In [390]:
combined_tokens

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,1,0,0,"(THEY, NNP)",NNP,THEY,they
0,1,1,0,1,"(WERE, NNP)",NNP,WERE,were
0,1,1,0,2,"(ALIENS., NNP)",NNP,ALIENS.,aliens
0,1,1,1,0,"(NOT, NNP)",NNP,NOT,not
0,1,1,1,1,"(THE, NNP)",NNP,THE,the
...,...,...,...,...,...,...,...,...
11,18,3,3,14,"(me,, NNS)",NNS,"me,",me
11,18,3,3,15,"(too., NNS)",NNS,too.,too
11,18,4,0,0,"(THE, DT)",DT,THE,the
11,18,4,0,1,"(HEART, NNP)",NNP,HEART,heart


In [310]:
combined_tokens.to_csv('TOKENS.csv')

## Combining VOCAB Tables

In [311]:
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

In [337]:
combined_vocab = pd.DataFrame([])
for i in range(12):
    if i == 3:
        continue
        
    TOKENS = combined_tokens.loc[i]
    VOCAB = TOKENS.term_str.value_counts().to_frame('n')
    VOCAB.index.name = 'term_str'
    VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
    VOCAB['i'] = -np.log2(VOCAB.p)
    VOCAB['n_chars'] = VOCAB.index.str.len()
    
    # https://www.guru99.com/pos-tagging-chunking-nltk.html
    VOCAB['max_pos'] = TOKENS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
    VOCAB = VOCAB[VOCAB["n_chars"] > 0] # get rid of just the blank whitespace
    
    sw = pd.DataFrame({'stop': 1}, index=nltk.corpus.stopwords.words('english')) # marking stopwords mate
    sw.index.name='term_str'
    if 'stop' not in VOCAB.columns:
        VOCAB = VOCAB.join(sw)
        VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')
        
    # m04
    from nltk.stem.porter import PorterStemmer
    stemmer1 = PorterStemmer()
    VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

    from nltk.stem.snowball import SnowballStemmer
    stemmer2 = SnowballStemmer("english")
    VOCAB['stem_snowball'] = VOCAB.apply(lambda x: stemmer2.stem(x.name), 1)

    from nltk.stem.lancaster import LancasterStemmer
    stemmer3 = LancasterStemmer()
    VOCAB['stem_lancaster'] = VOCAB.apply(lambda x: stemmer3.stem(x.name), 1)
    
    if 'term_rank' not in VOCAB.columns: # just adding a new column and ranking them
        VOCAB = VOCAB.sort_values('n', ascending=False).reset_index()
        VOCAB.index.name = 'term_rank' 
        VOCAB = VOCAB.reset_index()
        VOCAB = VOCAB.set_index('term_str')
        VOCAB['term_rank'] = VOCAB['term_rank'] + 1
    
    new_rank = VOCAB.n.value_counts()\
                .sort_index(ascending=False).reset_index().reset_index()\
                .rename(columns={'level_0':'term_rank2', 'index':'n', 'n':'nn'})\
                .set_index('n')
    VOCAB['term_rank2'] = VOCAB.n.map(new_rank.term_rank2) + 1
    
    VOCAB['zipf_k'] = VOCAB.n * VOCAB.term_rank
    VOCAB['zipf_k2'] = VOCAB.p * VOCAB.term_rank2
    
    new_levels = pd.Index([i] * len(VOCAB.index), name = 'book_id')
    new_index = pd.MultiIndex.from_arrays([new_levels] + [VOCAB.index.get_level_values(level) 
                                                          for level in range(VOCAB.index.nlevels)], names=['book_id'] + VOCAB.index.names)
    VOCAB.index = new_index
    
    combined_vocab = pd.concat([combined_vocab, VOCAB])

## BOW and TFIDF

In [344]:
CHAPS = OHCO[:2]
bag = CHAPS

In [362]:
combined_BOW = pd.DataFrame([])
for i in range(12):
    if i == 3:
        continue
        
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        TOKENS = combined_tokens.loc[i]
        BOW = TOKENS.groupby(["chap_num"]+['term_str']).term_str.count().to_frame('n') 
        BOW = BOW.drop(BOW[BOW.index.get_level_values('term_str') == ''].index) # drop the whitespace mate
    
        N = DTCM.shape[0]
    
        DTCM = BOW.n.unstack().fillna(0).astype('int')
    
        TF = DTCM.T / DTCM.T.sum()
        TF = TF.T
    
        DF = DTCM.astype('bool').sum()
        IDF = np.log2(N / DF)
    
        TFIDF = TF * IDF
    
        DOC = DTCM.sum(1).to_frame('n_tokens')
        DOC['n_types'] = DTCM.astype('bool').sum(1)
    
        VOCAB = combined_vocab.iloc[i]
        VOCAB['df'] = DF
        VOCAB['idf'] = IDF
        VOCAB['tfidf_mean'] = TFIDF.mean() 
        VOCAB['tfidf_sum'] = TFIDF.sum()
        VOCAB['tfidf_median'] = TFIDF.median()
        VOCAB['tfidf_max'] = TFIDF.max()

        VOCAB['dfidf'] = VOCAB.df * VOCAB.idf
        VOCAB['dp'] = VOCAB.df / len(DOC)
        VOCAB['dh'] = VOCAB.dp * np.log2(1/VOCAB.dp)
        
        BOW['tf'] = TF.stack()
        BOW['tfidf'] = TFIDF.stack()
    
        new_levels = pd.Index([i] * len(BOW.index), name = 'book_id')
        new_index = pd.MultiIndex.from_arrays([new_levels] + [BOW.index.get_level_values(level) 
                                                          for level in range(BOW.index.nlevels)], names=['book_id'] + BOW.index.names)
        BOW.index = new_index
        
        combined_BOW = pd.concat([combined_BOW, BOW])

In [363]:
combined_BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,tf,tfidf
book_id,chap_num,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1721yearolds,1,0.000364,0.001517
0,1,1950s,1,0.000364,0.001153
0,1,1953,3,0.001091,0.004551
0,1,1954,1,0.000364,0.001153
0,1,1956,1,0.000364,0.001153
...,...,...,...,...,...
11,18,wife,1,0.004785,0.004390
11,18,with,1,0.004785,-0.000395
11,18,without,1,0.004785,0.001852
11,18,words,1,0.004785,0.006125


## Save To CSV

In [326]:
combined_vocab.to_csv('VOCAB.csv')

# Data Model

In [333]:
TOKENS = pd.read_csv("TOKENS.csv").set_index(["book_id", "chap_num", "para_num", "sent_num", "token_num"])
VOCAB = pd.read_csv("VOCAB.csv").set_index(["book_id","term_str"])

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p,i,n_chars
book_id,term_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,the,3409,0.053979,4.211454,3
0,to,1828,0.028945,5.110536,2
0,and,1744,0.027615,5.178402,3
0,i,1553,0.024591,5.345744,1
0,a,1468,0.023245,5.426950,1
...,...,...,...,...,...
11,ditty,1,0.000017,15.874981,5
11,entitled,1,0.000017,15.874981,8
11,deduce,1,0.000017,15.874981,6
11,rudimentary,1,0.000017,15.874981,11
