## Preliminaries

In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
import re
from collections import defaultdict

In [54]:
# "^CHAPTER\s(?:ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN)$"
# Chapter headings exactly as they appear, alone on a line, in the book text.
chapter_names = ["CHAPTER ONE", "CHAPTER TWO", "CHAPTER THREE", "CHAPTER FOUR", "CHAPTER FIVE", "CHAPTER SIX",
                 "CHAPTER SEVEN", "CHAPTER EIGHT", "CHAPTER NINE", "CHAPTER TEN", "CHAPTER ELEVEN", "CHAPTER TWELVE",
                 "CHAPTER THIRTEEN", "CHAPTER FOURTEEN", "CHAPTER FIFTEEN"]

# Decode explicitly as UTF-8: the text contains non-ASCII characters (curly
# quotes), and without an explicit encoding the result depends on the
# platform's default locale.
# NOTE(review): the book mapping further down lists Busby1.txt and Busby2.txt
# but not Busby.txt -- confirm this path is still the intended file.
with open('Books/Text Files/Busby.txt', encoding='utf-8') as f:
    lines = f.readlines()

# One row per non-blank line, with surrounding whitespace (including the
# trailing newline) removed. Building the frame from the already-filtered
# list leaves a clean 0..n-1 index, so no reset_index/iloc clean-up is needed.
stripped_lines = (line.strip() for line in lines)
df = pd.DataFrame({'text': [text for text in stripped_lines if text]})

In [55]:
import warnings
# Row labels of `df` whose text is exactly one of the chapter headings.
# Series.items() is used instead of Series.iteritems(): the latter was
# deprecated and then removed in pandas 2.0, so no warning filter is needed.
chapter_rows = [index for index, text in df.iloc[:, 0].items() if text in chapter_names]

# row_dicts maps chapter number (1-based, in order of appearance) to the row
# label of that chapter's heading line.
row_dicts = dict(enumerate(chapter_rows, start=1))

## Chapters

In [56]:
# Reverse lookup: heading row label -> chapter number. This replaces the
# per-row list(...).index(...) scan over row_dicts with a dict lookup.
chapter_by_heading_row = {row_num: chap_num for chap_num, row_num in row_dicts.items()}

# chapter_contents maps chapter number -> the chapter's lines joined by single
# spaces (each line is followed by one space, so the string ends with a space).
chapter_contents = {}
current_chapter = None
current_chapter_start = None
for i, text in df['text'].items():
    # The row immediately after a heading row is the first row of that chapter.
    if i - 1 in chapter_by_heading_row:
        current_chapter = chapter_by_heading_row[i - 1]
        current_chapter_start = i - 1
        chapter_contents[current_chapter] = ""
    # Skip everything before the first chapter, and skip heading rows
    # themselves: previously the "CHAPTER TWO" line (and every later heading)
    # was appended to the end of the preceding chapter's text.
    if current_chapter is None or i in chapter_by_heading_row:
        continue
    chapter_contents[current_chapter] += text + " "

In [57]:
# Chapter table: index `chap_num`, single column `chap_str` holding the
# chapter text. Naming the column and the index directly avoids the
# reset_index/set_index round trip and the non-boolean `inplace = 1` flag.
CHAPS = pd.Series(chapter_contents).to_frame("chap_str").rename_axis("chap_num")

In [59]:
# Display the chapter table (index chap_num, column chap_str).
CHAPS

Unnamed: 0_level_0,chap_str
chap_num,Unnamed: 1_level_1
1,"In 1993, the 84th year of Sir Matt Busby’s lif..."
2,A couple of dark hours before the dawn of 26 M...
3,MANCHESTER CITY AND LIVERPOOL THE BLUES As the...
4,"WAR, PEACE AND MANCHESTER UNITED THE COMMON TO..."
5,THE 1948 TEAM: ROCKY ROAD TO WEMBLEY FIT AS BU...
6,THE 1948 TEAM: TOUGH AT THE TOP THE MONEY WILL...
7,THE 1958 TEAM: BIRTH OF THE BABES TO DUDLEY AN...
8,THE 1958 TEAM: FLOODLIT NIGHTS THE LETTER Arou...
9,"MUNICH BE LIKE THAT, DAD The party flew to Bel..."
10,THE LONG JOURNEY BACK TROUBLE AHEAD As the 195...


## Paragraphs

In [60]:
# Index level names, from coarsest to finest, for the tables built below.
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

# Paragraph separator: two or more consecutive newlines.
para_pat = r'\n\n+'

# Split every chapter string on the separator, turn the resulting pieces into
# rows keyed by (chapter, piece position), and trim surrounding whitespace.
# NOTE(review): chap_str is assembled from stripped lines joined by single
# spaces, so this pattern is not expected to match anything; PARAS is rebuilt
# from the line-level frame in a later cell.
PARAS = (
    CHAPS['chap_str']
    .str.split(para_pat, expand=True)
    .stack()
    .to_frame('para_str')
    .sort_index()
    .rename_axis(index=OHCO[:2])
    .assign(para_str=lambda frame: frame['para_str'].str.strip())
)

In [61]:
# Reverse lookup: heading row label -> chapter number (dict lookup instead of
# scanning row_dicts for every row).
chapter_by_heading_row = {row_num: chap_num for chap_num, row_num in row_dicts.items()}

# chapter_paragraphs maps chapter number -> the chapter's lines, each followed by "\n".
chapter_paragraphs = {}
current_chapter = None
current_chapter_start = None
paragraph_number = 0

# Collect one record per paragraph and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
paragraph_records = []
for i, paragraph_text in df['text'].items():
    # The row immediately after a heading row is the first row of that chapter.
    if i - 1 in chapter_by_heading_row:
        current_chapter = chapter_by_heading_row[i - 1]
        current_chapter_start = i - 1
        chapter_paragraphs[current_chapter] = ""
        paragraph_number = 0
    # Skip everything before the first chapter, and skip heading rows
    # themselves: previously each later heading line ("CHAPTER TWO", ...) was
    # recorded as the last paragraph of the preceding chapter.
    if current_chapter is None or i in chapter_by_heading_row:
        continue
    # Each remaining row of `df` is treated as one paragraph, numbered from 1
    # within its chapter.
    paragraph_number += 1
    chapter_paragraphs[current_chapter] += paragraph_text + "\n"
    paragraph_records.append({'chapter': current_chapter, 'paragraph': paragraph_number,
                              'text': paragraph_text})

# Passing `columns` keeps the expected columns even when no rows were collected.
df_paragraphs = pd.DataFrame(paragraph_records, columns=['chapter', 'paragraph', 'text'])

In [62]:
# Paragraph table indexed by (chap_num, para_num) with one column, para_str.
# rename/set_index return new frames, so df_paragraphs is left untouched and
# this cell can be re-run on its own (the previous alias + inplace rename
# changed df_paragraphs as well, and used a non-boolean `inplace = 1`).
PARAS = (
    df_paragraphs
    .rename(columns={"chapter": "chap_num", "paragraph": "para_num", "text": "para_str"})
    .set_index(["chap_num", "para_num"])
)

In [63]:
# Preview the first ten rows of the paragraph table.
PARAS.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,para_str
chap_num,para_num,Unnamed: 2_level_1
1,1,"In 1993, the 84th year of Sir Matt Busby’s lif..."
1,2,"United, now managed by Alex Ferguson with Busb..."
1,3,When the bus arrived at their hotel near Wembl...
1,4,"Early in the next year, Busby died and Collins..."
1,5,I last saw him walking off the Wembley pitch. ...
1,6,Sir Alex Ferguson was another eloquent witness...
1,7,"our supporters were in a frenzy, battering the..."
1,8,The analogy resonated with the former United p...
1,9,Even those with no interest in football were l...
1,10,"He was, of course, one of the greatest British..."


## Sentences and Tokens

In [64]:
# Sentence table: split each paragraph into sentences with nltk, then stack
# so every sentence becomes a row keyed by (chap_num, para_num, sent_num).
# The chain is wrapped in parentheses because a comment after a
# line-continuation backslash is a SyntaxError.
SENTS = (
    PARAS.para_str
    .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))  # M04
    .stack()
    .to_frame('sent_str')
)
SENTS.index.names = OHCO[:3]

In [65]:
# Preview the first ten rows of the sentence table.
SENTS.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
chap_num,para_num,sent_num,Unnamed: 3_level_1
1,1,0,"In 1993, the 84th year of Sir Matt Busby’s lif..."
1,1,1,At the behest of a television company he was t...
1,2,0,"United, now managed by Alex Ferguson with Busb..."
1,2,1,"With Bobby Charlton, Pat Crerand, Alex Stepney..."
1,2,2,Collins fondly recalled Busby ‘puffing his pip...
1,3,0,When the bus arrived at their hotel near Wembl...
1,3,1,"Denis Law, whom injury had denied a part in th..."
1,3,2,"‘Key for Sir Matt Busby, please,’ Law cheerful..."
1,3,3,Bobby Charlton guided Busby to his room and ha...
1,3,4,"‘Our bloody luck,’ said Crerand, who revered S..."


In [66]:
# Token table: split every sentence on whitespace, POS-tag the pieces with
# nltk, and stack so each (token, tag) tuple becomes its own row.
# The chain is wrapped in parentheses because a comment after a
# line-continuation backslash is a SyntaxError; the tokenizer is created once
# instead of once per sentence.
whitespace_tokenizer = nltk.WhitespaceTokenizer()
TOKENS = (
    SENTS.sent_str
    .apply(lambda x: pd.Series(nltk.pos_tag(whitespace_tokenizer.tokenize(x))))  # M04
    .stack()
    .to_frame('pos_tuple')
)

# Unpack the (token, tag) tuples into separate columns.
TOKENS['pos'] = TOKENS.pos_tuple.apply(lambda x: x[1])
TOKENS['token_str'] = TOKENS.pos_tuple.apply(lambda x: x[0])

# Lower-cased form of the token; punctuation is removed from it further down.
TOKENS['term_str'] = TOKENS.token_str.str.lower()

import string
def remove_punct(text):
    """Return ``text`` without the characters listed in ``string.punctuation``.

    Only the characters in ``string.punctuation`` are dropped; every other
    character (for example a typographic quote such as ’) is kept as is.
    """
    kept = []
    for char in text:
        if char in string.punctuation:
            continue
        kept.append(char)
    return ''.join(kept)

# Strip punctuation from every term, element by element, and store the
# cleaned terms back in the token table.
term_str = TOKENS["term_str"].map(remove_punct)
TOKENS["term_str"] = term_str

# Name the innermost index level (position 3) 'token_num'.
TOKENS.index = TOKENS.index.set_names('token_num', level=3)

In [82]:
# Display the token table (columns pos_tuple, pos, token_str, term_str).
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str
chap_num,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1,0,0,"(In, IN)",IN,In,in
1,1,0,1,"(1993,, CD)",CD,1993,1993
1,1,0,2,"(the, DT)",DT,the,the
1,1,0,3,"(84th, CD)",CD,84th,84th
1,1,0,4,"(year, NN)",NN,year,year
...,...,...,...,...,...,...,...
15,98,3,56,"(of, IN)",IN,of,of
15,98,3,57,"(Busby, NNP)",NNP,Busby,busby
15,98,3,58,"(would, MD)",MD,would,would
15,98,3,59,"(never, RB)",RB,never,never


## Word Count

In [68]:
# word_count[term]          -> number of occurrences of the term
# word_count_pos[term][pos] -> number of occurrences of the term with that POS tag
word_count = defaultdict(int)
word_count_pos = defaultdict(lambda: defaultdict(int))

# Walk the two columns in parallel instead of using iterrows(), which builds
# a Series object for every row of the token table; the counts are identical.
for term, pos in zip(TOKENS['term_str'], TOKENS['pos']):
    word_count[term] += 1
    word_count_pos[term][pos] += 1

## LIB

In [86]:
from glob import glob
# Paths matched under the text-file folder, in sorted order.
# NOTE(review): glob is called without recursive=True, so "**" presumably
# behaves like "*" here (direct children only) -- confirm that is intended.
source_file_list = glob("Books/Text Files/**")
source_file_list.sort()

In [93]:
# (file name, author, title) for every book in the corpus.
_BOOK_ROWS = [
    ("Atkinson1.txt", "Ron Atkinson", "The Manager"),
    ("Atkinson2.txt", "Wayne Barton", "Que Sera Sera: Manchester United Under Dave Sexton and Ron Atkinson"),
    ("Busby2.txt", "Eamon Dunphy", "A Strange Kind of Glory"),
    ("Busby1.txt", "Patrick Barclay", "Sir Matt Busby: The Definitive Biography"),
    ("Charlton1.txt", "Sir Bobby Charlton", "1966: My World Cup Story"),
    ("Charlton2.txt", "Sir Bobby Charlton", "My Manchester United Years"),
    ("Ferguson1.txt", "Daniel Taylor", "This Is The One"),
    ("Ferguson2.txt", "Patrick Barclay", "Football - Bloody Hell!: The Biography of Alex Ferguson"),
    ("Ferguson3.txt", "Sir Alex Ferguson", "My Autobiography"),
    ("Keane.txt", "Roy Keane", "The Second Half"),
    ("Robson.txt", "Bryan Robson", "Robbo - My Autobiography"),
    ("Scholes.txt", "Paul Scholes", "My Story"),
]

# mapping_books[file_name] -> {"Author": ..., "Title": ...}
mapping_books = {
    file_name: {"Author": author, "Title": title}
    for file_name, author, title in _BOOK_ROWS
}

In [101]:
# One (book_id, path, file_name, title, author) tuple per source file;
# book_id is the file's position in source_file_list.
book_data = []
for book_id, path in enumerate(source_file_list):
    # The file name is whatever follows the folder prefix in the path.
    file_name = path.split("Books/Text Files/")[1]
    # A file without an entry in mapping_books raises KeyError here.
    book_info = mapping_books[file_name]
    title, author = book_info["Title"], book_info["Author"]
    book_data.append((book_id, path, file_name, title, author))

In [104]:
# Library table: one row per book, indexed and ordered by book_id.
lib_columns = ['book_id', 'source_file_path', "file_name", 'raw_title', "author"]
LIB = pd.DataFrame(book_data, columns=lib_columns)
LIB = LIB.set_index('book_id')
LIB = LIB.sort_index()
LIB

Unnamed: 0_level_0,source_file_path,file_name,raw_title,author
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Books/Text Files/Atkinson1.txt,Atkinson1.txt,The Manager,Ron Atkinson
1,Books/Text Files/Atkinson2.txt,Atkinson2.txt,Que Sera Sera: Manchester United Under Dave Se...,Wayne Barton
2,Books/Text Files/Busby1.txt,Busby1.txt,Sir Matt Busby: The Definitive Biography,Patrick Barclay
3,Books/Text Files/Busby2.txt,Busby2.txt,A Strange Kind of Glory,Eamon Dunphy
4,Books/Text Files/Charlton1.txt,Charlton1.txt,1966: My World Cup Story,Sir Bobby Charlton
5,Books/Text Files/Charlton2.txt,Charlton2.txt,My Manchester United Years,Sir Bobby Charlton
6,Books/Text Files/Ferguson1.txt,Ferguson1.txt,This Is The One,Daniel Taylor
7,Books/Text Files/Ferguson2.txt,Ferguson2.txt,Football - Bloody Hell!: The Biography of Alex...,Patrick Barclay
8,Books/Text Files/Ferguson3.txt,Ferguson3.txt,My Autobiography,Sir Alex Ferguson
9,Books/Text Files/Keane.txt,Keane.txt,The Second Half,Roy Keane


In [107]:
# Per-book regular expression describing that book's chapter heading lines.
# Every pattern is a raw string: in a normal string literal "\b" is the
# backspace character (so the Ferguson3 and Scholes patterns could never match
# a word boundary), and "\d", "\s", "\w", "\." are invalid escape sequences
# that newer Python versions warn about. The other patterns keep exactly the
# same value as before.
chap_regex = {
    "Atkinson1.txt": r"^(ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN|SIXTEEN|SEVENTEEN|EIGHTEEN|NINETEEN|TWENTY|TWENTY\-ONE)$",
    "Atkinson2.txt": r"^(Old Hollywood|Que Sera, Sera|Challenges|Money, Money, Money|Money Spinner|Vision|Walk Through the Storm|Borrowed Time|On Good Terms)$",
    # NOTE(review): "^" binds only to the first alternative here, so
    # "Epilogue"/"Prologue" match anywhere in a line -- confirm that is intended.
    "Busby2.txt": r"^\d+\.\s.+|Epilogue|Prologue",
    "Busby1.txt": r"^CHAPTER\s(?:ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE|THIRTEEN|FOURTEEN|FIFTEEN)$",
    "Charlton1.txt": r"^\d+\.\s.+",
    # NOTE(review): as above, "^" applies only to the numbered alternative and
    # "$" only to "Epilogue" -- confirm that is intended.
    "Charlton2.txt": r"^(1[0-9]|[1-9]|2[0-6])\.?\s+.+|Prologue|Epilogue$",
    "Ferguson1.txt": r"^\w+(?: \w+)* \d{1,2}\.\d{1,2}\.\d{2}$",
    "Ferguson2.txt": r"^(NO DOUBT ABOUT IT|IN THE BEGINNING|EAST STIRLINGSHIRE|SAINTS ALIVE: THE LOVE STREET YEARS|ABERDEEN|MANCHESTER UNITED: EARLY DAYS|UNITED: STEPS TO GREATNESS|UNITED: APRÈS MOI LE TREBLE|UNITED: THE ENCORE|UNITED: RONALDO AND ROONEY|THE LEGACY)$",
    "Ferguson3.txt": r"\b(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twenty[- ]one|twenty[- ]two|twenty[- ]three|twenty[- ]four|twenty[- ]five)\b",
    "Keane.txt": r"^(ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN|ELEVEN|TWELVE)$",
    # NOTE(review): "n/a" looks like a placeholder; used as a regex it matches
    # the literal text "n/a" -- confirm how Robson.txt is meant to be split.
    "Robson.txt": r"n/a",
    "Scholes.txt": r"\b\d+\b"
}