In [1]:
import re
import codecs
import pandas as pd
from textblob import TextBlob

pd.options.display.float_format = '{:20,.2f}'.format
pd.options.display.max_columns = 100
pd.options.display.max_rows = 500

In [2]:

def make_df(filename):
    # make list of chapters
    book = make_file(filename)
    
    # turn book into dataframe
    book_df = pd.DataFrame(book)
    book_df.columns = ['chapter_text']
    
    # add column with chapter's POV character
    book_df['chapter_name'] = book_df['chapter_text'].apply(chap_char)
    
    # add column with book title
    title = book_df.ix[0, 1]
    book_df['book_title'] = title
    
    # drop first row with book basic info
    book_df.drop(0, inplace=True)
    
    # add column with chapter number
    book_df['chapter_num'] = book_df.index
    book_df = book_df[['book_title', 'chapter_num', 'chapter_name', 'chapter_text']]
    
    return book_df

    
def make_file(filename):
    # read in book text by line
    with codecs.open(filename, mode='r') as f:
        txt = f.read().splitlines()
    
    # concatinate lines in the same chapter
    book = []
    # initialize to false for first chapter
    end = False 
    
    for i in range(0, len(txt) -1):
        # remove page numbers and chapter headers
        txt[i] = re.sub('Page [0-9]*', '', txt[i])
        txt[i] = re.sub('Chapter [0-9]*', '', txt[i])
        
        # skip blank lines
        if not txt[i] or re.match("^ *$", txt[i]):
            continue
        
        # find chapter begining which start with character name capitalized
        elif re.match("^[ A-Z]*$", txt[i]):
            if end:
                # append previous chapter to list (false for first chapter)
                book.append(chapter) 
            # start new chapter
            chapter = txt[i]
            # set end to true for subsequent chapters after first chapter
            end = True
        
        # if not begining of chapter, add lines to chapter
        else:
            chapter = chapter + txt[i]
    return book


def chap_char(x):
    # Find character name in all caps
    name = re.findall(r'^[ A-Z]*', x)
    
    # clean name
    name = name[0]
    name = name[:-1]
    name = name.strip()
    if name[-2:] == ' A':
        name = name[:-2]
        name = name.strip()
    
    return name

In [3]:
file1 = "GOTbooks/001ssb.txt"
file2 = "GOTbooks/002ssb.txt"
file3 = "GOTbooks/003ssb.txt"
file4 = "GOTbooks/004ssb.txt"
file5 = "GOTbooks/005ssb.txt"

game_of_thrones = make_df(file1)
clash_of_kings = make_df(file2)
storm_of_swords = make_df(file3)
feast_for_crows = make_df(file4)
dance_with_dragons = make_df(file5)

In [4]:
book_names = [game_of_thrones, clash_of_kings, storm_of_swords, feast_for_crows, dance_with_dragons]
ice_and_fire = pd.concat(book_names)
ice_and_fire.reset_index(inplace=True, drop=True)

In [5]:
ice_and_fire

Unnamed: 0,book_title,chapter_num,chapter_name,chapter_text
0,A GAME OF THRONES,1,PROLOGUE,"PROLOGUE ""We should start back,"" Gared urged a..."
1,A GAME OF THRONES,2,BRAN,"BRAN The morning had dawned clear and cold, wi..."
2,A GAME OF THRONES,3,CATELYN,CATELYN Catelyn had never liked this godswood....
3,A GAME OF THRONES,4,DAENERYS,DAENERYS Her brother held the gown up for her ...
4,A GAME OF THRONES,5,EDDARD,EDDARD The visitors poured through the castle ...
5,A GAME OF THRONES,6,JON,"JON There were times-not many, but a few-when ..."
6,A GAME OF THRONES,7,CATELYN,CATELYN Of all the rooms in Winterfell's Great...
7,A GAME OF THRONES,8,ARYA,ARYA Arya's stitches were crooked again. She f...
8,A GAME OF THRONES,9,BRAN,BRAN The hunt left at dawn. The king wanted wi...
9,A GAME OF THRONES,10,TYRION,TYRION Somewhere in the great stone maze of Wi...


In [7]:
ice_and_fire.chapter_name.value_counts()

TYRION                       44
JON                          42
DAENERYS                     31
ARYA                         27
CATELYN                      21
BRAN                         19
SANSA                        19
EDDARD                       15
JAIME                        15
CERSEI                       12
DAVOS                        12
SAMWELL                       9
BRIENNE                       8
THEON                         7
PROLOGUE                      5
REEK                          3
ALAYNE                        2
THE WATCHER                   1
THE REAVER                    1
VICTARION                     1
THE QUEENSGUARD               1
THE GRIFFIN REBORN            1
MELISANDRE                    1
THE WAYWARD BRIDE             1
THE LOST LORD                 1
THE PROPHET                   1
THE DROWNED MAN               1
THE PRINCE OF WINTERFELL      1
THE QUEENMAKER                1
A GHOST IN WINTERFELL         1
THE IRON SUITOR               1
THE CAPT

In [None]:
'''
THE WATCHER                   Areo Hotah
THE REAVER                    Victarion
THE QUEENSGUARD               Barristan Selmy
THE GRIFFIN REBORN            1
MELISANDRE                    1
THE WAYWARD BRIDE             1
THE LOST LORD                 1
THE PROPHET                   1
THE DROWNED MAN               1
THE PRINCE OF WINTERFELL      1
THE QUEENMAKER                1
A GHOST IN WINTERFELL         1
THE IRON SUITOR               1
THE CAPTAIN OF GUARDS         1
THE IRON CAPTAIN              1
THE SOILED KNIGHT             1
THE UGLY LITTLE GIRL          1
THE DRAGONTAMER               1
THE WINDBLOWN                 1
CAT OF THE CANALS             1
THE KINGBREAKER               1
THE SPURNED SUITOR            1
THE DISCARDED KNIGHT          1
THE SACRIFICE                 1
THE TURNCLOAK                 1
THE PRINCESS IN THE TOWER     1
THE BLIND GIRL                1
'''