In [7]:
from bs4 import BeautifulSoup
import urllib
import nltk
import re
from pprint import pprint
import csv
from collections import OrderedDict
import pandas as pd
import numpy as np
import unicodedata
from __future__ import print_function

In [3]:
# load in character list
# char_list.txt holds one entry per line; each entry is later appended to the
# wiki base URL, so blank lines are skipped rather than turned into empty names.
# The context manager closes the file even if reading fails part-way through.
character_list = []
with open('char_list.txt', 'r') as char_list:
    for line in char_list:
        name = line.rstrip('\r\n')
        if name:
            character_list.append(name)

In [4]:
# given a soup, this function pulls out text from the following sections:
# Background, Season [1-5]

# matches citation markers such as "[12]" left in the paragraph text
_REFERENCE_RE = re.compile(r'\[\d+\]')

# (section title pattern, replacement for newlines) pairs, processed in order.
# The original notebook turned newlines into spaces for Background but dropped
# them for the Season sections; that difference is kept so outputs stay the same.
_SECTION_RULES = (
    ("Background$", u' '),
    ("Season [1-5]$", u''),
)


def _collect_section_text(section):
    """Return the raw paragraph text that follows `section`, or None.

    Walks the document in parse order via ``.next`` starting after `section`
    and concatenates the ``.text`` of every node named 'p', each prefixed with
    a single space. The walk stops at the first 'h2'/'h3' node seen *after* at
    least one paragraph was collected, or at the end of the document.
    Headings met before any paragraph are skipped.

    Returns None when no paragraph was found at all.
    """
    paragraphs = []
    node = section.next
    while node is not None:
        if paragraphs and node.name in ('h2', 'h3'):
            # a new section starts here, so this one is complete
            break
        if node.name == 'p':
            paragraphs.append(node.text)
        node = node.next

    if not paragraphs:
        return None
    return u''.join(u' ' + paragraph for paragraph in paragraphs)


def return_character_dictionary(soup, char, char_dict):
    """Store the Background / Season 1-5 text of one character in `char_dict`.

    Parameters
    ----------
    soup : object providing ``findAll('span', text=<compiled regex>)``
        (a BeautifulSoup document in this notebook).
    char : key under which the extracted text is stored.
    char_dict : dict mapping section title -> {char -> text}; updated in place.

    For every span whose text matches one of the section patterns, the
    paragraphs that follow are concatenated, citation markers like "[3]" are
    removed, newlines are replaced (see _SECTION_RULES) and non-breaking spaces
    become ordinary spaces.

    Sections are skipped, instead of raising, when their title is not a key of
    `char_dict` or when no paragraph follows them. Text collected up to the end
    of the document is kept for every section type.

    Returns
    -------
    The same `char_dict` object that was passed in.
    """
    for title_pattern, newline_replacement in _SECTION_RULES:
        for section in soup.findAll('span', text=re.compile(title_pattern)):
            title = section.text
            if title not in char_dict:
                # e.g. a heading that merely ends in "Background"
                continue

            raw_text = _collect_section_text(section)
            if raw_text is None:
                continue

            cleaned = _REFERENCE_RE.sub(u'', raw_text)  # removes references
            cleaned = cleaned.replace(u'\n', newline_replacement)
            char_dict[title][char] = cleaned.replace(u'\xa0', u' ')

    return char_dict

Now we will loop through all the characters in the character list and pull:

* semi-structured data from the right side of their wiki page (detailed by a number of key words)
* unstructured text data from the background/season sections

In [101]:
# now using the above function, create a dictionary with all our characters and their text data
# c maps section title -> {character -> section text}
c = {'Season 1': {},
    'Season 2': {},
    'Season 3': {},
    'Season 4': {},
    'Season 5': {},
    'Background': {}}

# at the same time get other structured info
key_words = ["Season(s)", "First seen", "Last seen", "Appeared in", "Mentioned in", "Titles", "Also known as", "Status",
             "Age", "Date of birth", "Death", "Origin", "Allegiance", "Culture", "Religion", "Family", "Portrayed by"]

# info maps field name -> {character -> raw field text}
info = {}
# initialize dictionary with key words
for key in key_words:
    info[key] = {}
info['Name'] = {} # since we handle name separately, we decide not to include it in the key_words list

# enumerate gives the position directly; looking it up with list.index() costs
# O(n) per character and reports the wrong position for duplicated names
for char_position, char in enumerate(character_list):
    char = char.replace('\n','') # remove newline sometimes contained in character name

    if char_position % 5 == 0:
        print('') #newline, i.e. only print 5 character names per line
    print ('{} .. '.format(char), end="")

    # get the data from the wiki page
    url = 'http://gameofthrones.wikia.com/wiki/' + char
    page = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(page.read(), "html.parser")
    finally:
        # release the connection even if reading or parsing fails
        page.close()

    # Output structured info from the right side of the wiki page
    info['Name'][char] = char.replace('_',' ')

    for word in key_words:
        # locate the text node holding the field label ...
        keys = soup.find(text=word)
        if keys is not None:
            # ... and take the first <div> sibling following the label's parent
            value = keys.parent.findNextSibling('div')
            if value is not None:
                info[word][char] = value.text

            #TODO: special case for family, create a dictionary


    # call function to get text data
    c = return_character_dictionary(soup, char, c)


Alliser_Thorne ...Arya_Stark ...Barristan_Selmy ...Bran_Stark ...Brienne_of_Tarth ...
Bronn ...Brynden_Tully ...Catelyn_Stark ...Cersei_Lannister ...Children_of_the_Forest ...
Daario_Naharis ...Daenerys_Targaryen ...Davos_Seaworth ...Doran_Martell ...Dragons ...
Eddard_Stark ...Ellaria_Sand ...Euron_Greyjoy ...Faceless_Men ...Faith_Militant ...
Gendry ...Ghost ...Gilly ...Grey_Worm ...Hodor ...
Jaime_Lannister ...Jaqen_H'ghar ...Jon_Snow ...Jorah_Mormont ...Kevan_Lannister ...
Lancel_Lannister ...Loras_Tyrell ...Lord_of_Light ...Mace_Tyrell ...Margaery_Tyrell ...
Meera_Reed ...Melisandre ...Missandei ...Nymeria_(direwolf) ...Nymeria_Sand ...
Obara_Sand ...Olenna_Tyrell ...Olly ...Osha ...Petyr_Baelish ...
Podrick_Payne ...Qyburn ...Ramsay_Bolton ...Rickon_Stark ...Robin_Arryn ...
Roose_Bolton ...Samwell_Tarly ...Sansa_Stark ...Septon_Meribald ...Ser_Pounce ...
Sons_of_the_Harpy ...The_High_Sparrow ...The_Hound ...The_Mountain ...The_Old_Gods ...
The_Unsullied ...Theon_Greyjoy ...Three

In [98]:
# turn the two scraped dictionaries into data frames
# (outer dictionary keys become the columns, the inner keys become the index)
df_info_unicode = pd.DataFrame(info)
df_txt_unicode = pd.DataFrame(c, dtype='unicode')

In [None]:
# convert the unicode dataframe to ascii text - as best we can, this will inevitably remove/alter some characters
def convert_text(s):
    """Return an ASCII-only version of the unicode text `s`, or None.

    The text is NFKD-normalised so that accented letters decompose into a base
    letter plus combining marks; every character that still cannot be encoded
    as ASCII is then dropped. Backslash-escaped apostrophes (\\') are replaced
    by plain apostrophes.

    Anything that is not unicode text (None, NaN, numbers, Python 2 byte
    strings) yields None.

    The result is the native ``str`` type on both Python 2 and Python 3.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3; comparing the
    # repr of the type against "<type 'unicode'>" only ever matched on Python 2
    if not isinstance(s, type(u'')):
        return None

    txt = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
    if not isinstance(txt, str):
        # Python 3: encode() produced bytes, turn them back into text
        txt = txt.decode('ascii')
    return txt.replace("\\'", "'")  # remove escape characters

# apply convert_text to every cell, column by column, and rebuild each frame
# from the converted columns
df_info = pd.DataFrame({
    column: df_info_unicode[column].apply(convert_text)
    for column in df_info_unicode.columns
})

df_txt = pd.DataFrame({
    column: df_txt_unicode[column].apply(convert_text)
    for column in df_txt_unicode.columns
})

In [15]:
# load in the scores from season 5
# pd.DataFrame.from_csv is deprecated (and removed in pandas 1.0); read_csv
# with index_col=0 likewise uses the first CSV column as the index.
# from_csv additionally tried to parse the index as dates, which is not
# wanted for an index of names.
scores = pd.read_csv('season5_scores.csv', index_col=0)
scores.head()

Unnamed: 0_level_0,Killing,SexNudity,Insult,Drinking,Injury,Total
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aeron_Greyjoy,0,0,0,0,0,0
Areo_Hotah,0,0,7,0,4,11
Arianne_Martel,0,0,0,0,0,0
Arya_Stark,18,0,28,0,40,86
Balon_Greyjoy,0,0,0,0,0,0


In [99]:
# write data to file
# left-join the scraped info and text onto the scores by index, so every
# scored character is kept even when nothing was scraped for it
df = (
    scores
    .merge(df_info, left_index=True, right_index=True, how='left')
    .merge(df_txt, left_index=True, right_index=True, how='left')
)

df.to_csv('got_data.csv')

# keep the preview as the last expression so the notebook actually displays it
df.head()

Unnamed: 0_level_0,Killing,SexNudity,Insult,Drinking,Injury,Total,Age,Allegiance,Also known as,Appeared in,...,Religion,Season(s),Status,Titles,Background,Season 1,Season 2,Season 3,Season 4,Season 5
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aeron_Greyjoy,0,0,0,0,0,0,,,,,...,,,,,,,,,,
Areo_Hotah,0,0,7,0,4,11,,,,,...,,,,,,,,,,
Arianne_Martel,0,0,0,0,0,0,,,,,...,,,,,,,,,,
Arya_Stark,18,0,28,0,40,86,16 in Season 6[1],House StarkFaceless Men,ArryLanna,39 episodes (see below),...,Old Gods of the Forest (formerly)Faith of the ...,"1, 2, 3, 4, 5, 6",Alive,Princess,Arya as a little girl. Arya Stark is the youn...,Arya is being taught how to sew by Septa Mord...,Arya and Gendry hide from the Goldcloaks. Ary...,"Arya, Gendry and Hot Pie discovered by the Br...",Arya listens as the Hound and Polliver is dis...,Arya at the House of Black and White in Braav...
Balon_Greyjoy,0,0,0,0,0,0,,,,,...,,,,,,,,,,


The family field in the semi-structured data could be parsed better. The snippet below is a first attempt that didn't get very far.


```python
# try to parse family
# info['Family'] maps character -> raw family text, so each character's text
# is parsed on its own and the result collected in family_parsed.
# NOTE(review): assumes entries look like "{member} - relation"; confirm
# against the scraped text before relying on this.
family_parsed = {}
for character, value in info['Family'].items():
    tmp_dict = {}
    for tmpval in value.split('{'):
        if tmpval == '' or ' - ' not in tmpval:
            continue

        relation = tmpval.split(' - ')[1]
        member = tmpval.split('}')[0]
        tmp_dict[relation] = member
        # .format must be applied to the string: print() returns None
        print('{}     tmp_dict[{}] = {}'.format(tmpval, relation, member))
    family_parsed[character] = tmp_dict
```