### WebScraper

In [None]:
# Create the Chrome driver (the matching chromedriver binary is installed by
# webdriver_manager).
# NOTE(review): the driver is left open after scraping; call driver.quit()
# once no later cell needs it.
driver = webdriver.Chrome(ChromeDriverManager().install())

# Go to the Game of Thrones character list on Wikipedia
page_url = "https://en.wikipedia.org/wiki/List_of_Game_of_Thrones_characters"
driver.get(page_url)

# Find and assemble the list of main and side characters.
# NOTE(review): the slice bounds are tied to the page layout at the time of
# writing — re-check them if the Wikipedia page changes.
main_characters = driver.find_elements(By.CLASS_NAME, 'mw-headline')[5:47]
side_characters = driver.find_elements(By.CLASS_NAME, 'mw-redirect')[107:193]
characters = main_characters + side_characters

# Convert the selenium elements to plain strings
char_list = [elm.text for elm in characters]

# Build the character dataframe: full name plus first name (text before the
# first space)
char_df = pd.DataFrame(char_list, columns=['characters'])
char_df['character_firstname'] = char_df['characters'].apply(lambda x: x.split(' ', 1)[0])

# Manual corrections for specific rows. Written with .loc instead of chained
# indexing (char_df.col[i] = ...) so the assignment always updates char_df
# itself rather than a temporary copy.
firstname_overrides = {0: 'Ned', 19: 'Sam', 6: 'Mormont', 13: 'Bran'}
for row_label, first_name in firstname_overrides.items():
    char_df.loc[row_label, 'character_firstname'] = first_name
char_df.loc[19, 'characters'] = 'Sam Tarly'

### Functions

In [None]:
def ner(file_name):
    """
    Function to process text from a text file (.txt) using Spacy.
    
    Params:
    file_name -- name of txt file as string 
    
    Returns:
    a processed doc file using Spacy English language model 
    
    """
    # Load spacy English language model
    NER = spacy.load('en_core_web_sm')
    # Raise the pipeline's character limit so a whole book fits in one call
    NER.max_length = 2400382

    # Read the file named by the parameter (not a global) and make sure the
    # handle is closed once the text has been read
    with open(file_name) as book_file:
        book_text = book_file.read()

    book_doc = NER(book_text)
    return book_doc



def get_ne_list_per_sentence(spacy_doc):
    """
    Get a list of entities per sentence of a Spacy document and store in a dataframe.
    
    Params:
    spacy_doc -- a Spacy processed document
    
    Returns:
    a dataframe with one row per sentence and two columns: "sentence" (the
    sentence object yielded by spacy_doc.sents) and "entities" (list of the
    text of each named entity recognised in that sentence). An empty document
    yields an empty dataframe that still has both columns.
    """
    
    # One record per sentence, built from the document passed in (not from a
    # notebook-level global)
    rows = [
        {"sentence": sent, "entities": [ent.text for ent in sent.ents]}
        for sent in spacy_doc.sents
    ]

    # Explicit columns keep the schema stable even when there are no sentences
    sent_entity_df = pd.DataFrame(rows, columns=["sentence", "entities"])

    return sent_entity_df


def filter_entity(ent_list, char_df):
    """
    Function to filter out non-character entities.
    
    Params:
    ent_list -- list of entities to be filtered
    char_df -- a dataframe containing characters' names (column "characters")
               and characters' first names (column "character_firstname")
    
    Returns:
    a list of entities that are characters (matching by names or first names),
    in their original order and with duplicates kept.
    
    """
    # Build the lookup once instead of re-materialising both columns for
    # every entity
    known_names = set(char_df.characters) | set(char_df.character_firstname)
    return [ent for ent in ent_list if ent in known_names]


def create_relationships(df, window_size, max_relationships=233):
    """
    Create a dataframe of relationships based on the df dataframe (containing lists of characters per sentence) and the window size of n sentences.
    
    Params:
    df -- a dataframe containing a column called character_entities with the list of characters for each sentence of a document.
          Its index is expected to hold integer sentence labels in increasing order.
    window_size -- size of the windows (number of sentences) for creating relationships between two adjacent characters in the text.
                   Windows are taken with label-based slicing, which is inclusive, so a window
                   starting at label i covers labels i through i + window_size.
    max_relationships -- maximum number of distinct relationships to return; rows are kept in
                         order of first appearance (not by weight). Pass None to keep all.
                         Defaults to 233.
    
    Returns:
    a relationship dataframe containing 3 columns: source, target, value.
    An empty dataframe with those columns is returned when df is empty or no
    relationships are found.
    
    """
    result_columns = ["source", "target", "value"]
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    entities = df["character_entities"]
    last_label = df.index[-1]
    relationships = []

    for i in range(last_label):
        end_i = min(i + window_size, last_label)
        # Flatten the per-sentence character lists inside the window
        char_list = [char for sentence_chars in entities.loc[i:end_i]
                     for char in sentence_chars]

        # Remove duplicate characters next to each other
        char_unique = [char for pos, char in enumerate(char_list)
                       if pos == 0 or char != char_list[pos - 1]]

        # Each pair of neighbouring characters forms one relationship
        for a, b in zip(char_unique, char_unique[1:]):
            relationships.append({"source": a, "target": b})

    if not relationships:
        return pd.DataFrame(columns=result_columns)

    # Transform the relationship records into a DF and sort each row so that
    # a->b and b->a are counted as the same relationship
    relationship_df = pd.DataFrame(relationships)
    relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)

    # Weight of a relationship = number of times the pair occurred
    relationship_df["value"] = 1 
    relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

    # Keep only the first max_relationships pairs (order of first appearance)
    relationship_df = relationship_df.iloc[:max_relationships, :]
                
    return relationship_df


def centrality_dev(G):
    """
    Compute three centrality measures for every node of a graph.
    
    Params:
    G -- a networkx graph
    
    Returns:
    a list of three dictionaries, in this order: degree centrality,
    betweenness centrality, closeness centrality. Each dictionary is the
    result returned by the corresponding networkx function.
    
    """
    
    # Degree of centrality
    degree_dict = nx.degree_centrality(G)

    # Betweenness centrality 
    betweenness_dict = nx.betweenness_centrality(G)
    
    # Closeness centrality
    closeness_dict = nx.closeness_centrality(G)
        
    return [degree_dict, betweenness_dict, closeness_dict]