In [83]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Dataframe with script lines

We are going to build a dataframe in which each row represents an script line. We will also define some characteristics such as the episode number, the season number and the character that says it.
\
To do this, we will go through the text file with the script line by line and selecting only the information we are interested in.

In [99]:
%%time
# Dataframe with sentences
df = pd.DataFrame()
scene_shift_keywords = ['scene shifts', 'cut to', '---']

with open('../data/raw/script_raw.txt', 'r') as f:
    
    for line in tqdm(f.readlines()):

        if line == "\n":
            continue
        elif np.sum([keyword in line.lower() for keyword in scene_shift_keywords]) > 0:
            current_scene += 1
            
        elif not line[0].isalpha() or line.endswith(']\n'):
            continue
        
        elif line.startswith('SEASON'):
            current_season = line.split()[1]
        elif line.startswith('EPISODE'):
            episode_name = ' '.join(line.split()[3:])
            current_episode = line.split()[1]
            current_scene = 1

        elif line.isupper():
            current_scene += 1
        elif ':' in line or '(' in line:
            if '(' in line:
                split_char = '('
            else:
                split_char = ':'
                
            character_name = line.split(split_char)[0]
        
            dialog = line.split(split_char)[1].replace('\n', '')
            if split_char == '(':
                dialog = '(' + dialog
            
            
            df = df.append({
                'season':int(current_season),
                'episode': int(current_episode),
                'character': character_name,
                'text': dialog,
                'episode_name': episode_name,
                'scene': current_scene,
                }, ignore_index=True)
            
df['episode'] = df['episode'].astype(int)
df['season'] = df['season'].astype(int)
df['scene'] = df['scene'].astype(int)
        
            
df.shape

100%|██████████| 67876/67876 [03:00<00:00, 375.47it/s]


CPU times: user 3min, sys: 748 ms, total: 3min 1s
Wall time: 3min


(25058, 6)

## Load ratings and merge with dialogues dataframe

In [138]:
ratings = pd.read_csv('../data/raw/ratings.csv')
ratings.head()

Unnamed: 0,episode_name,rate
0,Winter Is Coming,9.1
1,The Kingsroad,8.8
2,Lord Snow,8.7
3,"Cripples, Bastards, and Broken Things",8.8
4,The Wolf and the Lion,9.1


In [139]:
df['episode_name'] = df['episode_name'].str.lower()
ratings['episode_name'] = ratings['episode_name'].str.lower()

In [143]:
final_df = pd.merge(df, ratings, on='episode_name', how='left')
final_df['episode_name'] = final_df['episode_name'].str.title()
final_df.head()

Unnamed: 0,character,episode,episode_name,scene,season,text,rate
0,WAYMAR ROYCE,1,Winter Is Coming,1,1,What d’you expect? They’re savages. One lot s...,9.1
1,WILL,1,Winter Is Coming,1,1,I’ve never seen wildlings do a thing like thi...,9.1
2,WAYMAR ROYCE,1,Winter Is Coming,1,1,How close did you get?,9.1
3,WILL,1,Winter Is Coming,1,1,Close as any man would.,9.1
4,GARED,1,Winter Is Coming,1,1,We should head back to the wall.,9.1


In [146]:
final_df.to_csv('../data/processed/got.csv', index=False)