In [2]:
import spacy
import numpy as np
import pandas as pd

In [33]:
# Load the film-script table; later cells filter it on `title` and parse the `script` column.
df = pd.read_csv('cleaned_pudding_public_scripts.csv')

In [4]:
# Load spaCy's small English pipeline; `nlp` is the parser every later cell calls.
nlp = spacy.load('en_core_web_sm')
# Or use the default model, which has fewer features:
# NOTE(review): the 'en' shortcut name is presumably only available on spaCy 2.x —
# confirm the installed version before uncommenting.
# nlp = spacy.load('en')

In [5]:
# Smoke-test the pipeline on a single sentence: print each named entity with
# its start/end character offsets and its entity label.
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [6]:
# Display the entity spans spaCy recognised in the sample sentence.
doc.ents

(Apple, U.K., $1 billion)

In [7]:
# [prop for prop in dir(doc) if not prop.startswith('_')]

In [8]:
# Indexing a Doc yields a single Token; show its type alongside the token itself.
first_word = doc[0]
type(first_word), first_word

(spacy.tokens.token.Token, Apple)

In [9]:
# Draw the dependency parse of the current `doc` with displaCy.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from spacy import displacy
displacy.render(doc, style="dep")


In [18]:
# Take the script of the first film whose title contains "Kill Bill".
# The earlier pattern 'Kill' matched any title containing that substring, so
# which film ended up in `kill_bill_script` depended on row order in `df`.
# na=False treats missing titles as non-matches instead of producing a mask
# with NaN in it; regex=False makes the match a plain substring test.
# Raises IndexError if no title matches.
kill_bill_script = df.loc[
    df.title.str.contains('Kill Bill', na=False, regex=False), 'script'
].iloc[0]

In [21]:
# Parse the whole script. This rebinds `doc`, replacing the sample-sentence
# Doc created in the earlier cells.
doc = nlp(kill_bill_script)

In [22]:
# for ent in doc.ents:
#     if ent.label_ == 'GPE':
#         print(ent.text, ent.label_)

In [23]:
# from spacy import displacy
# displacy.render(doc, style="ent")

In [24]:
import warnings
# NOTE(review): with no category or module given, this silences every warning
# for the rest of the session, which can hide real problems (for example
# pandas copy-versus-view warnings). Consider narrowing it to the specific
# warning that prompted it.
warnings.filterwarnings("ignore")

In [49]:
def get_entities(row, type_entity):
    """Return the text of every entity of one label found in a row's script.

    Parameters
    ----------
    row : object with a ``script`` attribute
        Typically a DataFrame row, as passed by ``DataFrame.apply(..., axis=1)``.
    type_entity : str
        Entity label to keep; compared for equality with ``ent.label_``
        (for example ``'PERSON'``).

    Returns
    -------
    list of str
        ``ent.text`` for each matching entity, in document order, duplicates
        kept. An empty list when ``row.script`` is not a non-empty string.
    """
    script = row.script
    # A missing script cell typically arrives from read_csv as NaN (a float),
    # which the pipeline cannot parse; treat anything that is not real text
    # as "no entities" rather than letting one bad row abort a whole apply().
    if not isinstance(script, str) or not script:
        return []
    return [ent.text for ent in nlp(script).ents if ent.label_ == type_entity]
    
# Work on the first film whose title contains "Twilight".
# na=False treats missing titles as non-matches so the boolean mask is valid.
# .copy() makes `subset_df` an independent frame, so adding a column below
# does not assign onto a slice of `df` (the SettingWithCopyWarning pattern).
subset_df = df[df.title.str.contains('Twilight', na=False)].iloc[0:1].copy()

# Run NER over each selected script and keep the PERSON entities as a list per row.
subset_df['identified_entities'] = subset_df.apply(get_entities, axis=1, type_entity='PERSON')

In [38]:
# Read the three companion CSVs in one pass; all of them are decoded as latin-1.
character_list_df, metadata_df, character_mapping_df = (
    pd.read_csv(csv_name, encoding='latin-1')
    for csv_name in ('character_list5.csv', 'meta_data7.csv', 'character_mapping.csv')
)

In [39]:
# Attach film metadata to every character row. The left join keeps each
# character even when no metadata row shares its script_id.
merged_metadata_character_df = pd.merge(
    character_list_df,
    metadata_df,
    how='left',
    on='script_id',
)

In [40]:
# Preview the first rows of the character/metadata join.
merged_metadata_character_df.head()

Unnamed: 0,script_id,imdb_character_name,words,gender,age,imdb_id,title,year,gross,lines_data
0,280,betty,311,f,35.0,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...
1,280,carolyn johnson,873,f,,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...
2,280,eleanor,138,f,,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...
3,280,francesca johns,2251,f,46.0,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...
4,280,madge,190,f,46.0,tt0112579,The Bridges of Madison County,1995,142.0,4332023434343443203433434334433434343434434344...


In [43]:
# subset_exploded_df = subset_df.explode('identified_entities')
# subset_exploded_df = subset_exploded_df.drop_duplicates()

# grouped_character_list = merged_filmscripts_metadata_df.groupby(['title'])['imdb_character_name'].apply(list).reset_index(name='character_list')

# grouped_merged_character_list = merged_filmscripts_metadata_df.merge(grouped_character_list, on='title', how='left')

# Join the selected script rows to the character/metadata table on the four
# columns both frames share. The left join keeps every script row and repeats
# it once per matching character.
merged_filmscripts_metadata_df = pd.merge(
    subset_df,
    merged_metadata_character_df,
    how='left',
    on=['script_id', 'year', 'title', 'imdb_id'],
)

merged_filmscripts_metadata_df.head()

Unnamed: 0,imdb_id,script_id,title,year,gross_ia,link,status_code,script,script_length,identified_entities,imdb_character_name,words,gender,age,gross,lines_data
0,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",alice cullen,1200,f,22.0,344.0,1452445303446332253244454134430204464354443254...
1,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",angela,138,f,19.0,344.0,1452445303446332253244454134430204464354443254...
2,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",aro,442,m,40.0,344.0,1452445303446332253244454134430204464354443254...
3,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",bella swan,6542,f,19.0,344.0,1452445303446332253244454134430204464354443254...
4,tt1259571,5164,The Twilight Saga: New Moon,2009,344.0,http://www.imsdb.com/scripts/Twilight-New-Moon...,200.0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nThe In...,244703,"[Drama\nFamily, Fantasy\nFilm-Noir, Melissa Ro...",charlie swan,878,m,43.0,344.0,1452445303446332253244454134430204464354443254...
