### Matching characters from screenplay and cast data

In [None]:
import pandas as pd

In [None]:
df_screenplays = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated_merged_characters.csv')

In [None]:
df_screenplays

Unnamed: 0,title,label,text
0,A History of Violence,character,MORNING
1,A History of Violence,character,A
2,A History of Violence,character,BILLY
3,A History of Violence,character,LELAND
4,A History of Violence,character,BILLY
...,...,...,...
670733,You Can Count on Me,character,SAMMY
670734,You Can Count on Me,character,TERRY
670735,You Can Count on Me,character,TERRY
670736,You Can Count on Me,character,TERRY


In [None]:
df_characters = df_screenplays.drop(columns = ['label'])

In [None]:
df_characters

Unnamed: 0,title,text
0,A History of Violence,MORNING
1,A History of Violence,A
2,A History of Violence,BILLY
3,A History of Violence,LELAND
4,A History of Violence,BILLY
...,...,...
670733,You Can Count on Me,SAMMY
670734,You Can Count on Me,TERRY
670735,You Can Count on Me,TERRY
670736,You Can Count on Me,TERRY


In [None]:
df_characters['text'] = df_characters['text'].astype(str).str.replace(r'[^a-zA-Z\s]', '', regex=True)
df_characters['text'] = df_characters['text'].str.split().str.join(' ')
df_characters['text'] = df_characters['text'].str.replace(
    r'\b(CONTD|CONT|CON|VO|VOICE|O.S.|OS|OFFSCREEN|SFX|CONTINUED|INSERT)\b', '', regex=True)
df_characters['text'] = df_characters['text'].str.strip().str.capitalize()
df_characters['dialogues_count'] = df_characters.groupby(['title', 'text'])['text'].transform('count')
df_characters = df_characters.drop_duplicates(subset = ['title', 'text']).reset_index(drop = True)

In [None]:
df_characters

Unnamed: 0,title,text,dialogues_count
0,A History of Violence,Morning,3
1,A History of Violence,A,2
2,A History of Violence,Billy,23
3,A History of Violence,Leland,15
4,A History of Violence,Night,9
...,...,...,...
49987,You Can Count on Me,Janie,9
49988,You Can Count on Me,Rudy sr oc,2
49989,You Can Count on Me,Rudy sr,13
49990,You Can Count on Me,St cop,1


In [None]:
df_characters['dialogues_percent'] = (df_characters['dialogues_count'] / df_characters.groupby('title')['dialogues_count'].transform('sum') * 100).round(2)

In [None]:
df_characters

Unnamed: 0,title,text,dialogues_count,dialogues_percent
0,A History of Violence,Morning,3,0.40
1,A History of Violence,A,2,0.26
2,A History of Violence,Billy,23,3.04
3,A History of Violence,Leland,15,1.98
4,A History of Violence,Night,9,1.19
...,...,...,...,...
49987,You Can Count on Me,Janie,9,1.10
49988,You Can Count on Me,Rudy sr oc,2,0.24
49989,You Can Count on Me,Rudy sr,13,1.59
49990,You Can Count on Me,St cop,1,0.12


Remove less popular characters

In [None]:
main_characters = df_characters[df_characters['dialogues_percent'] >= 2]
main_characters = main_characters.rename(columns={'text': 'character'})
main_characters = main_characters.sort_values(by = ['title', 'dialogues_percent'], ascending = [True, False]).reset_index(drop = True)
main_characters.columns = main_characters.columns.str.upper()

In [None]:
main_characters

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,"10,000 BC",Dleh,85,35.56
1,"10,000 BC",Tictic,45,18.83
2,"10,000 BC",Nakudu,31,12.97
3,"10,000 BC",Baku,23,9.62
4,"10,000 BC",Evolet,19,7.95
...,...,...,...,...
9474,xXx,Gibbons,64,11.41
9475,xXx,Petra,64,11.41
9476,xXx,Slovo,16,2.85
9477,xXx,Sam tannick,12,2.14


In [None]:
main_characters.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/main_characters.csv', index=False, encoding="utf-8-sig")

In [None]:
cast_data = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/actors_tmdb_details.csv')

In [None]:
cast_data

Unnamed: 0,TITLE,YEAR,MOVIE_ID,ACTOR_NAME,CHARACTER,GENDER,AGE_AT_RELEASE,ACTOR_ID
0,"10,000 BC",2008,7840,Steven Strait,D’Leh,Man,22,54815
1,"10,000 BC",2008,7840,Camilla Belle,Evolet,Woman,22,38670
2,"10,000 BC",2008,7840,Cliff Curtis,Tic-Tic,Man,40,7248
3,"10,000 BC",2008,7840,Nathanael Baring,Baku,Man,18,54805
4,"10,000 BC",2008,7840,Mo Zinal,Ka’Ren,Man,Unknown,54806
...,...,...,...,...,...,...,...,...
25869,xXx,2002,7451,Ted Maynard,James Tannick,Man,Unknown,730454
25870,xXx,2002,7451,Joe Bucaro III,Virg,Man,38,51302
25871,xXx,2002,7451,Chris Gann,T.J.,Man,30,154837
25872,xXx,2002,7451,Martin Hub,Ivan Podrov,Man,38,1054325


In [None]:
cast_data['CHARACTER'] = cast_data['CHARACTER'].astype(str).str.replace(r'[^a-zA-Z\s]', '', regex=True)
cast_data['CHARACTER'] = cast_data['CHARACTER'].str.split().str.join(' ')

In [None]:
cast_titles = set(cast_data['TITLE'].unique().tolist())
main_titles = set(main_characters['TITLE'].unique().tolist())

only_in_cast = cast_titles - main_titles
only_in_main = main_titles - cast_titles

different_titles = only_in_cast.union(only_in_main)

print(len(only_in_cast))
print(len(only_in_main), only_in_main)
print(len(different_titles))

In [None]:
def match_characters_simple(cast_data, main_characters):
    cast_extended = cast_data.copy()
    cast_extended['MATCHED_CHARACTER'] = None
    cast_extended['DIALOGUES_COUNT'] = None
    cast_extended['DIALOGUES_PERCENT'] = None

    main_characters = main_characters.copy()
    main_characters['ADDED'] = False

    for title in cast_extended['TITLE'].unique():
        cast_subset = cast_extended[cast_extended['TITLE'] == title]
        main_subset = main_characters[main_characters['TITLE'] == title]

        if main_subset.empty:
            continue

        main_subset_clean = main_subset['CHARACTER'].astype(str).str.lower().str.strip()

        for cast_id, cast_row in cast_subset.iterrows():
            if not isinstance(cast_row['CHARACTER'], str):
                continue
            cast_clean = cast_row['CHARACTER'].lower().strip()

            exact = main_subset.loc[main_subset_clean == cast_clean]
            if not exact.empty:
                idx = exact.index[0]
                main_characters.loc[idx, 'ADDED'] = True
                row = main_characters.loc[idx]

                cast_extended.loc[cast_id, 'MATCHED_CHARACTER'] = row['CHARACTER']
                cast_extended.loc[cast_id, 'DIALOGUES_COUNT'] = row['DIALOGUES_COUNT']
                cast_extended.loc[cast_id, 'DIALOGUES_PERCENT'] = row['DIALOGUES_PERCENT']
                continue

            cast_words = set(cast_clean.split())
            for idx, row in main_subset.iterrows():
                if main_characters.loc[idx, 'ADDED']:
                    continue
                main_words = set(str(row['CHARACTER']).lower().strip().split())
                if cast_words & main_words:
                    main_characters.loc[idx, 'ADDED'] = True
                    cast_extended.loc[cast_id, 'MATCHED_CHARACTER'] = row['CHARACTER']
                    cast_extended.loc[cast_id, 'DIALOGUES_COUNT'] = row['DIALOGUES_COUNT']
                    cast_extended.loc[cast_id, 'DIALOGUES_PERCENT'] = row['DIALOGUES_PERCENT']
                    break

    return cast_extended, main_characters


In [None]:
cast_extended, main_characters = match_characters_simple(cast_data, main_characters)

In [None]:
cast_extended

Unnamed: 0,TITLE,YEAR,MOVIE_ID,ACTOR_NAME,CHARACTER,GENDER,AGE_AT_RELEASE,ACTOR_ID,MATCHED_CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,"10,000 BC",2008,7840,Steven Strait,DLeh,Man,22,54815,Dleh,85,35.56
1,"10,000 BC",2008,7840,Camilla Belle,Evolet,Woman,22,38670,Evolet,19,7.95
2,"10,000 BC",2008,7840,Cliff Curtis,TicTic,Man,40,7248,Tictic,45,18.83
3,"10,000 BC",2008,7840,Nathanael Baring,Baku,Man,18,54805,Baku,23,9.62
4,"10,000 BC",2008,7840,Mo Zinal,KaRen,Man,Unknown,54806,Karen,13,5.44
...,...,...,...,...,...,...,...,...,...,...,...
25869,xXx,2002,7451,Ted Maynard,James Tannick,Man,Unknown,730454,Sam tannick,12,2.14
25870,xXx,2002,7451,Joe Bucaro III,Virg,Man,38,51302,,,
25871,xXx,2002,7451,Chris Gann,TJ,Man,30,154837,,,
25872,xXx,2002,7451,Martin Hub,Ivan Podrov,Man,38,1054325,,,


In [None]:
main_characters_nan = main_characters[main_characters['ADDED'] == False]

In [None]:
main_characters_nan

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT,ADDED
16,12 Strong,Jean,17,2.23,False
21,12 Years a Slave,Celeste,23,4.56,False
23,12 Years a Slave,Treach,18,3.57,False
37,1408,Lily,24,5.31,False
38,1408,I,23,5.09,False
...,...,...,...,...,...
9457,mid90s,Jorge,64,14.04,False
9466,"tick, tick... BOOM!",Jon,266,38.78,False
9470,"tick, tick... BOOM!",Jon on phone,16,2.33,False
9475,xXx,Petra,64,11.41,False


In [None]:
cast_extended_filtered = cast_extended[cast_extended['MATCHED_CHARACTER'].notna()]

In [None]:
cast_extended_filtered = cast_extended_filtered.drop(columns=['MOVIE_ID', 'YEAR', 'ACTOR_NAME', 'ACTOR_ID', 'CHARACTER'])

In [None]:
cast_extended_filtered.rename(columns = {'MATCHED_CHARACTER': 'CHARACTER'}, inplace = True)

In [None]:
cast_extended_filtered

Unnamed: 0,TITLE,GENDER,AGE_AT_RELEASE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,"10,000 BC",Man,22,Dleh,85,35.56
1,"10,000 BC",Woman,22,Evolet,19,7.95
2,"10,000 BC",Man,40,Tictic,45,18.83
3,"10,000 BC",Man,18,Baku,23,9.62
4,"10,000 BC",Man,Unknown,Karen,13,5.44
...,...,...,...,...,...,...
25854,xXx,Man,35,Xander,261,46.52
25856,xXx,Man,36,Yorgi,66,11.76
25857,xXx,Man,54,Gibbons,64,11.41
25858,xXx,Man,26,Shavers,12,2.14


In [None]:
cast_extended_filtered = cast_extended_filtered[['TITLE', 'CHARACTER', 'GENDER', 'AGE_AT_RELEASE', 'DIALOGUES_COUNT', 'DIALOGUES_PERCENT']]

In [None]:
cast_extended_filtered

Unnamed: 0,TITLE,CHARACTER,GENDER,AGE_AT_RELEASE,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,"10,000 BC",Dleh,Man,22,85,35.56
1,"10,000 BC",Evolet,Woman,22,19,7.95
2,"10,000 BC",Tictic,Man,40,45,18.83
3,"10,000 BC",Baku,Man,18,23,9.62
4,"10,000 BC",Karen,Man,Unknown,13,5.44
...,...,...,...,...,...,...
25854,xXx,Xander,Man,35,261,46.52
25856,xXx,Yorgi,Man,36,66,11.76
25857,xXx,Gibbons,Man,54,64,11.41
25858,xXx,Shavers,Man,26,12,2.14


In [None]:
main_characters_nan.drop(columns=['ADDED'], inplace = True)

In [None]:
main_characters_nan

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT
16,12 Strong,Jean,17,2.23
21,12 Years a Slave,Celeste,23,4.56
23,12 Years a Slave,Treach,18,3.57
37,1408,Lily,24,5.31
38,1408,I,23,5.09
...,...,...,...,...
9457,mid90s,Jorge,64,14.04
9466,"tick, tick... BOOM!",Jon,266,38.78
9470,"tick, tick... BOOM!",Jon on phone,16,2.33
9475,xXx,Petra,64,11.41


In [None]:
! pip install gender-guesser

Collecting gender-guesser
  Downloading gender_guesser-0.4.0-py2.py3-none-any.whl.metadata (3.0 kB)
Downloading gender_guesser-0.4.0-py2.py3-none-any.whl (379 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m379.3/379.3 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gender-guesser
Successfully installed gender-guesser-0.4.0


In [None]:
import gender_guesser.detector as gender_detector

In [None]:
main_characters_nan['GENDER'] = 'Unknown'

In [None]:
main_characters_nan

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT,GENDER
16,12 Strong,Jean,17,2.23,Unknown
21,12 Years a Slave,Celeste,23,4.56,Unknown
23,12 Years a Slave,Treach,18,3.57,Unknown
37,1408,Lily,24,5.31,Unknown
38,1408,I,23,5.09,Unknown
...,...,...,...,...,...
9457,mid90s,Jorge,64,14.04,Unknown
9466,"tick, tick... BOOM!",Jon,266,38.78,Unknown
9470,"tick, tick... BOOM!",Jon on phone,16,2.33,Unknown
9475,xXx,Petra,64,11.41,Unknown


In [None]:
d = gender_detector.Detector()
for id, row in main_characters_nan.iterrows():
  if row['GENDER']=='Unknown':
    name = str(row['CHARACTER']).split()[0]
    gender = d.get_gender(name)
    if gender == 'mostly_male' or gender == 'male':
      main_characters_nan.loc[id, 'GENDER'] = 'Man'
    if gender == 'mostly_female' or gender == 'female':
      main_characters_nan.loc[id, 'GENDER'] = 'Woman'
    if gender == 'unknown':
      main_characters_nan.loc[id, 'GENDER'] = 'Unknown'
    if gender == 'andy':
      main_characters_nan.loc[id, 'GENDER'] = 'Unknown'

In [None]:
main_characters_nan_fill = main_characters_nan[(main_characters_nan['GENDER']=='Unknown') &
 (main_characters_nan['DIALOGUES_PERCENT'] > 10)]

In [None]:
main_characters_filled = main_characters_nan[main_characters_nan['GENDER'] != 'Unknown']

In [None]:
main_characters_filled

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT,GENDER
16,12 Strong,Jean,17,2.23,Man
21,12 Years a Slave,Celeste,23,4.56,Woman
37,1408,Lily,24,5.31,Woman
39,1408,Lily over videolink,17,3.76,Woman
46,17 Again,Ed,125,15.74,Man
...,...,...,...,...,...
9444,Zookeeper,Wolf,15,2.40,Man
9457,mid90s,Jorge,64,14.04,Man
9466,"tick, tick... BOOM!",Jon,266,38.78,Man
9470,"tick, tick... BOOM!",Jon on phone,16,2.33,Man


In [None]:
cast_extended_filtered.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/cast_extended_filtered.csv', index=False, encoding="utf-8-sig")

In [None]:
main_characters_nan_fill.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/main_characters_nan_fill.csv', index=False, encoding="utf-8-sig")

Manual Character Completion (if possible)

In [None]:
main_characters_filled.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/main_characters_filled.csv', index=False, encoding="utf-8-sig")

In [None]:
main_characters_filled = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/main_characters_filled.csv')

In [None]:
main_characters_filled

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT,GENDER
0,12 Strong,Jean,17,2.23,Man
1,12 Years a Slave,Celeste,23,4.56,Woman
2,1408,Lily,24,5.31,Woman
3,1408,Lily over videolink,17,3.76,Woman
4,17 Again,Ed,125,15.74,Man
...,...,...,...,...,...
1025,Zookeeper,Wolf,15,2.40,Man
1026,mid90s,Jorge,64,14.04,Man
1027,"tick, tick... BOOM!",Jon,266,38.78,Man
1028,"tick, tick... BOOM!",Jon on phone,16,2.33,Man


In [None]:
main_characters_nan_filled = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/main_characters_nan_fill.csv')

In [None]:
main_characters_nan_filled = main_characters_nan_filled[main_characters_nan_filled['GENDER'] != 'Unknown']

In [None]:
main_characters_nan_filled

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT,GENDER
3,47 Ronin,Oishi,83,25.23,Man
5,Aftershock,Briggs,52,15.34,Man
6,All the Money in the World,Chace,136,22.11,Man
7,Autumn in New York,Wills,299,35.68,Man
10,Before Midnight,Celine,326,40.00,Woman
...,...,...,...,...,...
179,Tron: Legacy,Mega,63,11.01,Man
180,Tron: Legacy,Plexor,61,10.66,Man
183,V for Vendetta,Leader,117,10.62,Man
186,WALL-E,Captain,62,25.41,Man


In [None]:
main_characters_all = pd.concat([main_characters_filled, main_characters_nan_filled], ignore_index=True)

In [None]:
main_characters_all

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT,GENDER
0,12 Strong,Jean,17,2.23,Man
1,12 Years a Slave,Celeste,23,4.56,Woman
2,1408,Lily,24,5.31,Woman
3,1408,Lily over videolink,17,3.76,Woman
4,17 Again,Ed,125,15.74,Man
...,...,...,...,...,...
1112,Tron: Legacy,Mega,63,11.01,Man
1113,Tron: Legacy,Plexor,61,10.66,Man
1114,V for Vendetta,Leader,117,10.62,Man
1115,WALL-E,Captain,62,25.41,Man


In [None]:
cast_extended_filtered

Unnamed: 0,TITLE,CHARACTER,GENDER,AGE_AT_RELEASE,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,"10,000 BC",Dleh,Man,22,85,35.56
1,"10,000 BC",Evolet,Woman,22,19,7.95
2,"10,000 BC",Tictic,Man,40,45,18.83
3,"10,000 BC",Baku,Man,18,23,9.62
4,"10,000 BC",Karen,Man,Unknown,13,5.44
...,...,...,...,...,...,...
25854,xXx,Xander,Man,35,261,46.52
25856,xXx,Yorgi,Man,36,66,11.76
25857,xXx,Gibbons,Man,54,64,11.41
25858,xXx,Shavers,Man,26,12,2.14


In [None]:
main_characters_all_copy = main_characters_all.copy()
main_characters_all_copy['AGE_AT_RELEASE'] = 'Unknown'

In [None]:
main_characters_all_copy

Unnamed: 0,TITLE,CHARACTER,DIALOGUES_COUNT,DIALOGUES_PERCENT,GENDER,AGE_AT_RELEASE
0,12 Strong,Jean,17,2.23,Man,Unknown
1,12 Years a Slave,Celeste,23,4.56,Woman,Unknown
2,1408,Lily,24,5.31,Woman,Unknown
3,1408,Lily over videolink,17,3.76,Woman,Unknown
4,17 Again,Ed,125,15.74,Man,Unknown
...,...,...,...,...,...,...
1112,Tron: Legacy,Mega,63,11.01,Man,Unknown
1113,Tron: Legacy,Plexor,61,10.66,Man,Unknown
1114,V for Vendetta,Leader,117,10.62,Man,Unknown
1115,WALL-E,Captain,62,25.41,Man,Unknown


In [None]:
cast_extended_final = pd.concat([cast_extended_filtered, main_characters_all_copy], ignore_index = True)

In [None]:
cast_extended_final

Unnamed: 0,TITLE,CHARACTER,GENDER,AGE_AT_RELEASE,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,"10,000 BC",Dleh,Man,22,85,35.56
1,"10,000 BC",Evolet,Woman,22,19,7.95
2,"10,000 BC",Tictic,Man,40,45,18.83
3,"10,000 BC",Baku,Man,18,23,9.62
4,"10,000 BC",Karen,Man,Unknown,13,5.44
...,...,...,...,...,...,...
8541,Tron: Legacy,Mega,Man,Unknown,63,11.01
8542,Tron: Legacy,Plexor,Man,Unknown,61,10.66
8543,V for Vendetta,Leader,Man,Unknown,117,10.62
8544,WALL-E,Captain,Man,Unknown,62,25.41


In [None]:
cast_extended_final = cast_extended_final.sort_values(by = ['TITLE', 'DIALOGUES_PERCENT'], ascending = [True, False]).reset_index(drop = True)

In [None]:
cast_extended_final

Unnamed: 0,TITLE,CHARACTER,GENDER,AGE_AT_RELEASE,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,"10,000 BC",Dleh,Man,22,85,35.56
1,"10,000 BC",Tictic,Man,40,45,18.83
2,"10,000 BC",Nakudu,Man,Unknown,31,12.97
3,"10,000 BC",Baku,Man,18,23,9.62
4,"10,000 BC",Evolet,Woman,22,19,7.95
...,...,...,...,...,...,...
8541,xXx,Yorgi,Man,36,66,11.76
8542,xXx,Gibbons,Man,54,64,11.41
8543,xXx,Petra,Woman,Unknown,64,11.41
8544,xXx,Shavers,Man,26,12,2.14


In [None]:
cast_extended_final.insert(0, 'ID', range(len(cast_extended_final)))

In [None]:
cast_extended_final

Unnamed: 0,ID,TITLE,CHARACTER,GENDER,AGE_AT_RELEASE,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,0,"10,000 BC",Dleh,Man,22,85,35.56
1,1,"10,000 BC",Tictic,Man,40,45,18.83
2,2,"10,000 BC",Nakudu,Man,Unknown,31,12.97
3,3,"10,000 BC",Baku,Man,18,23,9.62
4,4,"10,000 BC",Evolet,Woman,22,19,7.95
...,...,...,...,...,...,...,...
8541,8541,xXx,Yorgi,Man,36,66,11.76
8542,8542,xXx,Gibbons,Man,54,64,11.41
8543,8543,xXx,Petra,Woman,Unknown,64,11.41
8544,8544,xXx,Shavers,Man,26,12,2.14


In [None]:
cast_extended_final.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/cast_data_final.csv', index=False, encoding="utf-8-sig")

In [None]:
cast_extended_final = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/cast_data_final.csv')

In [None]:
cast_extended_final

Unnamed: 0,ID,TITLE,CHARACTER,GENDER,AGE_AT_RELEASE,DIALOGUES_COUNT,DIALOGUES_PERCENT
0,0,"10,000 BC",Dleh,Man,22,85,35.56
1,1,"10,000 BC",Tictic,Man,40,45,18.83
2,2,"10,000 BC",Nakudu,Man,Unknown,31,12.97
3,3,"10,000 BC",Baku,Man,18,23,9.62
4,4,"10,000 BC",Evolet,Woman,22,19,7.95
...,...,...,...,...,...,...,...
8541,8541,xXx,Yorgi,Man,36,66,11.76
8542,8542,xXx,Gibbons,Man,54,64,11.41
8543,8543,xXx,Petra,Woman,Unknown,64,11.41
8544,8544,xXx,Shavers,Man,26,12,2.14


Selecting dialogues only from filtered characters + column transformation

In [1]:
import pandas as pd

In [2]:
df_screenplays_dialogues = pd.read_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/screenplay_annotated_merged.csv')

In [3]:
df_screenplays_dialogues

Unnamed: 0,title,label,text
0,A History of Violence,character,MORNING
1,A History of Violence,dialogue,A battered old motel by the side of the road T...
2,A History of Violence,character,A
3,A History of Violence,dialogue,large pathetic green papier mache dinosaur som...
4,A History of Violence,character,BILLY
...,...,...,...
1338832,You Can Count on Me,dialogue,Come on Sammy Look at me Look at me
1338833,You Can Count on Me,character,TERRY
1338834,You Can Count on Me,dialogue,Hey Sammy Remember when we were kids remember ...
1338835,You Can Count on Me,character,SAMMY


In [None]:
len(df_screenplays_dialogues)

1338837

In [None]:
valid_charcters = cast_extended_final.groupby('TITLE')['CHARACTER'].apply(set).to_dict()

In [None]:
for title, char in list(valid_charcters.items())[:4]:
  print(title, char)

10,000 BC {'Old mother', 'Evolet', 'Dleh', 'Tictic', 'Karen', 'Nakudu', 'Narrator', 'Baku'}
12 Strong {'Milo', 'Essex', 'Spencer', 'Nelson into radio', 'Nelson', 'Diller', 'Dostum', 'Jean', 'Coffers', 'Michaels'}
12 Years a Slave {'Ford', 'Freeman', 'Tibeats', 'Mistress epps', 'Solomon', 'Bass', 'Hamilton', 'Celeste', 'Epps', 'Patsey', 'Brown'}
127 Hours {'Megan', 'Aron', 'Kristi', 'Aron the host', 'Rana'}


In [None]:
id_to_keep = []
for i in range(len(df_screenplays_dialogues)):
    row = df_screenplays_dialogues.iloc[i]
    title = row['title']
    label = row['label']
    text = str(row['text']).capitalize()
    if label == 'character':
        if title in valid_charcters and text in valid_charcters[title]:
            id_to_keep.append(i)
            if i + 1 < len(df_screenplays_dialogues):
                id_to_keep.append(i + 1)
    else:
        continue

In [None]:
len(id_to_keep)

980642

In [None]:
df_dialogues_filtered = df_screenplays_dialogues.iloc[id_to_keep].reset_index(drop=True)

In [None]:
rows = df_dialogues_filtered[['title', 'label', 'text']].to_numpy()

row_list = []
i = 0

while i < len(rows):
    title, label, text = rows[i]

    if label == 'character':
        character = text
        dialogue = None

        if i + 1 < len(rows):
            next_title, next_label, next_text = rows[i + 1]
            if next_label == 'dialogue' and next_title == title:
                dialogue = next_text
                i += 2

                row_list.append({
                    'title': title,
                    'character': character.capitalize(),
                    'dialogue': dialogue
                })
                continue

        row_list.append({
            'title': title,
            'character': character,
            'dialogue': dialogue
        })

    i += 1


In [None]:
df_dialogues_filtered_final = pd.DataFrame(row_list)

In [None]:
df_dialogues_filtered_final

Unnamed: 0,title,character,dialogue
0,A History of Violence,Billy,You think maybe we could just turn around
1,A History of Violence,Billy,shorter chunkier wearing jeans and an old John...
2,A History of Violence,Billy,Yeah I guess Im just tired though
3,A History of Violence,Billy,Johnny Leland humming along with Johnny slides...
4,A History of Violence,Billy,Yuh
...,...,...,...
490866,You Can Count on Me,Sammy,Please dont go till you know where youre going...
490867,You Can Count on Me,Terry,I do know where Im going Im going to Worcester...
490868,You Can Count on Me,Terry,Come on Sammy Look at me Look at me
490869,You Can Count on Me,Terry,Hey Sammy Remember when we were kids remember ...


In [None]:
df_dialogues_filtered_final.to_csv('/content/drive/MyDrive/PROJECT/movie_screenplays/dialogues_data_final.csv', index=False, encoding="utf-8-sig")