In [43]:
import pandas as pd
import re
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download necessary NLTK data (run once if needed)
nltk.download('punkt')        # For tokenization
nltk.download('stopwords')    # For stopwords
nltk.download('wordnet')      # For lemmatization

# Initialize necessary NLP tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()  # For Lemmatization
stemmer = PorterStemmer()         # For Stemming (optional)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arthu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
movies = pd.read_csv('../data/processed/merged_movies.csv')
characters = pd.read_csv('../data/MovieSummaries/character.metadata.tsv', sep = '\t', header = None)

# ignore name_clusters df
name_clusters = pd.read_csv('../data/MovieSummaries/name.clusters.txt', sep = '\t', header = None)
summaries = pd.read_csv('../data/processed/summaries_preprocessed.csv')
tv_tropes = pd.read_csv('../data/MovieSummaries/tvtropes.clusters.txt', sep = '\t', header = None)

In [45]:
characters.columns = ['wikipedia_movie_id', 'freebase_movie_id', 'movie_release_date', 'character_name', 'actor_birth', 'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name', 'actor_age', 'freebase_character_map', 'freebase_character_id', 'freebase_actor_id']

In [46]:
# removing freebase_character_ids gives approx same resulting df size
characters = characters.dropna(subset=['wikipedia_movie_id', 'character_name'])

Compare size of character, movie, and movie summary datasets.
Keep only the movies which index is common between the 3 sets

In [47]:
print("Number of movies:", movies.shape[0])
print("Number of summaries:", summaries.shape[0])

# shows 
n_movie_in_characters = characters.drop_duplicates(subset='wikipedia_movie_id', keep='first')
print("Number of movies where we know the characters:", n_movie_in_characters.shape[0])

Number of movies: 8954
Number of summaries: 42303
Number of movies where we know the characters: 32571


In [48]:
common_index = movies['wikipedia_movie_id'].isin(summaries['wikipedia_movie_id']) & movies['wikipedia_movie_id'].isin(characters['wikipedia_movie_id'])
common_index2 = movies['wikipedia_movie_id'].isin(characters['wikipedia_movie_id'])
common_index3 = movies['wikipedia_movie_id'].isin(summaries['wikipedia_movie_id'])
filtered_movies_summaries_characters = movies[common_index]
filtered_movies_characters = movies[common_index2]
filtered_movies_summaries = movies[common_index3]

filtered_movies -> have two filtered datasets on characters and on summaries+characters since summaries currently not used?


In [49]:
tv_tropes.columns = ['trope', 'details']
tv_tropes['details'] = tv_tropes['details'].apply(eval)
tv_tropes = tv_tropes.join(pd.json_normalize(tv_tropes['details'])).drop(columns=['details'])

In [50]:
tv_tropes.columns = [
    'trope',
    'character_name',          # Change `char` to `character_name`
    'movie_name',              # Change `movie` to `movie_name`
    'freebase_movie_id',       # Change `id` to `freebase_movie_id`
    'actor_name'               # Change `actor` to `actor_name`
]

In [51]:
trope_counts = tv_tropes['trope'].value_counts().reset_index()
trope_counts.columns = ['trope', 'count']

Leave clusters with archetypes for now

Add tropes to character df

In [52]:
merged_characters = characters.merge(tv_tropes[['character_name', 'trope', 'actor_name']],
                                      on=['character_name', 'actor_name'],
                                      how='left',
                                      indicator=True)

Ignore part to categorize character from plot (virtually impossible)

In [53]:
# check uniqueness for both merges ??

# Check unique entries in filtered_movies
unique_filtered_movies = filtered_movies_summaries_characters['wikipedia_movie_id'].nunique()
total_filtered_movies = filtered_movies_summaries_characters.shape[0]

# Check unique entries in summaries
unique_summaries = summaries['wikipedia_movie_id'].nunique()
total_summaries = summaries.shape[0]

print(f"Unique entries in filtered_movies: {unique_filtered_movies}, Total entries: {total_filtered_movies}")
print(f"Unique entries in summaries: {unique_summaries}, Total entries: {total_summaries}")

Unique entries in filtered_movies: 6927, Total entries: 6989
Unique entries in summaries: 42303, Total entries: 42303


In [54]:
filtered_movies = filtered_movies_summaries_characters.drop_duplicates()

In [None]:
filtered_movies

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001.0,14010832.0,98.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv..."
2,171005,/m/016ywb,Henry V,1989.0,10161099.0,137.0,['English Language'],['United Kingdom'],"['Costume drama', 'War film', 'Epic', 'Period ..."
3,77856,/m/0kcn7,Mary Poppins,1964.0,102272727.0,139.0,['English Language'],['United States of America'],"[""Children's/Family"", 'Musical', 'Fantasy', 'C..."
5,156558,/m/014k4y,Baby Boy,2001.0,29381649.0,123.0,['English Language'],['United States of America'],"['Crime Fiction', 'Drama', 'Coming of age']"
8,12008535,/m/02vlsqt,Rudo y Cursi,2008.0,11091868.0,103.0,['Spanish Language'],"['Mexico', 'United States of America']","['Sports', 'Drama', 'Family Drama', 'Comedy-dr..."
...,...,...,...,...,...,...,...,...,...
8948,1191380,/m/04f_y7,Wilde,1997.0,2158775.0,118.0,['English Language'],"['Kingdom of Great Britain', 'Japan', 'England...","['LGBT', 'Biography', 'Japanese Movies', 'Hist..."
8949,54540,/m/0f7hw,Coming to America,1988.0,288752301.0,117.0,['English Language'],['United States of America'],"['Romantic comedy', 'Comedy of manners', 'Dram..."
8950,7761830,/m/0kvgqb,Spaced Invaders,1990.0,15369573.0,100.0,['English Language'],['United States of America'],"['Alien Film', 'Science Fiction', 'Family Film..."
8951,1918494,/m/0660qx,State and Main,2000.0,6944471.0,106.0,"['Italian Language', 'English Language']","['France', 'United States of America']","['Parody', 'Americana', 'Comedy']"


In [56]:
characters = merged_characters

All entries in both DataFrames are unique. We can merge on 'wikipedia_movie_id'. Wait they are not unique

In [57]:
characters_with_tropes = characters[characters['trope'].notna()]

# 2. Identify movie IDs with summaries
movie_ids_with_summaries = set(summaries['wikipedia_movie_id'])

# 3. Keep characters that have either a trope or a summary
characters_with_summaries = characters[characters['wikipedia_movie_id'].isin(movie_ids_with_summaries)]

combined_characters = pd.concat([characters_with_tropes, characters_with_summaries]).drop_duplicates()
df_characters = combined_characters.drop_duplicates(subset='character_name')

print("Number of charcaters:", characters.shape[0])
print("Number of characters with either a trope or a summary:", df_characters.shape[0])

Number of charcaters: 192928
Number of characters with either a trope or a summary: 96235


In [58]:
character_counts_per_movie = df_characters.groupby('wikipedia_movie_id').size().reset_index(name='character_count')

# Display character counts per movie
print("\nNumber of characters per movie:")
print(character_counts_per_movie)


Number of characters per movie:
       wikipedia_movie_id  character_count
0                    3217               10
1                    3746               11
2                    3837               13
3                    3947                8
4                    4227                1
...                   ...              ...
20220            36724042                1
20221            36814246                5
20222            36956792               23
20223            37373877                2
20224            37501922                2

[20225 rows x 2 columns]


Use imdb dataset that only has 3 main characters per movie to filter out 'unimportant' characters

In [59]:
imdb_movies = pd.read_csv('../data/raw/imdb_5000_movies.csv')

In [60]:
imdb_movies['movie_title'] = imdb_movies['movie_title'].str.strip().str.replace(u'\xa0', '')
imdb_selected = imdb_movies[['movie_title', 'title_year', 'actor_1_name', 'actor_2_name', 'actor_3_name']]

# filtered movies -> movies where there are characters as well as summary
# inner merge? -> maybe not since we are losing movies that are in original dataset but not
# in the imdb one?
# doesn't change anything keep inner merge

merged_movies = pd.merge(
    filtered_movies, imdb_selected,
    left_on=['movie_name', 'movie_release_date'],
    right_on=['movie_title', 'title_year'],
    how='inner'
)

final_characters = df_characters.merge(merged_movies[['wikipedia_movie_id', 'actor_1_name', 'actor_2_name', 'actor_3_name']],
                                        on='wikipedia_movie_id',
                                        how='inner')

In [61]:
final_characters = final_characters[
    (final_characters['actor_name'].isin(final_characters['actor_1_name'])) |
    (final_characters['actor_name'].isin(final_characters['actor_2_name']))
]

print(f"Number of final characters: {final_characters.shape[0]}")

Number of final characters: 6715


In [90]:
final_characters

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age,freebase_character_map,freebase_character_id,freebase_actor_id,trope
3,22144721,/m/05qbckf,2010-04-26,Tony Stark,1965-04-04,M,1.750,/m/041rx,Robert Downey Jr.,45.0,/m/0j1__j1,/m/0ghbqqc,/m/016z2j,byronic_hero
5,670226,/m/031gmx,2004-06-03,Toombs,1961-11-18,M,1.880,,Nick Chinlund,42.0,/m/0k4qdz,/m/0h5pbks,/m/079mh2,bounty_hunter
6,999394,/m/03y0pn,2006-06-24,Cutler Beckett,1967-08-25,M,1.651,/m/02w7gg,Tom Hollander,38.0,/m/0k1xyw,/m/0cwtr9,/m/0755wz,corrupt_corporate_executive
7,999394,/m/03y0pn,2006-06-24,Captain Jack Sparrow,1963-06-09,M,1.780,/m/01qhm_,Johnny Depp,43.0,/m/0k1xxm,/m/01x5g2,/m/0jfx1,byronic_hero
8,999394,/m/03y0pn,2006-06-24,Norrington,1973-03-01,M,1.854,,Jack Davenport,33.0,/m/0k1xyd,/m/0cgrw9b,/m/02gqcs,officer_and_a_gentleman
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26834,22427855,/m/05zkcsk,2009-01-20,Marty Buchwald,1955-08-19,M,1.750,,Peter Gallagher,53.0,/m/062x1hw,/m/0glmmwb,/m/0301yj,
26835,22427855,/m/05zkcsk,2009-01-20,Rebecca Buchwald,1953-09-10,F,1.630,/m/041rx,Amy Irving,55.0,/m/062x1j0,/m/0glmmwk,/m/030h95,
26838,22427855,/m/05zkcsk,2009-01-20,Mr. Wardlow,1950-08-11,M,,,Adam LeFevre,58.0,/m/0cg4hb0,/m/0glmmx6,/m/07hd6p,
26841,25920477,/m/0b6lqyd,2011-03-11,Colter Stevens,1980-12-19,M,1.830,/m/065b6q,Jake Gyllenhaal,30.0,/m/0cpl4h_,/m/0ggkqv7,/m/02js6_,


In [62]:
# attempt with left merge
imdb_renamed = imdb_selected.rename(columns={'movie_title': 'movie_name', 'title_year': 'movie_release_date'})


merged_movies_left = filtered_movies.merge(imdb_renamed, on=['movie_name', 'movie_release_date'], how='left', indicator=True)

In [63]:
sorted_characters = characters.sort_values(by='wikipedia_movie_id', ascending=False)
sorted_characters

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age,freebase_character_map,freebase_character_id,freebase_actor_id,trope,_merge
183853,37501922,/m/0c0m5vt,1992,John Hunter,1966-10-11,M,1.765,,Luke Perry,25.0,/m/0gyqn_q,/m/0gyqn_s,/m/01g65g,,left_only
183854,37501922,/m/0c0m5vt,1992,Craig Murphy,1969-07-28,F,1.720,/m/041rx,Alexis Arquette,22.0,/m/0h35_7c,/m/0h35_7g,/m/02zjrf,,left_only
192341,37478048,/m/0g57c0y,1996,Ajay,1959-08-29,M,1.830,/m/09m6hr,Akkineni Nagarjuna,36.0,/m/0gw442b,/m/0h18dfy,/m/06pwf6,,left_only
28890,37373877,/m/02vr316,2006-10-31,Beth Patterson,1970-02-04,F,1.600,/m/03w9bjf,Gabrielle Anwar,36.0,/m/03js7xs,/m/0gydnxw,/m/03s_y5,,left_only
28891,37373877,/m/02vr316,2006-10-31,Jennifer Jones,1968-12-22,F,1.700,,Dina Meyer,37.0,/m/04db__3,/m/0h2m2b3,/m/02n305,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101746,3217,/m/014hr,1992-10-09,Possessed Witch,1957-09-04,F,1.750,,Patricia Tallman,35.0,/m/04hzm5r,/m/0hgcnl6,/m/045kf0,,left_only
101745,3217,/m/014hr,1992-10-09,Sheila,1965-08-11,F,1.730,/m/06v41q,Embeth Davidtz,27.0,/m/0jtqtk,/m/0h36j9w,/m/034b7r,,left_only
101744,3217,/m/014hr,1992-10-09,Linda,1964-01-27,F,1.680,/m/07hwkr,Bridget Fonda,28.0,/m/0jtqtq,/m/0h2hxrl,/m/01yd8v,,left_only
101743,3217,/m/014hr,1992-10-09,Cowardly Warrior,1965-12-14,M,1.870,,Ted Raimi,26.0,/m/04hzm5x,/m/0hgcnjh,/m/07qn0,,left_only


In [64]:
# gets movies with only 3 characters max (assume they are the main characters)
char_count_per_movie = sorted_characters.groupby('wikipedia_movie_id')['character_name'].nunique()
movies_with_less_than_3_actors = char_count_per_movie[char_count_per_movie < 4].index

# Keep only those rows in the original DataFrame where the 'wikipedia_movie_id' is in the filtered list
characters_lessthan3_df = sorted_characters[sorted_characters['wikipedia_movie_id'].isin(movies_with_less_than_3_actors)]

characters_lessthan3_df

# NEED TO DO THE FILTERING IN 2 STEPS -> MOVIES WITH LESS thAN 3 CHARACTERS, THEN FILTER MOVIES THAT HAVE MORE THAN 3 BUT ARE IN IMDB SO WE CAN FILTER THERE
# ALSO MAKE DF SIMPLY OF ALL MOVIES WITH REVENUE AND SUMMARIES FOR ADAM'S ANALYSIS
# WHEN ADDING TO THE 2555 MOVIES, IDENTIFY THOSE NOT IN THE 2555 FOR THE NEXT WEB SCRAPE TO THEN MERGE ON ALL 

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,actor_age,freebase_character_map,freebase_character_id,freebase_actor_id,trope,_merge
183853,37501922,/m/0c0m5vt,1992,John Hunter,1966-10-11,M,1.765,,Luke Perry,25.0,/m/0gyqn_q,/m/0gyqn_s,/m/01g65g,,left_only
183854,37501922,/m/0c0m5vt,1992,Craig Murphy,1969-07-28,F,1.720,/m/041rx,Alexis Arquette,22.0,/m/0h35_7c,/m/0h35_7g,/m/02zjrf,,left_only
192341,37478048,/m/0g57c0y,1996,Ajay,1959-08-29,M,1.830,/m/09m6hr,Akkineni Nagarjuna,36.0,/m/0gw442b,/m/0h18dfy,/m/06pwf6,,left_only
28890,37373877,/m/02vr316,2006-10-31,Beth Patterson,1970-02-04,F,1.600,/m/03w9bjf,Gabrielle Anwar,36.0,/m/03js7xs,/m/0gydnxw,/m/03s_y5,,left_only
28891,37373877,/m/02vr316,2006-10-31,Jennifer Jones,1968-12-22,F,1.700,,Dina Meyer,37.0,/m/04db__3,/m/0h2m2b3,/m/02n305,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157659,8481,/m/02c6d,1980-06-23,Doctor Robert Elliott,1933-03-14,M,1.880,,Michael Caine,47.0,/m/0jxlgd,/m/0gygjrb,/m/0gnbw,,left_only
157658,8481,/m/02c6d,1980-06-23,Kate Miller,1931-09-30,F,1.651,,Angie Dickinson,48.0,/m/0jxlgk,/m/0h13krt,/m/02lyx4,,left_only
134768,7906,/m/026j7,1939,Thomas Jefferson 'Tom' Destry Jr.,1908-05-20,M,1.910,/m/07bch9,James Stewart,30.0,/m/0jx6h0,/m/0gz5m4t,/m/044qx,,left_only
150352,5035,/m/01l40,1956-05-11,Dr. Eric Vornoff,1882-10-20,M,1.850,/m/08hpk0,Béla Lugosi,,/m/0jvszp,/m/0h4y7bk,/m/01l3j,,left_only


In [65]:
# Take movies with less than x characters and make it one row per movie with the x character names and actor names as columns
import numpy as np

# Assuming you have your original DataFrame named df
# Group the data by wikipedia_movie_id and collect actor names
grouped = characters_lessthan3_df.groupby('wikipedia_movie_id').agg({
    'actor_name': lambda x: list(x),  # Collect actor names as a list
    'character_name': lambda x: list(x)  # Collect character names as a list
}).reset_index()


def extract_info(info_list):
    # If there are fewer than 3 items, pad with NaN
    info_list += [np.nan] * (3 - len(info_list))
    return info_list[:3]

# Apply the function to create separate actor and character columns
grouped[['actor1_name', 'actor2_name', 'actor3_name']] = pd.DataFrame(
    grouped['actor_name'].apply(extract_info).to_list(),
    index=grouped.index
)

grouped[['character1_name', 'character2_name', 'character3_name']] = pd.DataFrame(
    grouped['character_name'].apply(extract_info).to_list(),
    index=grouped.index
)

# Drop the original actor_name and character_name columns
grouped = grouped.drop(columns=['actor_name', 'character_name'])

# Merge the new columns back to the original DataFrame
result = characters_lessthan3_df.drop_duplicates('wikipedia_movie_id').merge(
    grouped, on='wikipedia_movie_id', how='left'
)

# Keep only the relevant columns
columns_to_keep = ['wikipedia_movie_id', 'freebase_movie_id', 'movie_release_date', 
                   'actor1_name', 'actor2_name', 'actor3_name',
                   'character1_name', 'character2_name', 'character3_name']
result = result[columns_to_keep]

result.head()

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name
0,37501922,/m/0c0m5vt,1992,Luke Perry,Alexis Arquette,,John Hunter,Craig Murphy,
1,37478048,/m/0g57c0y,1996,Akkineni Nagarjuna,,,Ajay,,
2,37373877,/m/02vr316,2006-10-31,Gabrielle Anwar,Dina Meyer,,Beth Patterson,Jennifer Jones,
3,37322106,/m/0hzp_vq,2012-11-13,Shahrukh Khan,,,Major Samar,,
4,37090987,/m/080j6zc,1949-11-25,Shirley Temple,,,Corliss Archer,,


In [66]:
with_BO = characters_lessthan3_df.merge(movies, on='wikipedia_movie_id', how='inner')
with_BO.columns

Index(['wikipedia_movie_id', 'freebase_movie_id_x', 'movie_release_date_x',
       'character_name', 'actor_birth', 'actor_gender', 'actor_height',
       'actor_ethnicity', 'actor_name', 'actor_age', 'freebase_character_map',
       'freebase_character_id', 'freebase_actor_id', 'trope', '_merge',
       'freebase_movie_id_y', 'movie_name', 'movie_release_date_y',
       'movie_box_office_revenue', 'movie_runtime', 'movie_languages',
       'movie_countries', 'movie_genres'],
      dtype='object')

In [67]:
# Take movies with less than x characters and make it one row per movie with the x character names and actor names as columns
import numpy as np

# Assuming you have your original DataFrame named df
# Group the data by wikipedia_movie_id and collect actor names
groupedd = with_BO.groupby('wikipedia_movie_id').agg({
    'actor_name': lambda x: list(x),  # Collect actor names as a list
    'character_name': lambda x: list(x)  # Collect character names as a list
}).reset_index()


def extract_info(info_list):
    # If there are fewer than 3 items, pad with NaN
    info_list += [np.nan] * (3 - len(info_list))
    return info_list[:3]

# Apply the function to create separate actor and character columns
groupedd[['actor1_name', 'actor2_name', 'actor3_name']] = pd.DataFrame(
    groupedd['actor_name'].apply(extract_info).to_list(),
    index=groupedd.index
)

groupedd[['character1_name', 'character2_name', 'character3_name']] = pd.DataFrame(
    groupedd['character_name'].apply(extract_info).to_list(),
    index=groupedd.index
)

# Drop the original actor_name and character_name columns
groupedd = groupedd.drop(columns=['actor_name', 'character_name'])

# Merge the new columns back to the original DataFrame
resultt = with_BO.drop_duplicates('wikipedia_movie_id').merge(
    groupedd, on='wikipedia_movie_id', how='left'
)

# Keep only the relevant columns
columns_to_keep = ['wikipedia_movie_id', 'freebase_movie_id_x', 'movie_release_date_x', 
                   'actor1_name', 'actor2_name', 'actor3_name',
                   'character1_name', 'character2_name', 'character3_name', 'movie_box_office_revenue']
resultt = resultt[columns_to_keep]

resultt.sample(10)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date_x,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue
928,6406105,/m/0g44ng,1991-02-01,Kelly Preston,Patrick Dempsey,,Karen Landers,Charlie Farrow,,4409328.0
1104,4944673,/m/0cwfgz,1989-08-04,Tom Sizemore,Sylvester Stallone,Danny Trejo,Dallas,Frank Leone,Chink's Gang Member,22099847.0
1061,5257744,/m/0db6mh,1982-04-02,Richard Pryor,Margot Kidder,Richard Pryor,Cpl. Eddie Keller,Toni Donovan,Ted Segal,23671186.0
1312,3497579,/m/09gr4b,1949-05-04,Fred Astaire,Ginger Rogers,,Josh Barkley,Dinah Barkley,,3200000.0
1563,2382573,/m/0780z8,1998-01,Holly Madison,,,Miss Lady Bright Eyes,,,4000000.0
1721,1840870,/m/0603hj,1993-07-30,Wesley Snipes,Tia Carrere,Sean Connery,Lt. Webster Smith,Jingo Asakuma,Capt. John Connor,107198790.0
72,30470501,/m/0fpvxnz,2010,Maki Horikita,,,Yukiho Karasawa,,,4593914.0
285,21586321,/m/05myy97,1994-09-09,Gabriel Byrne,Joanne Whalley,William Hurt,Daniel Graham,Valerie Alston,Tommy Vesey,6971777.0
525,12985644,/m/02z28m8,2007-10-05,Kellita Smith,,,Tanya,,,6451106.0
2110,657824,/m/030bg4,1994-07-29,Thomas Gibson,Mira Sorvino,,Dickie Taylor,Marta Ferrer,,7266973.0


In [68]:
resultt.shape

(2385, 10)

In [69]:
columns_to_drop = ['_merge', 'actor_1_name', 'actor_2_name', 'actor_3_name']
final_characters = final_characters.drop(columns=columns_to_drop)

In [70]:
valid_wikipedia_ids = final_characters['wikipedia_movie_id'].unique()
filtered_merged_movies = merged_movies[merged_movies['wikipedia_movie_id'].isin(valid_wikipedia_ids)]

In [71]:
unique_ids_counss = filtered_merged_movies['wikipedia_movie_id'].nunique()
unique_ids_counss

2496

In [72]:
web_scrape_sample_1 = filtered_merged_movies['wikipedia_movie_id']

In [73]:
web_scrape_sample_1.to_csv('../data/processed/web_scrape_sample_1.csv')

In [74]:
half1 = web_scrape_sample_1.iloc[:len(web_scrape_sample_1)//2]
half2 = web_scrape_sample_1.iloc[len(web_scrape_sample_1)//2:]


half1.to_csv('../data/processed/web_scrape_sample_half_1.csv')
half2.to_csv('../data/processed/web_scrape_sample_half_2.csv')

In [75]:
# I have resultt and filtered_merged_movies

combined__ = resultt.merge(filtered_merged_movies, on='wikipedia_movie_id', how='outer', indicator=True)

In [76]:
combined__[combined__['_merge'] == 'both']

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date_x,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue_x,...,movie_runtime,movie_languages,movie_countries,movie_genres,movie_title,title_year,actor_1_name,actor_2_name,actor_3_name,_merge
3,4227,/m/01c9d,1975-12-18,Ryan O'Neal,,,Barry Lyndon,,,20000000.0,...,187.0,"['French Language', 'English Language', 'Germa...","['United States of America', 'United Kingdom']","['Costume drama', 'Film adaptation', 'Period p...",Barry Lyndon,1975.0,Ryan O'Neal,Steven Berkoff,Hardy Krüger,both
9,8481,/m/02c6d,1980-06-23,Michael Caine,Angie Dickinson,,Doctor Robert Elliott,Kate Miller,,31899000.0,...,105.0,['English Language'],['United States of America'],"['Thriller', 'Horror', 'Erotica', 'Mystery', '...",Dressed to Kill,1980.0,Angie Dickinson,David Margulies,Nancy Allen,both
52,42159,/m/0bl5c,1946,Fredric March,Myrna Loy,,Al Stephenson,Milly Stephenson,,23650000.0,...,168.0,['English Language'],['United States of America'],"['Ensemble Film', 'Family Drama', 'Americana',...",The Best Years of Our Lives,1946.0,Myrna Loy,Teresa Wright,Dana Andrews,both
72,45568,/m/0ccsq,1989-08-09,Michael Biehn,Ed Harris,Mary Elizabeth Mastrantonio,Lt. Hiram Coffey,Virgil 'Bud' Brigman,Lindsey Brigman,90000098.0,...,145.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Drama']",The Abyss,1989.0,Michael Biehn,Todd Graff,Mary Elizabeth Mastrantonio,both
87,60155,/m/0gclv,1933,Mae West,Cary Grant,,Lady Lou,Captain Cummings,,2200000.0,...,66.0,['English Language'],['United States of America'],"['Romantic comedy', 'Crime Comedy', 'Screwball...",She Done Him Wrong,1933.0,Mae West,Gilbert Roland,Louise Beavers,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4651,31267756,/m/0gjc1d3,2012,Josh Lucas,Nicolas Cage,,Cab Driver,Will Montgomery,,2106557.0,...,96.0,['English Language'],['United States of America'],['Action'],Stolen,2012.0,Nicolas Cage,Mark Valley,Sami Gayle,both
4656,31363435,/m/0g6tx_3,2010,Jacques Gamblin,Sara Forestier,,Arthur Martin,Bahia Benmahmoud,,513836.0,...,102.0,['French Language'],['France'],"['Romantic comedy', 'World cinema', 'Drama', '...",The Names of Love,2010.0,Sara Forestier,Jacques Gamblin,Zinedine Soualem,both
4715,33157118,/m/0h7lg4p,2004-10-14,Adam Garcia,Jacqueline Bisset,,Scott Doherty,Maureen Doherty,,16066.0,...,95.0,['English Language'],"['United Kingdom', 'Germany']","['Thriller', 'Erotic thriller', 'Psychological...",Fascination,2004.0,Adam Garcia,Jacqueline Bisset,Alice Evans,both
4752,34643655,/m/0gkyxjc,2013-07-18,Ryan Reynolds,,,Turbo,,,83024900.0,...,93.0,['English Language'],['United States of America'],"['Computer Animation', 'Animation']",Turbo,2013.0,Ryan Reynolds,Snoop Dogg,Ben Schwartz,both


In [77]:
combined__[combined__['_merge'] == 'left_only']

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date_x,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue_x,...,movie_runtime,movie_languages,movie_countries,movie_genres,movie_title,title_year,actor_1_name,actor_2_name,actor_3_name,_merge
44,31908,/m/07vcx,1991-12-25,Sam Neill,William Hurt,,Eugene Fitzpatrick,Sam Farber,,752856.0,...,,,,,,,,,,left_only
50,41881,/m/0bj25,1950,Marilyn Monroe,Anne Baxter,Bette Davis,Miss Caswell,Eve Harrington,Margo Channing,2900000.0,...,,,,,,,,,,left_only
63,43849,/m/0b_5d,1960-06-15,Shirley MacLaine,Jack Lemmon,,Fran Kubelik,C.C. 'Bud' Baxter,,24600000.0,...,,,,,,,,,,left_only
65,44218,/m/0c1p3,1936-06-26,Clark Gable,Spencer Tracy,,Blackie Norton,Father Mullin,,2868000.0,...,,,,,,,,,,left_only
70,44752,/m/0c5wn,1932-04-12,Joan Crawford,John Barrymore,Greta Garbo,Flaemmchen,Baron Felix Von Gaigern,Elizaveta Grushinskaya,2250000.0,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4782,36353137,/m/0k3m48j,2013,Lindsay Lohan,,,Tara,,,49494.0,...,,,,,,,,,,left_only
4783,36353520,/m/0fprltf,1953-01-31,Randolph Scott,,,Major Ransome Callicut,,,2000000.0,...,,,,,,,,,,left_only
4784,36353890,/m/09rr39n,1953-01-14,Julie Newmar,,,Specialty Dancer,,,1250000.0,...,,,,,,,,,,left_only
4785,36354224,/m/09rz62m,1953-06-19,Anita Ekberg,Fess Parker,,Dancehall Girl,Long John,,1000000.0,...,,,,,,,,,,left_only


In [78]:
combined__[combined__['_merge'] == 'right_only']

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date_x,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue_x,...,movie_runtime,movie_languages,movie_countries,movie_genres,movie_title,title_year,actor_1_name,actor_2_name,actor_3_name,_merge
0,3217,,,,,,,,,,...,81.0,['English Language'],['United States of America'],"['Cult', 'Horror', 'Stop motion', 'Costume dra...",Army of Darkness,1992.0,Patricia Tallman,Bridget Fonda,Embeth Davidtz,right_only
1,3746,,,,,,,,,,...,116.0,"['Japanese Language', 'Cantonese', 'English La...","['United States of America', 'Hong Kong']","['Thriller', 'Cyberpunk', 'Science Fiction', '...",Blade Runner,1982.0,Harrison Ford,Sean Young,M. Emmet Walsh,right_only
2,3837,,,,,,,,,,...,93.0,"['Yiddish Language', 'English Language']",['United States of America'],"['Western', 'Satire', 'Comedy']",Blazing Saddles,1974.0,Madeline Kahn,David Huddleston,Harvey Korman,right_only
4,4560,,,,,,,,,,...,175.0,"['French Language', 'Latin Language', 'English...",['United States of America'],"['Biography', 'Adventure', 'History', 'War fil...",Braveheart,1995.0,Mhairi Calvey,Patrick McGoohan,James Robinson,right_only
5,4726,,,,,,,,,,...,126.0,"['French Language', 'English Language']","['United States of America', 'United Kingdom']","['Crime Fiction', 'Thriller', 'Superhero movie...",Batman,1989.0,Michael Gough,Jack Palance,William Hootkins,right_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4766,35729649,,,,,,,,,,...,93.0,[],['United States of America'],"['Thriller', 'Mystery', 'Crime Fiction', 'Drama']",End of Watch,2012.0,Jake Gyllenhaal,Anna Kendrick,America Ferrera,right_only
4773,35874076,,,,,,,,,,...,93.0,['English Language'],['France'],"['Thriller', 'Crime Fiction', 'Action', 'Drama']",Taken 2,2012.0,Liam Neeson,Luke Grimes,D.B. Sweeney,right_only
4776,36019569,,,,,,,,,,...,127.0,['English Language'],['United States of America'],"['Drama', 'Action', 'Fantasy', 'Adventure']",Snow White and the Huntsman,2012.0,Chris Hemsworth,Kristen Stewart,Sam Claflin,right_only
4786,36534974,,,,,,,,,,...,93.0,['English Language'],['United States of America'],"['Drama', 'Fantasy']",Noah,2014.0,Anthony Hopkins,Emma Watson,Logan Lerman,right_only


In [79]:
unique_ids_countsss = combined__['wikipedia_movie_id'].nunique()
unique_ids_countsss

4730

In [80]:
id_counts = combined__['wikipedia_movie_id'].value_counts()

# Filter for ids that appear only once
unique_ids = id_counts[id_counts == 1].index

# Filter the DataFrame to show only rows with these unique ids
unique_rows = combined__[~combined__['wikipedia_movie_id'].isin(unique_ids)]

unique_rows

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date_x,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue_x,...,movie_runtime,movie_languages,movie_countries,movie_genres,movie_title,title_year,actor_1_name,actor_2_name,actor_3_name,_merge
32,30275,,,,,,,,,,...,96.0,['English Language'],['United States of America'],"['Horror', 'Indie', 'Doomsday film', 'Black-an...",Night of the Living Dead,1968.0,Judith O'Dea,Duane Jones,S. William Hinzman,right_only
33,30275,,,,,,,,,,...,96.0,['English Language'],['United States of America'],"['Horror', 'Indie', 'Doomsday film', 'Black-an...",Night of the Living Dead,1968.0,Judith O'Dea,Duane Jones,S. William Hinzman,right_only
144,75933,,,,,,,,,,...,119.0,['English Language'],"['United States of America', 'United Kingdom']","['Costume drama', 'Period piece', 'Drama', 'Co...",Dangerous Liaisons,1988.0,Keanu Reeves,Peter Capaldi,Swoosie Kurtz,right_only
145,75933,,,,,,,,,,...,119.0,['English Language'],"['United States of America', 'United Kingdom']","['Costume drama', 'Period piece', 'Drama', 'Co...",Dangerous Liaisons,1988.0,Keanu Reeves,Peter Capaldi,Swoosie Kurtz,right_only
182,91133,,,,,,,,,,...,127.0,"['Japanese Language', 'English Language']","['United States of America', 'Japan']","['Action/Adventure', 'Japanese Movies', 'Adven...",The Karate Kid,1984.0,Martin Kove,William Zabka,William Bassett,right_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4563,28882162,,,,,,,,,,...,95.0,['English Language'],"['United States of America', 'India', 'United ...","['Thriller', 'Science Fiction', 'Action']",Dredd,2012.0,Wood Harris,Jason Cope,Rakie Ayola,right_only
4635,30951080,,,,,,,,,,...,93.0,[],[],"['Romance Film', 'Drama']",The Great Gatsby,2013.0,Leonardo DiCaprio,Elizabeth Debicki,Steve Bisley,right_only
4636,30951080,,,,,,,,,,...,93.0,[],[],"['Romance Film', 'Drama']",The Great Gatsby,2013.0,Leonardo DiCaprio,Elizabeth Debicki,Steve Bisley,right_only
4720,33279717,,,,,,,,,,...,102.0,['English Language'],['United States of America'],"['Science Fiction', 'Comedy']",The Watch,2012.0,Will Forte,Nicholas Braun,Rosemarie DeWitt,right_only


In [81]:
resultt = resultt.drop_duplicates(subset='wikipedia_movie_id', keep='first')
filtered_merged_movies = filtered_merged_movies.drop_duplicates(subset='wikipedia_movie_id', keep='first')
combined__ = combined__.drop_duplicates(subset='wikipedia_movie_id', keep='first')

In [89]:
resultt

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date_x,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue
0,37322106,/m/0hzp_vq,2012-11-13,Shahrukh Khan,,,Major Samar,,,3047539.0
1,36354224,/m/09rz62m,1953-06-19,Anita Ekberg,Fess Parker,,Dancehall Girl,Long John,,1000000.0
2,36353890,/m/09rr39n,1953-01-14,Julie Newmar,,,Specialty Dancer,,,1250000.0
3,36353520,/m/0fprltf,1953-01-31,Randolph Scott,,,Major Ransome Callicut,,,2000000.0
4,36353137,/m/0k3m48j,2013,Lindsay Lohan,,,Tara,,,49494.0
...,...,...,...,...,...,...,...,...,...,...
2380,42159,/m/0bl5c,1946,Fredric March,Myrna Loy,,Al Stephenson,Milly Stephenson,,23650000.0
2381,41881,/m/0bj25,1950,Marilyn Monroe,Anne Baxter,Bette Davis,Miss Caswell,Eve Harrington,Margo Channing,2900000.0
2382,31908,/m/07vcx,1991-12-25,Sam Neill,William Hurt,,Eugene Fitzpatrick,Sam Farber,,752856.0
2383,8481,/m/02c6d,1980-06-23,Michael Caine,Angie Dickinson,,Doctor Robert Elliott,Kate Miller,,31899000.0


In [88]:
filtered_merged_movies

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,movie_title,title_year,actor_1_name,actor_2_name,actor_3_name
0,975900,/m/03vyhn,Ghosts of Mars,2001.0,14010832.0,98.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...",Ghosts of Mars,2001.0,Jason Statham,Clea DuVall,Natasha Henstridge
1,171005,/m/016ywb,Henry V,1989.0,10161099.0,137.0,['English Language'],['United Kingdom'],"['Costume drama', 'War film', 'Epic', 'Period ...",Henry V,1989.0,Brian Blessed,Derek Jacobi,Danny Webb
2,77856,/m/0kcn7,Mary Poppins,1964.0,102272727.0,139.0,['English Language'],['United States of America'],"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",Mary Poppins,1964.0,Ed Wynn,Glynis Johns,Elsa Lanchester
3,156558,/m/014k4y,Baby Boy,2001.0,29381649.0,123.0,['English Language'],['United States of America'],"['Crime Fiction', 'Drama', 'Coming of age']",Baby Boy,2001.0,Mo'Nique,Snoop Dogg,Angell Conwell
5,1305819,/m/04rjwf,C.H.U.D.,1984.0,4650000.0,96.0,['English Language'],['United States of America'],"['Monster movie', 'Natural horror films', 'Sci...",C.H.U.D.,1984.0,Daniel Stern,John Heard,Sam McMurray
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3063,10149834,/m/02q3fdr,Ponyo,2008.0,202614288.0,101.0,['Japanese Language'],['Japan'],"['Japanese Movies', 'Adventure', 'World cinema...",Ponyo,2008.0,Rumi Hiiragi,Yûki Amami,Yuria Nara
3064,2472440,/m/07gf00,Turbulence,1997.0,11538235.0,101.0,['English Language'],['United States of America'],"['Thriller', 'Disaster', 'Action', 'Action/Adv...",Turbulence,1997.0,Hector Elizondo,Lauren Holly,Jeffrey DeMunn
3065,22427855,/m/05zkcsk,Adam,2009.0,2549605.0,99.0,['English Language'],['United States of America'],"['Indie', 'Comedy-drama', 'Drama', 'Comedy', '...",Adam,2009.0,Peter Gallagher,Terry Walters,Amy Irving
3066,303933,/m/01s7w3,Twister,1996.0,494471524.0,113.0,['English Language'],['United States of America'],"['Action/Adventure', 'Disaster']",Twister,1996.0,Philip Seymour Hoffman,Alan Ruck,Jami Gertz


In [84]:
combined__.shape

(4730, 24)

In [85]:
test = pd.read_csv('../data/processed/less_than_3.csv')

In [86]:
test

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date_x,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue
0,37322106,/m/0hzp_vq,2012-11-13,Shahrukh Khan,,,Major Samar,,,3047539.0
1,36354224,/m/09rz62m,1953-06-19,Anita Ekberg,Fess Parker,,Dancehall Girl,Long John,,1000000.0
2,36353890,/m/09rr39n,1953-01-14,Julie Newmar,,,Specialty Dancer,,,1250000.0
3,36353520,/m/0fprltf,1953-01-31,Randolph Scott,,,Major Ransome Callicut,,,2000000.0
4,36353137,/m/0k3m48j,2013,Lindsay Lohan,,,Tara,,,49494.0
...,...,...,...,...,...,...,...,...,...,...
2380,42159,/m/0bl5c,1946,Fredric March,Myrna Loy,,Al Stephenson,Milly Stephenson,,23650000.0
2381,41881,/m/0bj25,1950,Marilyn Monroe,Anne Baxter,Bette Davis,Miss Caswell,Eve Harrington,Margo Channing,2900000.0
2382,31908,/m/07vcx,1991-12-25,Sam Neill,William Hurt,,Eugene Fitzpatrick,Sam Farber,,752856.0
2383,8481,/m/02c6d,1980-06-23,Michael Caine,Angie Dickinson,,Doctor Robert Elliott,Kate Miller,,31899000.0


In [101]:
import cpi
def adjust_for_inflation(df, year):

    def get_year(date):
        if pd.isna(date):  # Check if the date is NaN
            return None
        if isinstance(date, (int, float)):  # If it's already a year
            return int(date)
        elif isinstance(date, str):  # If it's a string, try to parse it as a date
            try:
                return pd.to_datetime(date).year
            except ValueError:
                return None  # Return None if date can't be parsed
        elif hasattr(date, 'year'):  # If it's a datetime object
            return date.year
        return None

    if 'movie_release_date_x' in df.columns:
        df.rename(columns={'movie_release_date_x': 'movie_release_date'}, inplace=True)

    df['years_only'] = df.apply(
        lambda row: get_year(row['movie_release_date']),
        axis=1
    )

    df['adjusted_revenue'] = df.apply(
        lambda row: cpi.inflate(row['movie_box_office_revenue'], int(row['years_only']), to=year) 
        if pd.notna(row['movie_box_office_revenue']) and pd.notna(row['years_only']) else None,
        axis=1
    )
    return df

In [94]:
def get_year(date):
        if pd.isna(date):  # Check if the date is NaN
            return None
        if isinstance(date, (int, float)):  # If it's already a year
            return int(date)
        elif isinstance(date, str):  # If it's a string, try to parse it as a date
            try:
                return pd.to_datetime(date).year
            except ValueError:
                return None  # Return None if date can't be parsed
        elif hasattr(date, 'year'):  # If it's a datetime object
            return date.year
        return None
test['years_only'] = test.apply(
        lambda row: get_year(row['movie_release_date']),
        axis=1
    )

In [95]:
test

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue,years_only
0,37322106,/m/0hzp_vq,2012-11-13,Shahrukh Khan,,,Major Samar,,,3047539.0,2012.0
1,36354224,/m/09rz62m,1953-06-19,Anita Ekberg,Fess Parker,,Dancehall Girl,Long John,,1000000.0,1953.0
2,36353890,/m/09rr39n,1953-01-14,Julie Newmar,,,Specialty Dancer,,,1250000.0,1953.0
3,36353520,/m/0fprltf,1953-01-31,Randolph Scott,,,Major Ransome Callicut,,,2000000.0,1953.0
4,36353137,/m/0k3m48j,2013,Lindsay Lohan,,,Tara,,,49494.0,2013.0
...,...,...,...,...,...,...,...,...,...,...,...
2380,42159,/m/0bl5c,1946,Fredric March,Myrna Loy,,Al Stephenson,Milly Stephenson,,23650000.0,1946.0
2381,41881,/m/0bj25,1950,Marilyn Monroe,Anne Baxter,Bette Davis,Miss Caswell,Eve Harrington,Margo Channing,2900000.0,1950.0
2382,31908,/m/07vcx,1991-12-25,Sam Neill,William Hurt,,Eugene Fitzpatrick,Sam Farber,,752856.0,1991.0
2383,8481,/m/02c6d,1980-06-23,Michael Caine,Angie Dickinson,,Doctor Robert Elliott,Kate Miller,,31899000.0,1980.0


In [102]:
oue = adjust_for_inflation(test, year=2015)

KeyboardInterrupt: 

In [100]:
oue

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue,years_only,adjusted_revenue
0,37322106,/m/0hzp_vq,2012-11-13,Shahrukh Khan,,,Major Samar,,,3047539.0,2012.0,
1,36354224,/m/09rz62m,1953-06-19,Anita Ekberg,Fess Parker,,Dancehall Girl,Long John,,1000000.0,1953.0,
2,36353890,/m/09rr39n,1953-01-14,Julie Newmar,,,Specialty Dancer,,,1250000.0,1953.0,
3,36353520,/m/0fprltf,1953-01-31,Randolph Scott,,,Major Ransome Callicut,,,2000000.0,1953.0,
4,36353137,/m/0k3m48j,2013,Lindsay Lohan,,,Tara,,,49494.0,2013.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
2380,42159,/m/0bl5c,1946,Fredric March,Myrna Loy,,Al Stephenson,Milly Stephenson,,23650000.0,1946.0,
2381,41881,/m/0bj25,1950,Marilyn Monroe,Anne Baxter,Bette Davis,Miss Caswell,Eve Harrington,Margo Channing,2900000.0,1950.0,
2382,31908,/m/07vcx,1991-12-25,Sam Neill,William Hurt,,Eugene Fitzpatrick,Sam Farber,,752856.0,1991.0,
2383,8481,/m/02c6d,1980-06-23,Michael Caine,Angie Dickinson,,Doctor Robert Elliott,Kate Miller,,31899000.0,1980.0,


In [87]:
tests = pd.read_csv('../data/processed/imdbtest.csv')
tests

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,movie_title,title_year,actor_1_name,actor_2_name,actor_3_name
0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...",Ghosts of Mars,2001.0,Jason Statham,Clea DuVall,Natasha Henstridge
1,171005,/m/016ywb,Henry V,1989,10161099.0,137.0,['English Language'],['United Kingdom'],"['Costume drama', 'War film', 'Epic', 'Period ...",Henry V,1989.0,Brian Blessed,Derek Jacobi,Danny Webb
2,77856,/m/0kcn7,Mary Poppins,1964,102272727.0,139.0,['English Language'],['United States of America'],"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",Mary Poppins,1964.0,Ed Wynn,Glynis Johns,Elsa Lanchester
3,156558,/m/014k4y,Baby Boy,2001,29381649.0,123.0,['English Language'],['United States of America'],"['Crime Fiction', 'Drama', 'Coming of age']",Baby Boy,2001.0,Mo'Nique,Snoop Dogg,Angell Conwell
4,1305819,/m/04rjwf,C.H.U.D.,1984,4650000.0,96.0,['English Language'],['United States of America'],"['Monster movie', 'Natural horror films', 'Sci...",C.H.U.D.,1984.0,Daniel Stern,John Heard,Sam McMurray
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2491,10149834,/m/02q3fdr,Ponyo,2008,202614288.0,101.0,['Japanese Language'],['Japan'],"['Japanese Movies', 'Adventure', 'World cinema...",Ponyo,2008.0,Rumi Hiiragi,Yûki Amami,Yuria Nara
2492,2472440,/m/07gf00,Turbulence,1997,11538235.0,101.0,['English Language'],['United States of America'],"['Thriller', 'Disaster', 'Action', 'Action/Adv...",Turbulence,1997.0,Hector Elizondo,Lauren Holly,Jeffrey DeMunn
2493,22427855,/m/05zkcsk,Adam,2009,2549605.0,99.0,['English Language'],['United States of America'],"['Indie', 'Comedy-drama', 'Drama', 'Comedy', '...",Adam,2009.0,Peter Gallagher,Terry Walters,Amy Irving
2494,303933,/m/01s7w3,Twister,1996,494471524.0,113.0,['English Language'],['United States of America'],"['Action/Adventure', 'Disaster']",Twister,1996.0,Philip Seymour Hoffman,Alan Ruck,Jami Gertz


In [4]:
yo = pd.read_csv('../data/processed/movies_x_principal_characters.csv')

In [5]:
yo

Unnamed: 0,wikipedia_movie_id,freebase_movie_id_x,movie_release_date,actor1_name,actor2_name,actor3_name,character1_name,character2_name,character3_name,movie_box_office_revenue,years_only,adjusted_revenue
0,37322106,/m/0hzp_vq,2012-11-13,Shahrukh Khan,,,Major Samar,,,3047539.0,2012.0,3.146069e+06
1,36354224,/m/09rz62m,1953-06-19,Anita Ekberg,Fess Parker,,Dancehall Girl,Long John,,1000000.0,1953.0,8.877041e+06
2,36353890,/m/09rr39n,1953-01-14,Julie Newmar,,,Specialty Dancer,,,1250000.0,1953.0,1.109630e+07
3,36353520,/m/0fprltf,1953-01-31,Randolph Scott,,,Major Ransome Callicut,,,2000000.0,1953.0,1.775408e+07
4,36353137,/m/0k3m48j,2013,Lindsay Lohan,,,Tara,,,49494.0,2013.0,5.035659e+04
...,...,...,...,...,...,...,...,...,...,...,...,...
2380,42159,/m/0bl5c,1946,Fredric March,Myrna Loy,,Al Stephenson,Milly Stephenson,,23650000.0,1946.0,2.874591e+08
2381,41881,/m/0bj25,1950,Marilyn Monroe,Anne Baxter,Bette Davis,Miss Caswell,Eve Harrington,Margo Channing,2900000.0,1950.0,2.852072e+07
2382,31908,/m/07vcx,1991-12-25,Sam Neill,William Hurt,,Eugene Fitzpatrick,Sam Farber,,752856.0,1991.0,1.310130e+06
2383,8481,/m/02c6d,1980-06-23,Michael Caine,Angie Dickinson,,Doctor Robert Elliott,Kate Miller,,31899000.0,1980.0,9.175492e+07


In [1]:
import pandas as pd

In [4]:
yoz = pd.read_csv('../data/processed/movies_trope_summary_main_actors_IMDB.csv')
yoz

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_name,movie_release_date,movie_box_office_revenue,movie_runtime,movie_languages,movie_countries,movie_genres,movie_title,title_year,actor_1_name,actor_2_name,actor_3_name,years_only,adjusted_revenue
0,975900,/m/03vyhn,Ghosts of Mars,2001,14010832.0,98.0,['English Language'],['United States of America'],"['Thriller', 'Science Fiction', 'Horror', 'Adv...",Ghosts of Mars,2001.0,Jason Statham,Clea DuVall,Natasha Henstridge,2001,1.875102e+07
1,171005,/m/016ywb,Henry V,1989,10161099.0,137.0,['English Language'],['United Kingdom'],"['Costume drama', 'War film', 'Epic', 'Period ...",Henry V,1989.0,Brian Blessed,Derek Jacobi,Danny Webb,1989,1.942220e+07
2,77856,/m/0kcn7,Mary Poppins,1964,102272727.0,139.0,['English Language'],['United States of America'],"[""Children's/Family"", 'Musical', 'Fantasy', 'C...",Mary Poppins,1964.0,Ed Wynn,Glynis Johns,Elsa Lanchester,1964,7.819476e+08
3,156558,/m/014k4y,Baby Boy,2001,29381649.0,123.0,['English Language'],['United States of America'],"['Crime Fiction', 'Drama', 'Coming of age']",Baby Boy,2001.0,Mo'Nique,Snoop Dogg,Angell Conwell,2001,3.932214e+07
4,3550323,/m/09kzfd,Things to Do in Denver When You're Dead,1995,529677.0,115.0,['English Language'],['United States of America'],"['Thriller', 'Crime Fiction', 'Crime Comedy', ...",Things to Do in Denver When You're Dead,1995.0,Steve Buscemi,Bill Cobbs,Treat Williams,1995,8.237694e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2823,2472440,/m/07gf00,Turbulence,1997,11538235.0,101.0,['English Language'],['United States of America'],"['Thriller', 'Disaster', 'Action', 'Action/Adv...",Turbulence,1997.0,Hector Elizondo,Lauren Holly,Jeffrey DeMunn,1997,1.703899e+07
2824,22427855,/m/05zkcsk,Adam,2009,2549605.0,99.0,['English Language'],['United States of America'],"['Indie', 'Comedy-drama', 'Drama', 'Comedy', '...",Adam,2009.0,Peter Gallagher,Terry Walters,Amy Irving,2009,2.816762e+06
2825,303933,/m/01s7w3,Twister,1996,494471524.0,113.0,['English Language'],['United States of America'],"['Action/Adventure', 'Disaster']",Twister,1996.0,Philip Seymour Hoffman,Alan Ruck,Jami Gertz,1996,7.469608e+08
2826,25920477,/m/0b6lqyd,Source Code,2011,147332697.0,93.0,['English Language'],"['France', 'United States of America']","['Thriller', 'Science Fiction', 'Action/Advent...",Source Code,2011.0,Jake Gyllenhaal,Cas Anvar,Russell Peters,2011,1.552437e+08
