## **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast #to process trees of the Python abstract syntax grammar


## **Import Dataset**

### **2018**

In [None]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2018"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [None]:
frames = [df1, df2, df3, df4]

df = pd.concat(frames)
df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,J A N U A R Y,5,Insidious: The Last Key,Universal Pictures / Blumhouse Productions / S...,Adam Robitel (director); Leigh Whannell (scree...,[2]
1,J A N U A R Y,5,The Strange Ones,Vertical Entertainment,Christopher Radcliff (director/screenplay); La...,[3]
2,J A N U A R Y,12,The Commuter,Lionsgate / StudioCanal / The Picture Company,Jaume Collet-Serra (director); Byron Willinger...,[4]
3,J A N U A R Y,12,Proud Mary,Screen Gems,"Babak Najafi (director); John S. Newman, Chris...",[5]
4,J A N U A R Y,12,Acts of Violence,Lionsgate Premiere,Brett Donowho (director); Nicolas Aaron Mezzan...,[6]


In [None]:
#
! pip install -q tmdbv3api

In [None]:
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = '0e08f345ffc26f23f3d54b0f9afb65b7'


In [None]:
from tmdbv3api import Movie
tmdb_movie = Movie()

# this function will pass in the title of the movies then retrieve the genre of the movie from IMDB
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x) #the title will be searched in the tmdb_movie
    movie_id = result[0].id #we will match the "id" with the "title"
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key)) #we will get the result from the IMDb data
    data_json = response.json() #we will then convert it to a json file
    if data_json['genres']: #in the json file we will only need to extract the "genre"
        genre_str = " "
        for i in range(0,len(data_json['genres'])):
            genres.append(data_json['genres'][i]['name']) #we will then add the "genre" to the empty genre list we created above
        return genre_str.join(genres)
    else:
        np.NaN # we will return the results but if we don't find anything we will consider it as a missing value

In [None]:
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))
df.head(3)

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,J A N U A R Y,5,Insidious: The Last Key,Universal Pictures / Blumhouse Productions / S...,Adam Robitel (director); Leigh Whannell (scree...,[2],Horror Thriller
1,J A N U A R Y,5,The Strange Ones,Vertical Entertainment,Christopher Radcliff (director/screenplay); La...,[3],Thriller Drama
2,J A N U A R Y,12,The Commuter,Lionsgate / StudioCanal / The Picture Company,Jaume Collet-Serra (director); Byron Willinger...,[4],Action Thriller Mystery


In [None]:
df_2018 = df[['Title','Cast and crew','genres']]
df_2018.head()

Unnamed: 0,Title,Cast and crew,genres
0,Insidious: The Last Key,Adam Robitel (director); Leigh Whannell (scree...,Horror Thriller
1,The Strange Ones,Christopher Radcliff (director/screenplay); La...,Thriller Drama
2,The Commuter,Jaume Collet-Serra (director); Byron Willinger...,Action Thriller Mystery
3,Proud Mary,"Babak Najafi (director); John S. Newman, Chris...",Thriller Action Crime
4,Acts of Violence,Brett Donowho (director); Nicolas Aaron Mezzan...,Action Crime Thriller


In [None]:
df_2018.shape

(249, 3)

## **Directors**

In [None]:
pd.set_option('display.max_colwidth',250)
df['Cast and crew'].head(3)

Unnamed: 0,Cast and crew
0,"Adam Robitel (director); Leigh Whannell (screenplay); Lin Shaye, Angus Sampson, Leigh Whannell, Spencer Locke, Caitlin Gerard, Bruce Davison"
1,"Christopher Radcliff (director/screenplay); Lauren Wolkstein (director); Alex Pettyfer, James Freedson-Jackson, Emily Althaus, Gene Jones, Owen Campbell, Tobias Campbell"
2,"Jaume Collet-Serra (director); Byron Willinger, Philip de Blasi, Ryan Engle (screenplay); Liam Neeson, Vera Farmiga, Patrick Wilson, Jonathan Banks, Sam Neill"


In [None]:
def get_director(x):
    # Split the row into segments by ";"
    segments = x.split(";")

    # Step 1: Prioritize "(director)"
    for segment in segments:
        if "(director)" in segment:
            return segment.split(" (director)")[0].strip()

    # Step 2: Check for "(directors)" if "(director)" is not found
    for segment in segments:
        if "(directors)" in segment:
            return segment.split(" (directors)")[0].strip()

    # Step 3a: Check for "(director/screenplay)" if neither "(director)" nor "(directors)" is found
    for segment in segments:
        if "(director/screenplay)" in segment:
            return segment.split(" (director/screenplay)")[0].strip()

    # Step 3b: Check for "(directors/screenplay)" if neither "(director)" nor "(directors)" is found
    for segment in segments:
        if "(directors/screenplay)" in segment:
            return segment.split(" (directors/screenplay)")[0].strip()

    # Step 3c: Check for "(directors/screenplay)" if neither "(director)" nor "(directors)" is found
    for segment in segments:
        if "(director/screenplay/narrator))" in segment:
            return segment.split("(director/screenplay/narrator)	")[0].strip()

    # Step 4: If no valid director-related keywords are found, return NaN
    return np.NaN



In [None]:
df_2018['director_name'] = df_2018['Cast and crew'].map(lambda x: get_director(x))
df_2018.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['director_name'] = df_2018['Cast and crew'].map(lambda x: get_director(x))


Unnamed: 0,Title,Cast and crew,genres,director_name
0,Insidious: The Last Key,"Adam Robitel (director); Leigh Whannell (screenplay); Lin Shaye, Angus Sampson, Leigh Whannell, Spencer Locke, Caitlin Gerard, Bruce Davison",Horror Thriller,Adam Robitel
1,The Strange Ones,"Christopher Radcliff (director/screenplay); Lauren Wolkstein (director); Alex Pettyfer, James Freedson-Jackson, Emily Althaus, Gene Jones, Owen Campbell, Tobias Campbell",Thriller Drama,Lauren Wolkstein
2,The Commuter,"Jaume Collet-Serra (director); Byron Willinger, Philip de Blasi, Ryan Engle (screenplay); Liam Neeson, Vera Farmiga, Patrick Wilson, Jonathan Banks, Sam Neill",Action Thriller Mystery,Jaume Collet-Serra
3,Proud Mary,"Babak Najafi (director); John S. Newman, Christian Swegal, Steve Antin (screenplay); Taraji P. Henson, Jahi Di'Allo Winston, Billy Brown, Danny Glover",Thriller Action Crime,Babak Najafi
4,Acts of Violence,"Brett Donowho (director); Nicolas Aaron Mezzanatto (screenplay); Bruce Willis, Cole Hauser, Shawn Ashmore, Ashton Holmes, Melissa Bolona, Sophia Bush, Mike Epps",Action Crime Thriller,Brett Donowho


In [None]:
df_2018[df_2018['director_name'].isna()]

Unnamed: 0,Title,Cast and crew,genres,director_name
24,Black Panther,"Ryan Coogler (director/​screenplay); Joe Robert Cole (screenplay); Chadwick Boseman, Michael B. Jordan, Lupita Nyong'o, Danai Gurira, Martin Freeman, Daniel Kaluuya, Letitia Wright, Winston Duke, Sterling K. Brown, Angela Bassett, Forest Whitaker...",Action Adventure Science Fiction,
5,Hotel Transylvania 3: Summer Vacation,"Genndy Tartakovsky (director/screenplay[119]); Michael McCullers (screenplay); Adam Sandler, Andy Samberg, Selena Gomez, Kevin James, David Spade, Steve Buscemi, Keegan-Michael Key, Molly Shannon, Fran Drescher, Kathryn Hahn, Jim Gaffigan, Mel Br...",Animation Comedy Family Fantasy,
61,Fahrenheit 11/9,Michael Moore (director/screenplay/narrator),Documentary,
64,Night School,"Malcolm D. Lee (director[177]); Kevin Hart, Harry Ratchford, Joey Wells, Matt Kellard, Nicholas Stoller,[178] John Hamburg (screenplay); Kevin Hart, Tiffany Haddish, Rob Riggle, Romany Malco",Comedy,


## **Actors**

In [None]:
def get_actor1(x):
    # Remove parts that include "(director)" or "(screenplay)" or "(director/screenplay)"
    cleaned = "; ".join([part for part in x.split("; ") if not any(keyword in part for keyword in ["(director)", "(screenplay)", "(director/screenplay)"])])

    # Extract the first actor from the cleaned string
    if ", " in cleaned:
        return cleaned.split(", ")[0].strip()
    else:
        return np.NaN  # Return NaN if no actors are found


In [None]:
df_2018['actor_1_name'] = df_2018['Cast and crew'].map(lambda x: get_actor1(x))

df_2018

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['actor_1_name'] = df_2018['Cast and crew'].map(lambda x: get_actor1(x))


Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name
0,Insidious: The Last Key,"Adam Robitel (director); Leigh Whannell (screenplay); Lin Shaye, Angus Sampson, Leigh Whannell, Spencer Locke, Caitlin Gerard, Bruce Davison",Horror Thriller,Adam Robitel,Lin Shaye
1,The Strange Ones,"Christopher Radcliff (director/screenplay); Lauren Wolkstein (director); Alex Pettyfer, James Freedson-Jackson, Emily Althaus, Gene Jones, Owen Campbell, Tobias Campbell",Thriller Drama,Lauren Wolkstein,Alex Pettyfer
2,The Commuter,"Jaume Collet-Serra (director); Byron Willinger, Philip de Blasi, Ryan Engle (screenplay); Liam Neeson, Vera Farmiga, Patrick Wilson, Jonathan Banks, Sam Neill",Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson
3,Proud Mary,"Babak Najafi (director); John S. Newman, Christian Swegal, Steve Antin (screenplay); Taraji P. Henson, Jahi Di'Allo Winston, Billy Brown, Danny Glover",Thriller Action Crime,Babak Najafi,Taraji P. Henson
4,Acts of Violence,"Brett Donowho (director); Nicolas Aaron Mezzanatto (screenplay); Bruce Willis, Cole Hauser, Shawn Ashmore, Ashton Holmes, Melissa Bolona, Sophia Bush, Mike Epps",Action Crime Thriller,Brett Donowho,Bruce Willis
...,...,...,...,...,...
58,Second Act,"Peter Segal (director); Justin Zackham, Elaine Goldsmith-Thomas (screenplay); Jennifer Lopez, Leah Remini, Vanessa Hudgens, Treat Williams, Milo Ventimiglia",Romance Comedy,Peter Segal,Jennifer Lopez
59,Holmes & Watson,"Etan Cohen (director/screenplay); Will Ferrell, John C. Reilly, Rebecca Hall, Rob Brydon, Kelly Macdonald, Steve Coogan, Ralph Fiennes",Comedy Mystery Crime,Etan Cohen,Will Ferrell
60,Vice,"Adam McKay (director/screenplay); Christian Bale, Amy Adams, Steve Carell, Tyler Perry, Alison Pill, Lily Rabe, Jesse Plemons, Sam Rockwell",Thriller Science Fiction Action Adventure,Adam McKay,Christian Bale
61,On the Basis of Sex,"Mimi Leder (director); Daniel Stiepleman (screenplay); Felicity Jones, Armie Hammer, Justin Theroux, Sam Waterston, Kathy Bates",Drama History,Mimi Leder,Felicity Jones


In [None]:
df_2018[df_2018['actor_1_name'].isna()]

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name
4,Whitney,Kevin Macdonald (director/screenplay); Whitney Houston,Music Documentary,Kevin Macdonald,
28,A Prayer Before Dawn,"Jean-Stéphane Sauvaire (director); Jonathan Hirschbein, Nick Saltrese (screenplay); Joe Cole",Drama Action Crime,Jean-Stéphane Sauvaire,
61,Fahrenheit 11/9,Michael Moore (director/screenplay/narrator),Documentary,,
2,The Great Buster: A Celebration,Peter Bogdanovich (director/screenplay),Documentary Comedy,Peter Bogdanovich,


In [None]:
def get_actor2(x):
    # Remove parts that include "(director)" or "(screenplay)" or "(director/screenplay)"
    cleaned = "; ".join([part for part in x.split("; ") if not any(keyword in part for keyword in ["(director)", "(screenplay)", "(director/screenplay)"])])

    # Extract the second actor from the cleaned string
    actors = cleaned.split(", ")
    if len(actors) > 1:  # Check if there's a second actor
        return actors[1].strip()
    else:
        return np.NaN


In [None]:
df_2018['actor_2_name'] = df_2018['Cast and crew'].map(lambda x: get_actor2(x))

df_2018.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2018['actor_2_name'] = df_2018['Cast and crew'].map(lambda x: get_actor2(x))


Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name
0,Insidious: The Last Key,"Adam Robitel (director); Leigh Whannell (screenplay); Lin Shaye, Angus Sampson, Leigh Whannell, Spencer Locke, Caitlin Gerard, Bruce Davison",Horror Thriller,Adam Robitel,Lin Shaye,Angus Sampson
1,The Strange Ones,"Christopher Radcliff (director/screenplay); Lauren Wolkstein (director); Alex Pettyfer, James Freedson-Jackson, Emily Althaus, Gene Jones, Owen Campbell, Tobias Campbell",Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson
2,The Commuter,"Jaume Collet-Serra (director); Byron Willinger, Philip de Blasi, Ryan Engle (screenplay); Liam Neeson, Vera Farmiga, Patrick Wilson, Jonathan Banks, Sam Neill",Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga
3,Proud Mary,"Babak Najafi (director); John S. Newman, Christian Swegal, Steve Antin (screenplay); Taraji P. Henson, Jahi Di'Allo Winston, Billy Brown, Danny Glover",Thriller Action Crime,Babak Najafi,Taraji P. Henson,Jahi Di'Allo Winston
4,Acts of Violence,"Brett Donowho (director); Nicolas Aaron Mezzanatto (screenplay); Bruce Willis, Cole Hauser, Shawn Ashmore, Ashton Holmes, Melissa Bolona, Sophia Bush, Mike Epps",Action Crime Thriller,Brett Donowho,Bruce Willis,Cole Hauser


In [None]:
df_2018[df_2018['actor_2_name'].isna()]

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name
4,Whitney,Kevin Macdonald (director/screenplay); Whitney Houston,Music Documentary,Kevin Macdonald,,
28,A Prayer Before Dawn,"Jean-Stéphane Sauvaire (director); Jonathan Hirschbein, Nick Saltrese (screenplay); Joe Cole",Drama Action Crime,Jean-Stéphane Sauvaire,,
61,Fahrenheit 11/9,Michael Moore (director/screenplay/narrator),Documentary,,,
2,The Great Buster: A Celebration,Peter Bogdanovich (director/screenplay),Documentary Comedy,Peter Bogdanovich,,


In [None]:
def get_actor3(x):
    # Step 1: Split the string to remove everything before "screenplay); "
    parts = x.split("screenplay); ")

    # Step 2: Check if there are actors after the screenplay/director section
    if len(parts) > 1:
        actors = parts[-1].split(", ")  # Extract the actors part and split by commas
    else:
        return np.NaN  # Return NaN if no actors are present

    # Step 3: Check if there are at least three actors
    if len(actors) >= 3:
        return actors[2].strip()  # Return the third actor
    else:
        return np.NaN  # Return NaN if there are fewer than three actors


In [None]:
df_2018['actor_3_name'] = df_2018['Cast and crew'].map(lambda x: get_actor3(x))

df_2018.head()

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Insidious: The Last Key,"Adam Robitel (director); Leigh Whannell (screenplay); Lin Shaye, Angus Sampson, Leigh Whannell, Spencer Locke, Caitlin Gerard, Bruce Davison",Horror Thriller,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell
1,The Strange Ones,"Christopher Radcliff (director/screenplay); Lauren Wolkstein (director); Alex Pettyfer, James Freedson-Jackson, Emily Althaus, Gene Jones, Owen Campbell, Tobias Campbell",Thriller Drama,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus
2,The Commuter,"Jaume Collet-Serra (director); Byron Willinger, Philip de Blasi, Ryan Engle (screenplay); Liam Neeson, Vera Farmiga, Patrick Wilson, Jonathan Banks, Sam Neill",Action Thriller Mystery,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson
3,Proud Mary,"Babak Najafi (director); John S. Newman, Christian Swegal, Steve Antin (screenplay); Taraji P. Henson, Jahi Di'Allo Winston, Billy Brown, Danny Glover",Thriller Action Crime,Babak Najafi,Taraji P. Henson,Jahi Di'Allo Winston,Billy Brown
4,Acts of Violence,"Brett Donowho (director); Nicolas Aaron Mezzanatto (screenplay); Bruce Willis, Cole Hauser, Shawn Ashmore, Ashton Holmes, Melissa Bolona, Sophia Bush, Mike Epps",Action Crime Thriller,Brett Donowho,Bruce Willis,Cole Hauser,Shawn Ashmore


In [None]:
df_2018[df_2018['actor_3_name'].isna()]

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
14,A Fantastic Woman,"Sebastián Lelio (director/screenplay); Gonzalo Maza (screenplay); Daniela Vega, Francisco Reyes Morandé",Drama,Sebastián Lelio,Daniela Vega,Francisco Reyes Morandé,
19,Pad Man,"R. Balki (director/screenplay); Swanand Kirkire (screenplay); Akshay Kumar, Radhika Apte",Comedy Drama,R. Balki,Akshay Kumar,Radhika Apte,
19,Kings,"Deniz Gamze Ergüven (director/screenplay); Halle Berry, Daniel Craig",Romance Drama Crime,Deniz Gamze Ergüven,Halle Berry,Daniel Craig,
38,Action Point,"Tim Kirkby (director); John Altschuler, Dave Krinsky (screenplay); Johnny Knoxville, Chris Pontius",Comedy,Tim Kirkby,Johnny Knoxville,Chris Pontius,
39,Adrift,"Baltasar Kormákur (director); Aaron Kandell, Jordan Kandell, David Branson Smith (screenplay); Shailene Woodley, Sam Claflin",Thriller Romance Adventure,Baltasar Kormákur,Shailene Woodley,Sam Claflin,
44,Won't You Be My Neighbor?,"Morgan Neville (director); Fred Rogers, François Clemmons, Yo-Yo Ma, Joe Negri, David Newell, Tom Junod, Joanne Rogers",Documentary,Morgan Neville,Fred Rogers,François Clemmons,
53,Damsel,"David Zellner, Nathan Zellner (directors/screenplay); Robert Pattinson, Mia Wasikowska",Fantasy Action Adventure,"David Zellner, Nathan Zellner",David Zellner,Nathan Zellner (directors/screenplay); Robert Pattinson,
4,Whitney,Kevin Macdonald (director/screenplay); Whitney Houston,Music Documentary,Kevin Macdonald,,,
23,Never Goin' Back,"Augustine Frizzell (director/screenplay); Maia Mitchell, Camila Morrone",Comedy,Augustine Frizzell,Maia Mitchell,Camila Morrone,
28,A Prayer Before Dawn,"Jean-Stéphane Sauvaire (director); Jonathan Hirschbein, Nick Saltrese (screenplay); Joe Cole",Drama Action Crime,Jean-Stéphane Sauvaire,,,


In [None]:
df_2018 = df_2018.rename(columns={'Title':'movie_title'})

new_df18 = df_2018.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]



In [None]:
new_df18['actor_2_name'] = new_df18['actor_2_name'].replace(np.nan, 'unknown')
new_df18['actor_3_name'] = new_df18['actor_3_name'].replace(np.nan, 'unknown')

In [None]:
new_df18['movie_title'] = new_df18['movie_title'].str.lower()

In [None]:
new_df18['combination'] = new_df18['actor_1_name'] + ' ' + new_df18['actor_2_name'] + ' '+ new_df18['actor_3_name'] + ' '+ new_df18['director_name'] +' ' + new_df18['genres']
new_df18.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,combination
0,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell,Horror Thriller,insidious: the last key,Lin Shaye Angus Sampson Leigh Whannell Adam Robitel Horror Thriller
1,Lauren Wolkstein,Alex Pettyfer,James Freedson-Jackson,Emily Althaus,Thriller Drama,the strange ones,Alex Pettyfer James Freedson-Jackson Emily Althaus Lauren Wolkstein Thriller Drama
2,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson,Action Thriller Mystery,the commuter,Liam Neeson Vera Farmiga Patrick Wilson Jaume Collet-Serra Action Thriller Mystery
3,Babak Najafi,Taraji P. Henson,Jahi Di'Allo Winston,Billy Brown,Thriller Action Crime,proud mary,Taraji P. Henson Jahi Di'Allo Winston Billy Brown Babak Najafi Thriller Action Crime
4,Brett Donowho,Bruce Willis,Cole Hauser,Shawn Ashmore,Action Crime Thriller,acts of violence,Bruce Willis Cole Hauser Shawn Ashmore Brett Donowho Action Crime Thriller


### **2019**


In [None]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2019"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]


frames = [df1, df2, df3, df4]

df_b = pd.concat(frames)
df_b.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,J A N U A R Y,4,Escape Room,Columbia Pictures / Original Film,"Adam Robitel (director); Bragi F. Schut, Maria Melnik (screenplay); Taylor Russell, Logan Miller, Deborah Ann Woll, Tyler Labine, Jay Ellis, Nik Dodani, Yorick van Wageningen",[2]
1,J A N U A R Y,4,Rust Creek,IFC Films / Lunacy Productions,"Jen McGowan (director); Julie Lipson (screenplay); Hermione Corfield, Jay Paulson, Sean O'Bryan, Micah Hauptman",[3]
2,J A N U A R Y,4,American Hangman,Hangman Justice Productions,"Wilson Coneybeare (director/screenplay); Donald Sutherland, Vincent Kartheiser, Oliver Dennis, Paul Braunstein",[4]
3,J A N U A R Y,11,A Dog's Way Home,Columbia Pictures,"Charles Martin Smith (director); W. Bruce Cameron (screenplay); Bryce Dallas Howard, Edward James Olmos, Alexandra Shipp, Ashley Judd, Jonah Haur-King, Wes Studi",[5]
4,J A N U A R Y,11,The Upside,STX Entertainment,"Neil Burger (director); Jon Hartmere (screenplay); Bryan Cranston, Kevin Hart, Nicole Kidman",[6]


In [None]:
df_b['genres'] = df_b['Title'].map(lambda x: get_genre(str(x)))
df_b.head(3)

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,J A N U A R Y,4,Escape Room,Columbia Pictures / Original Film,"Adam Robitel (director); Bragi F. Schut, Maria Melnik (screenplay); Taylor Russell, Logan Miller, Deborah Ann Woll, Tyler Labine, Jay Ellis, Nik Dodani, Yorick van Wageningen",[2],Horror Thriller Mystery
1,J A N U A R Y,4,Rust Creek,IFC Films / Lunacy Productions,"Jen McGowan (director); Julie Lipson (screenplay); Hermione Corfield, Jay Paulson, Sean O'Bryan, Micah Hauptman",[3],Thriller Drama Action Crime
2,J A N U A R Y,4,American Hangman,Hangman Justice Productions,"Wilson Coneybeare (director/screenplay); Donald Sutherland, Vincent Kartheiser, Oliver Dennis, Paul Braunstein",[4],Thriller


In [None]:
df_2019 = df_b[['Title','Cast and crew','genres']]
df_2019.head()

Unnamed: 0,Title,Cast and crew,genres
0,Escape Room,"Adam Robitel (director); Bragi F. Schut, Maria Melnik (screenplay); Taylor Russell, Logan Miller, Deborah Ann Woll, Tyler Labine, Jay Ellis, Nik Dodani, Yorick van Wageningen",Horror Thriller Mystery
1,Rust Creek,"Jen McGowan (director); Julie Lipson (screenplay); Hermione Corfield, Jay Paulson, Sean O'Bryan, Micah Hauptman",Thriller Drama Action Crime
2,American Hangman,"Wilson Coneybeare (director/screenplay); Donald Sutherland, Vincent Kartheiser, Oliver Dennis, Paul Braunstein",Thriller
3,A Dog's Way Home,"Charles Martin Smith (director); W. Bruce Cameron (screenplay); Bryce Dallas Howard, Edward James Olmos, Alexandra Shipp, Ashley Judd, Jonah Haur-King, Wes Studi",Drama Adventure Family
4,The Upside,"Neil Burger (director); Jon Hartmere (screenplay); Bryan Cranston, Kevin Hart, Nicole Kidman",Comedy Drama


In [None]:
df_2019.shape

(250, 3)

In [None]:
df_2019['director_name'] = df_2019['Cast and crew'].map(lambda x: get_director(x))
df_2019.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['director_name'] = df_2019['Cast and crew'].map(lambda x: get_director(x))


Unnamed: 0,Title,Cast and crew,genres,director_name
0,Escape Room,"Adam Robitel (director); Bragi F. Schut, Maria Melnik (screenplay); Taylor Russell, Logan Miller, Deborah Ann Woll, Tyler Labine, Jay Ellis, Nik Dodani, Yorick van Wageningen",Horror Thriller Mystery,Adam Robitel
1,Rust Creek,"Jen McGowan (director); Julie Lipson (screenplay); Hermione Corfield, Jay Paulson, Sean O'Bryan, Micah Hauptman",Thriller Drama Action Crime,Jen McGowan
2,American Hangman,"Wilson Coneybeare (director/screenplay); Donald Sutherland, Vincent Kartheiser, Oliver Dennis, Paul Braunstein",Thriller,Wilson Coneybeare
3,A Dog's Way Home,"Charles Martin Smith (director); W. Bruce Cameron (screenplay); Bryce Dallas Howard, Edward James Olmos, Alexandra Shipp, Ashley Judd, Jonah Haur-King, Wes Studi",Drama Adventure Family,Charles Martin Smith
4,The Upside,"Neil Burger (director); Jon Hartmere (screenplay); Bryan Cranston, Kevin Hart, Nicole Kidman",Comedy Drama,Neil Burger


In [None]:
df_2019[df_2019['director_name'].isna()]

Unnamed: 0,Title,Cast and crew,genres,director_name
34,Wonder Park,"Josh Appelbaum, Andre Nemec (screenplay); Brianna Denski, Ken Hudson Campbell, Matthew Broderick, Jennifer Garner, Kenan Thompson, Ken Jeong, Mila Kunis, John Oliver",Comedy Animation Adventure Family Fantasy,
35,The Professor and the Madman,"P.B. Shemran (director, screenplay); Todd Komarnicki (screenplay); Mel Gibson, Sean Penn, Natalie Dormer, Eddie Marsan, Jennifer Ehle, David O'Hara, Ioan Gruffudd, Stephen Dillane, Steve Coogan",History Drama Mystery Thriller,
41,The Tomorrow Man,"Noble Jones (director, screenplay); John Lithgow, Blythe Danner, Derek Cecil, Katie Aselton, Sophie Thatcher, Eve Harlow",Drama Romance,


## **Actors**

In [None]:
df_2019['actor_1_name'] = df_2019['Cast and crew'].map(lambda x: get_actor1(x))

df_2019.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['actor_1_name'] = df_2019['Cast and crew'].map(lambda x: get_actor1(x))


Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name
0,Escape Room,"Adam Robitel (director); Bragi F. Schut, Maria Melnik (screenplay); Taylor Russell, Logan Miller, Deborah Ann Woll, Tyler Labine, Jay Ellis, Nik Dodani, Yorick van Wageningen",Horror Thriller Mystery,Adam Robitel,Taylor Russell
1,Rust Creek,"Jen McGowan (director); Julie Lipson (screenplay); Hermione Corfield, Jay Paulson, Sean O'Bryan, Micah Hauptman",Thriller Drama Action Crime,Jen McGowan,Hermione Corfield
2,American Hangman,"Wilson Coneybeare (director/screenplay); Donald Sutherland, Vincent Kartheiser, Oliver Dennis, Paul Braunstein",Thriller,Wilson Coneybeare,Donald Sutherland
3,A Dog's Way Home,"Charles Martin Smith (director); W. Bruce Cameron (screenplay); Bryce Dallas Howard, Edward James Olmos, Alexandra Shipp, Ashley Judd, Jonah Haur-King, Wes Studi",Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard
4,The Upside,"Neil Burger (director); Jon Hartmere (screenplay); Bryan Cranston, Kevin Hart, Nicole Kidman",Comedy Drama,Neil Burger,Bryan Cranston


In [None]:
df_2019['actor_2_name'] = df_2019['Cast and crew'].map(lambda x: get_actor2(x))

df_2019.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['actor_2_name'] = df_2019['Cast and crew'].map(lambda x: get_actor2(x))


Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name
0,Escape Room,"Adam Robitel (director); Bragi F. Schut, Maria Melnik (screenplay); Taylor Russell, Logan Miller, Deborah Ann Woll, Tyler Labine, Jay Ellis, Nik Dodani, Yorick van Wageningen",Horror Thriller Mystery,Adam Robitel,Taylor Russell,Logan Miller
1,Rust Creek,"Jen McGowan (director); Julie Lipson (screenplay); Hermione Corfield, Jay Paulson, Sean O'Bryan, Micah Hauptman",Thriller Drama Action Crime,Jen McGowan,Hermione Corfield,Jay Paulson
2,American Hangman,"Wilson Coneybeare (director/screenplay); Donald Sutherland, Vincent Kartheiser, Oliver Dennis, Paul Braunstein",Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser
3,A Dog's Way Home,"Charles Martin Smith (director); W. Bruce Cameron (screenplay); Bryce Dallas Howard, Edward James Olmos, Alexandra Shipp, Ashley Judd, Jonah Haur-King, Wes Studi",Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos
4,The Upside,"Neil Burger (director); Jon Hartmere (screenplay); Bryan Cranston, Kevin Hart, Nicole Kidman",Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart


In [None]:
def get_actor3(x):
    # Step 1: Split the string to remove everything before "screenplay); "
    parts = x.split("screenplay); ")

    # Step 2: Check if there are actors after the screenplay/director section
    if len(parts) > 1:
        actors = parts[-1].split(", ")  # Extract the actors part and split by commas
    else:
        return np.NaN  # Return NaN if no actors are present

    # Step 3: Check if there are at least three actors
    if len(actors) >= 3:
        return actors[2].strip()  # Return the third actor
    else:
        return np.NaN  # Return NaN if there are fewer than three actors

In [None]:
df_2019['actor_3_name'] = df_2019['Cast and crew'].map(lambda x: get_actor3(x))

df_2019.head()

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,Escape Room,"Adam Robitel (director); Bragi F. Schut, Maria Melnik (screenplay); Taylor Russell, Logan Miller, Deborah Ann Woll, Tyler Labine, Jay Ellis, Nik Dodani, Yorick van Wageningen",Horror Thriller Mystery,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll
1,Rust Creek,"Jen McGowan (director); Julie Lipson (screenplay); Hermione Corfield, Jay Paulson, Sean O'Bryan, Micah Hauptman",Thriller Drama Action Crime,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan
2,American Hangman,"Wilson Coneybeare (director/screenplay); Donald Sutherland, Vincent Kartheiser, Oliver Dennis, Paul Braunstein",Thriller,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis
3,A Dog's Way Home,"Charles Martin Smith (director); W. Bruce Cameron (screenplay); Bryce Dallas Howard, Edward James Olmos, Alexandra Shipp, Ashley Judd, Jonah Haur-King, Wes Studi",Drama Adventure Family,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp
4,The Upside,"Neil Burger (director); Jon Hartmere (screenplay); Bryan Cranston, Kevin Hart, Nicole Kidman",Comedy Drama,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman


In [None]:
df_2019 = df_2019.rename(columns={'Title':'movie_title'})

new_df19 = df_2019.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [None]:
new_df19['actor_2_name'] = new_df19['actor_2_name'].replace(np.nan, 'unknown')
new_df19['actor_3_name'] = new_df19['actor_3_name'].replace(np.nan, 'unknown')

new_df19['movie_title'] = new_df19['movie_title'].str.lower()

In [None]:
new_df19['combination'] = new_df19['actor_1_name'] + ' ' + new_df19['actor_2_name'] + ' '+ new_df19['actor_3_name'] + ' '+ new_df19['director_name'] +' ' + new_df19['genres']
new_df19.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,combination
0,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll,Horror Thriller Mystery,escape room,Taylor Russell Logan Miller Deborah Ann Woll Adam Robitel Horror Thriller Mystery
1,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan,Thriller Drama Action Crime,rust creek,Hermione Corfield Jay Paulson Sean O'Bryan Jen McGowan Thriller Drama Action Crime
2,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis,Thriller,american hangman,Donald Sutherland Vincent Kartheiser Oliver Dennis Wilson Coneybeare Thriller
3,Charles Martin Smith,Bryce Dallas Howard,Edward James Olmos,Alexandra Shipp,Drama Adventure Family,a dog's way home,Bryce Dallas Howard Edward James Olmos Alexandra Shipp Charles Martin Smith Drama Adventure Family
4,Neil Burger,Bryan Cranston,Kevin Hart,Nicole Kidman,Comedy Drama,the upside,Bryan Cranston Kevin Hart Nicole Kidman Neil Burger Comedy Drama


In [None]:
frames = [new_df19,new_df18]

new_df = pd.concat(frames)
new_df.head(3)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,combination
0,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll,Horror Thriller Mystery,escape room,Taylor Russell Logan Miller Deborah Ann Woll Adam Robitel Horror Thriller Mystery
1,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan,Thriller Drama Action Crime,rust creek,Hermione Corfield Jay Paulson Sean O'Bryan Jen McGowan Thriller Drama Action Crime
2,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis,Thriller,american hangman,Donald Sutherland Vincent Kartheiser Oliver Dennis Wilson Coneybeare Thriller


In [None]:
old_df = pd.read_csv('/content/df_movie_2.csv')
old_df.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,combination
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi Action Adventure Fantasy Sci-Fi
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Action Adventure Fantasy
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman Action Adventure Thriller
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt Action Thriller
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens,Doug Walker Rob Walker unknown Documentary


In [None]:
frames = [new_df,old_df]

final_df = pd.concat(frames)
final_df.head(3)

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,combination
0,Adam Robitel,Taylor Russell,Logan Miller,Deborah Ann Woll,Horror Thriller Mystery,escape room,Taylor Russell Logan Miller Deborah Ann Woll Adam Robitel Horror Thriller Mystery
1,Jen McGowan,Hermione Corfield,Jay Paulson,Sean O'Bryan,Thriller Drama Action Crime,rust creek,Hermione Corfield Jay Paulson Sean O'Bryan Jen McGowan Thriller Drama Action Crime
2,Wilson Coneybeare,Donald Sutherland,Vincent Kartheiser,Oliver Dennis,Thriller,american hangman,Donald Sutherland Vincent Kartheiser Oliver Dennis Wilson Coneybeare Thriller


In [None]:
final_df = final_df.dropna(how='any')

In [None]:
final_df.to_csv('/content/df_movie_3.csv',index=False)