# **Extracting features of 2021 movies from Wikipedia**

In [1]:
import pandas as pd
import numpy as np

In [2]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2021"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [3]:
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [4]:
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,J A N U A R Y,1.0,Shadow in the Cloud,Vertical Entertainment,Roseanne Liang (director/screenplay); Max Land...,[2]
1,J A N U A R Y,5.0,Hacksaw,Leone Films / Midnight Releasing,"Anthony Leone (director/screenplay); Amy Cay, ...",[3]
2,J A N U A R Y,12.0,Dr. Bird's Advice for Sad Poets,Relativity Media / Ketchup Entertainment,Yaniv Raz (director/screenplay); Lucas Jade Zu...,[4]
3,J A N U A R Y,13.0,The White Tiger,Netflix / ARRAY / Purple Pebble Pictures,Ramin Bahrani (director/screenplay); Adarsh Go...,
4,J A N U A R Y,14.0,Locked Down,HBO Max / Warner Bros. Pictures,Doug Liman (director); Steven Knight (screenpl...,[5]
...,...,...,...,...,...,...
361,D E C E M B E R,25.0,The Tragedy of Macbeth,Apple TV+ / A24 / IAC Films,Joel Coen (director/screenplay); Denzel Washin...,[278]
362,D E C E M B E R,25.0,A Journal for Jordan,Columbia Pictures / Escape Artists / Bron Studios,Denzel Washington (director); Virgil Williams ...,[279]
363,D E C E M B E R,25.0,American Underdog,Lionsgate,"Erwin brothers (directors); Jon Erwin, David A...",[280]
364,D E C E M B E R,26.0,Memoria,Neon,Apichatpong Weerasethakul (director/acreenplay...,[281]


In [5]:
pip install tmdbv3api

Collecting tmdbv3api
  Downloading tmdbv3api-1.9.0-py3-none-any.whl.metadata (8.0 kB)
Downloading tmdbv3api-1.9.0-py3-none-any.whl (25 kB)
Installing collected packages: tmdbv3api
Successfully installed tmdbv3api-1.9.0


In [6]:
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = '9e7e8763d8a867d76e3180aa764b5e65'

In [9]:
# getting the genres for our movies dataset

from tmdbv3api import Movie
tmdb_movie = Movie()
def get_genre(x):
    genres = []
    result = tmdb_movie.search(x)

    # Debugging: Print the result to inspect its structure
    #print("Search result for '{}':".format(x), result)

    # Check if result is empty
    if not result:
        return np.NaN

    # Access the id based on the structure of result[0]
    try:
      #Access the id directly as it's a dictionary
        movie_id = result[0]["id"]
    except (IndexError, TypeError, KeyError) as e:
        print(f"Error accessing movie ID for title '{x}':", e)
        return np.NaN

    # Fetch genre data from the API
    response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb.api_key}')
    data_json = response.json()

    if data_json.get('genres'):
        for genre in data_json['genres']:
            genres.append(genre['name'])
        return " ".join(genres)
    else:
        return np.NaN

In [10]:
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))
df

Error accessing movie ID for title 'Dating and New York': getattr(): attribute name must be string


Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,J A N U A R Y,1.0,Shadow in the Cloud,Vertical Entertainment,Roseanne Liang (director/screenplay); Max Land...,[2],Horror Action War
1,J A N U A R Y,5.0,Hacksaw,Leone Films / Midnight Releasing,"Anthony Leone (director/screenplay); Amy Cay, ...",[3],Adventure Family TV Movie Western
2,J A N U A R Y,12.0,Dr. Bird's Advice for Sad Poets,Relativity Media / Ketchup Entertainment,Yaniv Raz (director/screenplay); Lucas Jade Zu...,[4],Comedy Drama
3,J A N U A R Y,13.0,The White Tiger,Netflix / ARRAY / Purple Pebble Pictures,Ramin Bahrani (director/screenplay); Adarsh Go...,,Drama
4,J A N U A R Y,14.0,Locked Down,HBO Max / Warner Bros. Pictures,Doug Liman (director); Steven Knight (screenpl...,[5],Comedy Crime Romance
...,...,...,...,...,...,...,...
361,D E C E M B E R,25.0,The Tragedy of Macbeth,Apple TV+ / A24 / IAC Films,Joel Coen (director/screenplay); Denzel Washin...,[278],Drama War
362,D E C E M B E R,25.0,A Journal for Jordan,Columbia Pictures / Escape Artists / Bron Studios,Denzel Washington (director); Virgil Williams ...,[279],Drama Romance
363,D E C E M B E R,25.0,American Underdog,Lionsgate,"Erwin brothers (directors); Jon Erwin, David A...",[280],Drama Family
364,D E C E M B E R,26.0,Memoria,Neon,Apichatpong Weerasethakul (director/acreenplay...,[281],Drama Science Fiction Mystery


In [13]:
df_2021 = df[['Title', 'Cast and crew', 'genres']]
df_2021.head()

Unnamed: 0,Title,Cast and crew,genres
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War
1,Hacksaw,"Anthony Leone (director/screenplay); Amy Cay, ...",Adventure Family TV Movie Western
2,Dr. Bird's Advice for Sad Poets,Yaniv Raz (director/screenplay); Lucas Jade Zu...,Comedy Drama
3,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Drama
4,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Romance


**Extracting Directors**

In [14]:
import re
def extract_director(cast_and_crew):
    # Regex to capture the name before (director) or variations
    match = re.search(r'([A-Za-z\s\.\-]+)\s*\(director(?:/screenplay)?', cast_and_crew)
    if match:
        return match.group(1).strip()

In [16]:
df_2021

Unnamed: 0,Title,Cast and crew,genres
0,Shadow in the Cloud,Roseanne Liang (director/screenplay); Max Land...,Horror Action War
1,Hacksaw,"Anthony Leone (director/screenplay); Amy Cay, ...",Adventure Family TV Movie Western
2,Dr. Bird's Advice for Sad Poets,Yaniv Raz (director/screenplay); Lucas Jade Zu...,Comedy Drama
3,The White Tiger,Ramin Bahrani (director/screenplay); Adarsh Go...,Drama
4,Locked Down,Doug Liman (director); Steven Knight (screenpl...,Comedy Crime Romance
...,...,...,...
361,The Tragedy of Macbeth,Joel Coen (director/screenplay); Denzel Washin...,Drama War
362,A Journal for Jordan,Denzel Washington (director); Virgil Williams ...,Drama Romance
363,American Underdog,"Erwin brothers (directors); Jon Erwin, David A...",Drama Family
364,Memoria,Apichatpong Weerasethakul (director/acreenplay...,Drama Science Fiction Mystery


In [17]:
df_2021 = df_2021[:-1]

In [18]:
df_2021['director_name'] = df_2021['Cast and crew'].apply(extract_director)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['director_name'] = df_2021['Cast and crew'].apply(extract_director)


**Extracting Actor 1 Name**

In [20]:
def get_actor1(x):
    # Get the initial actor name using screenplay split logic
    initial_actor = (x.split("screenplay); ")[-1]).split(", ")[0]
    # Check if there’s a semicolon in the result
    if ";" in initial_actor:
        # If there is, take the part after the last semicolon
        return initial_actor.split(";")[-1].strip()
    else:
        # Otherwise, return the initial result
        return initial_actor.strip()

In [21]:
df_2021['actor_1_name'] = df_2021['Cast and crew'].map(lambda x: get_actor1(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['actor_1_name'] = df_2021['Cast and crew'].map(lambda x: get_actor1(x))


**Extracting Actor 2 Name**

In [22]:
def get_actor2(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])

In [23]:
df_2021['actor_2_name'] = df_2021['Cast and crew'].map(lambda x: get_actor2(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['actor_2_name'] = df_2021['Cast and crew'].map(lambda x: get_actor2(x))


**Extracting Actor 3 Name**

In [24]:
def get_actor3(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

In [25]:
df_2021['actor_3_name'] = df_2021['Cast and crew'].map(lambda x: get_actor3(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2021['actor_3_name'] = df_2021['Cast and crew'].map(lambda x: get_actor3(x))


In [27]:
df_2021 = df_2021.rename(columns={'Title':'movie_title'})

In [28]:
df_2021_new = df_2021.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [29]:
df_2021_new['actor_2_name'] = df_2021_new['actor_2_name'].replace(np.nan, 'unknown')
df_2021_new['actor_3_name'] = df_2021_new['actor_3_name'].replace(np.nan, 'unknown')

In [30]:
df_2021_new['movie_title'] = df_2021_new['movie_title'].str.lower()

In [31]:
df_2021_new['col_merge'] = df_2021_new['actor_1_name'] + ' ' + df_2021_new['actor_2_name'] + ' '+ df_2021_new['actor_3_name'] + ' '+ df_2021_new['director_name'] +' ' + df_2021_new['genres']
df_2021_new

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,col_merge
0,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Horror Action War,shadow in the cloud,Chloë Grace Moretz Taylor John Smith Beulah Ko...
1,Anthony Leone,Amy Cay,Brian Patrick Butler,Michael C. Burgess,Adventure Family TV Movie Western,hacksaw,Amy Cay Brian Patrick Butler Michael C. Burges...
2,Yaniv Raz,Lucas Jade Zumann,Taylor Russell,Chase Stokes,Comedy Drama,dr. bird's advice for sad poets,Lucas Jade Zumann Taylor Russell Chase Stokes ...
3,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Drama,the white tiger,Adarsh Gourav Rajkummar Rao Priyanka Chopra Jo...
4,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Comedy Crime Romance,locked down,Anne Hathaway Chiwetel Ejiofor Stephen Merchan...
...,...,...,...,...,...,...,...
360,Matthew Vaughn,Ralph Fiennes,Gemma Arterton,Rhys Ifans,Action Adventure Thriller,the king's man,Ralph Fiennes Gemma Arterton Rhys Ifans Matthe...
361,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel,Drama War,the tragedy of macbeth,Denzel Washington Frances McDormand Bertie Car...
362,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian,Drama Romance,a journal for jordan,Michael B. Jordan Chanté Adams Jalon Christian...
363,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid,Drama Family,american underdog,Zachary Levi Anna Paquin Dennis Quaid Erwin br...


# **Extracting features of 2022 movies from Wikipedia**

In [32]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2022"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [None]:
#df = df[:-4]

In [33]:
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [38]:
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,J A N U A R Y,7.0,The 355,Universal Pictures / Freckle Films / FilmNatio...,Simon Kinberg (director/screenplay); Theresa R...,[2]
1,J A N U A R Y,7.0,The Legend of La Llorona,Saban Films / Ageless Pictures,Patricia Harris Seeley (director); José Prende...,[3]
2,J A N U A R Y,7.0,The Commando,Saban Films / Premiere Entertainment,Asif Akbar (director); Koji Steven Sakai (scre...,[4]
3,J A N U A R Y,7.0,American Siege,Vertical Entertainment,Edward John Drake (director/screenplay); Timot...,[5]
4,J A N U A R Y,14.0,Scream,Paramount Pictures / Spyglass Media Group / Ra...,"Matt Bettinelli-Olpin, Tyler Gillett (director...",[6]
...,...,...,...,...,...,...
312,D E C E M B E R,23.0,Whitney Houston: I Wanna Dance with Somebody,TriStar Pictures / Black Label Media,Kasi Lemmons (director); Anthony McCarten (scr...,[56]
313,D E C E M B E R,23.0,The Pale Blue Eye,Netflix / Cross Creek Pictures,Scott Cooper (director/screenplay); Christian ...,[183]
314,D E C E M B E R,23.0,Women Talking,Orion Pictures / Plan B Entertainment,Sarah Polley (director/screenplay); Rooney Mar...,[263]
315,D E C E M B E R,29.0,A Man Called Otto,Columbia Pictures / Stage 6 Films / Playtone /...,"Marc Forster (director), David Magee (screenpl...",[264]


In [39]:
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))
df

Error accessing movie ID for title 'Apollo 10 1⁄2: A Space Age Childhood': getattr(): attribute name must be string


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))


Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,J A N U A R Y,7.0,The 355,Universal Pictures / Freckle Films / FilmNatio...,Simon Kinberg (director/screenplay); Theresa R...,[2],Action Adventure Thriller
1,J A N U A R Y,7.0,The Legend of La Llorona,Saban Films / Ageless Pictures,Patricia Harris Seeley (director); José Prende...,[3],Family Animation Fantasy Horror
2,J A N U A R Y,7.0,The Commando,Saban Films / Premiere Entertainment,Asif Akbar (director); Koji Steven Sakai (scre...,[4],Action Crime Thriller
3,J A N U A R Y,7.0,American Siege,Vertical Entertainment,Edward John Drake (director/screenplay); Timot...,[5],Action Adventure Thriller
4,J A N U A R Y,14.0,Scream,Paramount Pictures / Spyglass Media Group / Ra...,"Matt Bettinelli-Olpin, Tyler Gillett (director...",[6],Crime Horror Mystery
...,...,...,...,...,...,...,...
312,D E C E M B E R,23.0,Whitney Houston: I Wanna Dance with Somebody,TriStar Pictures / Black Label Media,Kasi Lemmons (director); Anthony McCarten (scr...,[56],Music History Drama
313,D E C E M B E R,23.0,The Pale Blue Eye,Netflix / Cross Creek Pictures,Scott Cooper (director/screenplay); Christian ...,[183],Thriller Crime Horror Mystery
314,D E C E M B E R,23.0,Women Talking,Orion Pictures / Plan B Entertainment,Sarah Polley (director/screenplay); Rooney Mar...,[263],Drama
315,D E C E M B E R,29.0,A Man Called Otto,Columbia Pictures / Stage 6 Films / Playtone /...,"Marc Forster (director), David Magee (screenpl...",[264],Comedy Drama


In [42]:
df_2022 = df[['Title', 'Cast and crew', 'genres']]
df_2022.head()

Unnamed: 0,Title,Cast and crew,genres
0,The 355,Simon Kinberg (director/screenplay); Theresa R...,Action Adventure Thriller
1,The Legend of La Llorona,Patricia Harris Seeley (director); José Prende...,Family Animation Fantasy Horror
2,The Commando,Asif Akbar (director); Koji Steven Sakai (scre...,Action Crime Thriller
3,American Siege,Edward John Drake (director/screenplay); Timot...,Action Adventure Thriller
4,Scream,"Matt Bettinelli-Olpin, Tyler Gillett (director...",Crime Horror Mystery


In [43]:
df_2022['director_name'] = df_2022['Cast and crew'].apply(extract_director)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022['director_name'] = df_2022['Cast and crew'].apply(extract_director)


In [44]:
df_2022['actor_1_name'] = df_2022['Cast and crew'].map(lambda x: get_actor1(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022['actor_1_name'] = df_2022['Cast and crew'].map(lambda x: get_actor1(x))


In [45]:
df_2022['actor_2_name'] = df_2022['Cast and crew'].map(lambda x: get_actor2(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2022['actor_2_name'] = df_2022['Cast and crew'].map(lambda x: get_actor2(x))


In [46]:
df_2022['actor_3_name'] = df_2022['Cast and crew'].map(lambda x: get_actor3(x))

In [48]:
#df_2022

In [49]:
df_2022 = df_2022.rename(columns={'Title':'movie_title'})

In [50]:
df_2022_new = df_2022.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [51]:
df_2022_new['actor_2_name'] = df_2022_new['actor_2_name'].replace(np.nan, 'unknown')
df_2022_new['actor_3_name'] = df_2022_new['actor_3_name'].replace(np.nan, 'unknown')

In [52]:
df_2022_new['movie_title'] = df_2022_new['movie_title'].str.lower()

In [53]:
df_2022_new['col_merge'] = df_2022_new['actor_1_name'] + ' ' + df_2022_new['actor_2_name'] + ' '+ df_2022_new['actor_3_name'] + ' '+ df_2022_new['director_name'] +' ' + df_2022_new['genres']
df_2022_new.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,col_merge
0,Simon Kinberg,Jessica Chastain,Penélope Cruz,Fan Bingbing,Action Adventure Thriller,the 355,Jessica Chastain Penélope Cruz Fan Bingbing Si...
1,Patricia Harris Seeley,Autumn Reeser,Danny Trejo,Antonio Cupo,Family Animation Fantasy Horror,the legend of la llorona,Autumn Reeser Danny Trejo Antonio Cupo Patrici...
2,Asif Akbar,Mickey Rourke,Michael Jai White,unknown,Action Crime Thriller,the commando,Mickey Rourke Michael Jai White unknown Asif A...
3,Edward John Drake,Timothy V. Murphy,Bruce Willis,Rob Gough,Action Adventure Thriller,american siege,Timothy V. Murphy Bruce Willis Rob Gough Edwar...
4,Tyler Gillett,Melissa Barrera,Mason Gooding,Jenna Ortega,Crime Horror Mystery,scream,Melissa Barrera Mason Gooding Jenna Ortega Tyl...


In [54]:
my_df = pd.concat([df_2021_new, df_2022_new], ignore_index=True)

In [55]:
my_df.isna().sum()

Unnamed: 0,0
director_name,0
actor_1_name,0
actor_2_name,0
actor_3_name,0
genres,2
movie_title,0
col_merge,2


In [56]:
my_df = my_df.dropna(how='any')

In [57]:
data_till_2020 = pd.read_csv('/content/data_till_2020.csv')
data_till_2020

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,col_merge
0,Adam Robitel,Lin Shaye,Angus Sampson,Leigh Whannell,Horror Thriller,insidious: the last key,Lin Shaye Angus Sampson Leigh Whannell Adam Ro...
1,Christopher Radcliff,Alex Pettyfer,James Freedson-Jackson,Emily Althaus,Drama Mystery,the strange ones,Alex Pettyfer James Freedson-Jackson Emily Alt...
2,Jaume Collet-Serra,Liam Neeson,Vera Farmiga,Patrick Wilson,Action Thriller Mystery,the commuter,Liam Neeson Vera Farmiga Patrick Wilson Jaume ...
3,Babak Najafi,Taraji P. Henson,Jahi Di'Allo Winston,Billy Brown,Thriller Action Crime,proud mary,Taraji P. Henson Jahi Di'Allo Winston Billy Br...
4,Brett Donowho,Bruce Willis,Cole Hauser,Shawn Ashmore,Action Crime Thriller,acts of violence,Bruce Willis Cole Hauser Shawn Ashmore Brett D...
...,...,...,...,...,...,...,...
6103,Pete Docter,Jamie Foxx,Tina Fey,Graham Norton,Animation Family Comedy Fantasy,soul,Jamie Foxx Tina Fey Graham Norton Pete Docter ...
6104,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,Family Action Fantasy Comedy,we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...
6105,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...
6106,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Thriller Crime Drama,promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...


In [58]:
data_till_2022 = pd.concat([my_df, data_till_2020], ignore_index=True)
#data_till_2022

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,col_merge
0,Roseanne Liang,Chloë Grace Moretz,Taylor John Smith,Beulah Koale,Horror Action War,shadow in the cloud,Chloë Grace Moretz Taylor John Smith Beulah Ko...
1,Anthony Leone,Amy Cay,Brian Patrick Butler,Michael C. Burgess,Adventure Family TV Movie Western,hacksaw,Amy Cay Brian Patrick Butler Michael C. Burges...
2,Yaniv Raz,Lucas Jade Zumann,Taylor Russell,Chase Stokes,Comedy Drama,dr. bird's advice for sad poets,Lucas Jade Zumann Taylor Russell Chase Stokes ...
3,Ramin Bahrani,Adarsh Gourav,Rajkummar Rao,Priyanka Chopra Jonas,Drama,the white tiger,Adarsh Gourav Rajkummar Rao Priyanka Chopra Jo...
4,Doug Liman,Anne Hathaway,Chiwetel Ejiofor,Stephen Merchant,Comedy Crime Romance,locked down,Anne Hathaway Chiwetel Ejiofor Stephen Merchan...
...,...,...,...,...,...,...,...
6783,Pete Docter,Jamie Foxx,Tina Fey,Graham Norton,Animation Family Comedy Fantasy,soul,Jamie Foxx Tina Fey Graham Norton Pete Docter ...
6784,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,Family Action Fantasy Comedy,we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...
6785,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...
6786,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Thriller Crime Drama,promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...


In [59]:
data_till_2022.isna().sum()

Unnamed: 0,0
director_name,2
actor_1_name,0
actor_2_name,0
actor_3_name,0
genres,0
movie_title,0
col_merge,0


In [60]:
data_till_2022 = data_till_2022.dropna(how='any')

In [65]:
data_till_2022.to_csv('data_till_2022.csv', index=False)

# **Extracting features of 2023 movies from Wikipedia**

In [61]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2023"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [62]:
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [63]:
df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.
0,J A N U A R Y,6,M3GAN,Universal Pictures / Blumhouse Productions / A...,Gerard Johnstone (director); Akela Cooper (scr...,[3]
1,J A N U A R Y,6,The Old Way,Saban Films / Saturn Films,Brett Donowho (director); Carl W. Lucas (scree...,[4]
2,J A N U A R Y,11,The Devil Conspiracy,Samuel Goldwyn Films,Nathan Frankowski (director); Ed Alan (screenp...,[5]
3,J A N U A R Y,13,Plane,Lionsgate / MadRiver Pictures / Di Bonaventura...,Jean-François Richet (director); Charles Cummi...,[6]
4,J A N U A R Y,13,House Party,Warner Bros. Pictures / New Line Cinema,"Calmatic (director); Jamal Olori, Stephen Glov...",[7]


In [68]:
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))
df.head()

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,Ref.,genres
0,J A N U A R Y,6,M3GAN,Universal Pictures / Blumhouse Productions / A...,Gerard Johnstone (director); Akela Cooper (scr...,[3],Science Fiction Horror
1,J A N U A R Y,6,The Old Way,Saban Films / Saturn Films,Brett Donowho (director); Carl W. Lucas (scree...,[4],Western Drama
2,J A N U A R Y,11,The Devil Conspiracy,Samuel Goldwyn Films,Nathan Frankowski (director); Ed Alan (screenp...,[5],Horror Fantasy Science Fiction Thriller
3,J A N U A R Y,13,Plane,Lionsgate / MadRiver Pictures / Di Bonaventura...,Jean-François Richet (director); Charles Cummi...,[6],Action Adventure Thriller
4,J A N U A R Y,13,House Party,Warner Bros. Pictures / New Line Cinema,"Calmatic (director); Jamal Olori, Stephen Glov...",[7],Comedy


In [69]:
df_2023 = df[['Title', 'Cast and crew', 'genres']]
df_2023.head()

Unnamed: 0,Title,Cast and crew,genres
0,M3GAN,Gerard Johnstone (director); Akela Cooper (scr...,Science Fiction Horror
1,The Old Way,Brett Donowho (director); Carl W. Lucas (scree...,Western Drama
2,The Devil Conspiracy,Nathan Frankowski (director); Ed Alan (screenp...,Horror Fantasy Science Fiction Thriller
3,Plane,Jean-François Richet (director); Charles Cummi...,Action Adventure Thriller
4,House Party,"Calmatic (director); Jamal Olori, Stephen Glov...",Comedy


In [70]:
df_2023['director_name'] = df_2023['Cast and crew'].apply(extract_director)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023['director_name'] = df_2023['Cast and crew'].apply(extract_director)


In [71]:
df_2023['actor_1_name'] = df_2023['Cast and crew'].map(lambda x: get_actor1(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023['actor_1_name'] = df_2023['Cast and crew'].map(lambda x: get_actor1(x))


In [72]:
df_2023['actor_2_name'] = df_2023['Cast and crew'].map(lambda x: get_actor2(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2023['actor_2_name'] = df_2023['Cast and crew'].map(lambda x: get_actor2(x))


In [73]:
df_2023['actor_3_name'] = df_2023['Cast and crew'].map(lambda x: get_actor3(x))

In [74]:
df_2023

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,M3GAN,Gerard Johnstone (director); Akela Cooper (scr...,Science Fiction Horror,Gerard Johnstone,Allison Williams,Violet McGraw,Amie Donald
1,The Old Way,Brett Donowho (director); Carl W. Lucas (scree...,Western Drama,Brett Donowho,Nicolas Cage,Ryan Kiera Armstrong,
2,The Devil Conspiracy,Nathan Frankowski (director); Ed Alan (screenp...,Horror Fantasy Science Fiction Thriller,Nathan Frankowski,Alice Orr-Ewing,Joe Doyle,Eveline Hall
3,Plane,Jean-François Richet (director); Charles Cummi...,Action Adventure Thriller,ois Richet,Gerard Butler,Mike Colter,Yoson An
4,House Party,"Calmatic (director); Jamal Olori, Stephen Glov...",Comedy,Calmatic,Tosin Cole,Jacob Latimore,Karen Obilom
...,...,...,...,...,...,...,...
338,Memory,Michel Franco (director/screenplay); Jessica C...,Action Thriller Crime,Michel Franco,Jessica Chastain,Peter Sarsgaard,Merritt Wever
339,The Color Purple,"Blitz Bazawule (director), Marcus Gardley (scr...",Drama,Blitz Bazawule,Fantasia Barrino,Taraji P. Henson,Danielle Brooks
340,The Boys in the Boat,"George Clooney (director), Mark L. Smith (scre...",Drama History,George Clooney,Callum Turner,Joel Edgerton,Peter Guinness
341,Ferrari,"Michael Mann (director), Troy Kennedy Martin (...",History Drama,Michael Mann,Adam Driver,Penelope Cruz,Shailene Woodley


In [75]:
df_2023 = df_2023.rename(columns={'Title':'movie_title'})

In [76]:
df_2023_new = df_2023.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [77]:
df_2023_new['actor_2_name'] = df_2023_new['actor_2_name'].replace(np.nan, 'unknown')
df_2023_new['actor_3_name'] = df_2023_new['actor_3_name'].replace(np.nan, 'unknown')

In [78]:
df_2023_new['movie_title'] = df_2023_new['movie_title'].str.lower()

In [79]:
df_2023_new['col_merge'] = df_2023_new['actor_1_name'] + ' ' + df_2023_new['actor_2_name'] + ' '+ df_2023_new['actor_3_name'] + ' '+ df_2023_new['director_name'] +' ' + df_2023_new['genres']
df_2023_new.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,col_merge
0,Gerard Johnstone,Allison Williams,Violet McGraw,Amie Donald,Science Fiction Horror,m3gan,Allison Williams Violet McGraw Amie Donald Ger...
1,Brett Donowho,Nicolas Cage,Ryan Kiera Armstrong,unknown,Western Drama,the old way,Nicolas Cage Ryan Kiera Armstrong unknown Bret...
2,Nathan Frankowski,Alice Orr-Ewing,Joe Doyle,Eveline Hall,Horror Fantasy Science Fiction Thriller,the devil conspiracy,Alice Orr-Ewing Joe Doyle Eveline Hall Nathan ...
3,ois Richet,Gerard Butler,Mike Colter,Yoson An,Action Adventure Thriller,plane,Gerard Butler Mike Colter Yoson An ois Richet ...
4,Calmatic,Tosin Cole,Jacob Latimore,Karen Obilom,Comedy,house party,Tosin Cole Jacob Latimore Karen Obilom Calmati...


In [80]:
data_till_2023 = pd.concat([data_till_2022, df_2023_new], ignore_index=True)

In [81]:
data_till_2023.isna().sum()

Unnamed: 0,0
director_name,0
actor_1_name,0
actor_2_name,0
actor_3_name,0
genres,0
movie_title,0
col_merge,0


# **Extracting features of 2024 movies from Wikipedia**

In [82]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2024"
df1 = pd.read_html(link, header=0)[2]
df2 = pd.read_html(link, header=0)[3]
df3 = pd.read_html(link, header=0)[4]
df4 = pd.read_html(link, header=0)[5]

In [83]:
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [84]:
df.head()

Unnamed: 0,Opening,Title,Production company,Cast and crew,Ref.,Opening.1
0,January 2,The Mummy Murders,Gravitas Ventures,Colin Bressler (director/screenplay); Will Don...,[3],
1,January 3,Self Reliance,"Neon, Hulu , MRC , Paramount Global Content Di...",Jake Johnson (director/screenplay); Jake Johns...,[4],
2,January 4,DarkGame,Gravitas Ventures,"Howard J. Ford (director); Gary Grant, Niall J...",[5],
3,January 5,Night Swim,"Universal Pictures, Blumhouse Productions , At...",Bryce McGuire (director/screenplay); Wyatt Rus...,[6],
4,January 5,He Went That Way,"Vertical Entertainment, Mister Smith Entertain...",Jeffrey Darling (director); Evan M. Wiener (sc...,[7],


In [85]:
df['genres'] = df['Title'].map(lambda x: get_genre(str(x)))
df.head()

Error accessing movie ID for title 'The Geeks (Los Frikis)': getattr(): attribute name must be string


Unnamed: 0,Opening,Title,Production company,Cast and crew,Ref.,Opening.1,genres
0,January 2,The Mummy Murders,Gravitas Ventures,Colin Bressler (director/screenplay); Will Don...,[3],,Horror Crime
1,January 3,Self Reliance,"Neon, Hulu , MRC , Paramount Global Content Di...",Jake Johnson (director/screenplay); Jake Johns...,[4],,Comedy Thriller
2,January 4,DarkGame,Gravitas Ventures,"Howard J. Ford (director); Gary Grant, Niall J...",[5],,Horror Thriller
3,January 5,Night Swim,"Universal Pictures, Blumhouse Productions , At...",Bryce McGuire (director/screenplay); Wyatt Rus...,[6],,Horror
4,January 5,He Went That Way,"Vertical Entertainment, Mister Smith Entertain...",Jeffrey Darling (director); Evan M. Wiener (sc...,[7],,Thriller Crime Drama


In [86]:
df_2024 = df[['Title', 'Cast and crew', 'genres']]
df_2024.head()

Unnamed: 0,Title,Cast and crew,genres
0,The Mummy Murders,Colin Bressler (director/screenplay); Will Don...,Horror Crime
1,Self Reliance,Jake Johnson (director/screenplay); Jake Johns...,Comedy Thriller
2,DarkGame,"Howard J. Ford (director); Gary Grant, Niall J...",Horror Thriller
3,Night Swim,Bryce McGuire (director/screenplay); Wyatt Rus...,Horror
4,He Went That Way,Jeffrey Darling (director); Evan M. Wiener (sc...,Thriller Crime Drama


In [88]:
df_2024

Unnamed: 0,Title,Cast and crew,genres
0,The Mummy Murders,Colin Bressler (director/screenplay); Will Don...,Horror Crime
1,Self Reliance,Jake Johnson (director/screenplay); Jake Johns...,Comedy Thriller
2,DarkGame,"Howard J. Ford (director); Gary Grant, Niall J...",Horror Thriller
3,Night Swim,Bryce McGuire (director/screenplay); Wyatt Rus...,Horror
4,He Went That Way,Jeffrey Darling (director); Evan M. Wiener (sc...,Thriller Crime Drama
...,...,...,...
469,Nosferatu,Robert Eggers (director/screenplay); Bill Skar...,Drama Fantasy Horror
470,A Complete Unknown,James Mangold (director/screenplay); Jay Cocks...,Drama Music History
471,The Fire Inside,Rachel Morrison (director); Barry Jenkins (scr...,Drama
472,Babygirl,Halina Reijn (director/screenplay); Nicole Kid...,Drama


In [89]:
non_string_rows = df_2024[~df_2024['Cast and crew'].apply(lambda x: isinstance(x, str))]

# Print the rows
print(non_string_rows)

    Title Cast and crew       genres
233   NaN           NaN  Drama Crime


In [90]:
df_2024 = df_2024.drop(233) #Dropping row 233 as it was NaN

In [91]:
df_2024['director_name'] = df_2024['Cast and crew'].apply(extract_director)

In [92]:
df_2024['actor_1_name'] = df_2024['Cast and crew'].map(lambda x: get_actor1(x))

In [93]:
df_2024['actor_2_name'] = df_2024['Cast and crew'].map(lambda x: get_actor2(x))

In [94]:
df_2024['actor_3_name'] = df_2024['Cast and crew'].map(lambda x: get_actor3(x))

In [95]:
df_2024

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,The Mummy Murders,Colin Bressler (director/screenplay); Will Don...,Horror Crime,Colin Bressler,Leila Annastasia Scott,Jason Scarbrough,Jeff Caperton
1,Self Reliance,Jake Johnson (director/screenplay); Jake Johns...,Comedy Thriller,Jake Johnson,Jake Johnson,Anna Kendrick,Natalie Morales
2,DarkGame,"Howard J. Ford (director); Gary Grant, Niall J...",Horror Thriller,Howard J. Ford,Ed Westwick,Andrew P. Stephen,Natalya Tsvetkova
3,Night Swim,Bryce McGuire (director/screenplay); Wyatt Rus...,Horror,Bryce McGuire,Wyatt Russell,Kerry Condon,Amélie Hoeferle
4,He Went That Way,Jeffrey Darling (director); Evan M. Wiener (sc...,Thriller Crime Drama,Jeffrey Darling,Jacob Elordi,Zachary Quinto,Patrick J. Adams
...,...,...,...,...,...,...,...
469,Nosferatu,Robert Eggers (director/screenplay); Bill Skar...,Drama Fantasy Horror,Robert Eggers,Bill Skarsgård,Nicholas Hoult,Lily-Rose Depp
470,A Complete Unknown,James Mangold (director/screenplay); Jay Cocks...,Drama Music History,James Mangold,Timothée Chalamet,Edward Norton,Elle Fanning
471,The Fire Inside,Rachel Morrison (director); Barry Jenkins (scr...,Drama,Rachel Morrison,Ryan Destiny,Brian Tyree Henry,Judy Greer
472,Babygirl,Halina Reijn (director/screenplay); Nicole Kid...,Drama,Halina Reijn,Nicole Kidman,Harris Dickinson,Sophie Wilde


In [96]:
df_2024 = df_2024.rename(columns={'Title':'movie_title'})

In [97]:
df_2024_new = df_2024.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [98]:
df_2024_new['actor_2_name'] = df_2024_new['actor_2_name'].replace(np.nan, 'unknown')
df_2024_new['actor_3_name'] = df_2024_new['actor_3_name'].replace(np.nan, 'unknown')

In [99]:
df_2024_new['movie_title'] = df_2024_new['movie_title'].str.lower()

In [100]:
df_2024_new['col_merge'] = df_2024_new['actor_1_name'] + ' ' + df_2024_new['actor_2_name'] + ' '+ df_2024_new['actor_3_name'] + ' '+ df_2024_new['director_name'] +' ' + df_2024_new['genres']
df_2024_new.head()

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,col_merge
0,Colin Bressler,Leila Annastasia Scott,Jason Scarbrough,Jeff Caperton,Horror Crime,the mummy murders,Leila Annastasia Scott Jason Scarbrough Jeff C...
1,Jake Johnson,Jake Johnson,Anna Kendrick,Natalie Morales,Comedy Thriller,self reliance,Jake Johnson Anna Kendrick Natalie Morales Jak...
2,Howard J. Ford,Ed Westwick,Andrew P. Stephen,Natalya Tsvetkova,Horror Thriller,darkgame,Ed Westwick Andrew P. Stephen Natalya Tsvetkov...
3,Bryce McGuire,Wyatt Russell,Kerry Condon,Amélie Hoeferle,Horror,night swim,Wyatt Russell Kerry Condon Amélie Hoeferle Bry...
4,Jeffrey Darling,Jacob Elordi,Zachary Quinto,Patrick J. Adams,Thriller Crime Drama,he went that way,Jacob Elordi Zachary Quinto Patrick J. Adams J...


In [101]:
data_till_2024 = pd.concat([data_till_2023, df_2024_new], ignore_index=True)

In [102]:
data_till_2024.isna().sum()

Unnamed: 0,0
director_name,1
actor_1_name,0
actor_2_name,0
actor_3_name,0
genres,1
movie_title,0
col_merge,2


In [103]:
data_till_2024 = data_till_2024.dropna(how='any')

In [104]:
data_till_2024.isna().sum()

Unnamed: 0,0
director_name,0
actor_1_name,0
actor_2_name,0
actor_3_name,0
genres,0
movie_title,0
col_merge,0


In [105]:
data_till_2024.to_csv('data_till_2024.csv', index=False)