In [19]:
import pandas as pd
import numpy as np
import requests
import bs4 as bs
import urllib.request

## Extracting features of 2020 movies from Wikipedia

In [20]:
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"

In [21]:
source = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(source,'lxml')

In [22]:
tables = soup.find_all('table',class_='wikitable sortable')

In [23]:
len(tables)

4

In [24]:
type(tables[0])

bs4.element.Tag

In [25]:
df1 = pd.read_html(str(tables[0]))[0]
df2 = pd.read_html(str(tables[1]))[0]
df3 = pd.read_html(str(tables[2]))[0]
df4 = pd.read_html(str(tables[3]).replace("'1\"\'",'"1"'))[0] # avoided "ValueError: invalid literal for int() with base 10: '1"'

In [26]:
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

In [27]:
df

Unnamed: 0,Opening,Opening.1,Title,Production company,Cast and crew,.mw-parser-output .tooltip-dotted{border-bottom:1px dotted;cursor:help}Ref.,Ref.
0,J A N U A R Y,3,The Grudge,Screen Gems / Stage 6 Films / Ghost House Pict...,Nicolas Pesce (director/screenplay); Andrea Ri...,[2],
1,J A N U A R Y,10,Underwater,20th Century Fox / TSG Entertainment / Chernin...,"William Eubank (director); Brian Duffield, Ada...",[3],
2,J A N U A R Y,10,Like a Boss,Paramount Pictures / Artists First,"Miguel Arteta (director); Sam Pitman, Adam Col...",[4],
3,J A N U A R Y,10,Three Christs,IFC Films,Jon Avnet (director/screenplay); Eric Nazarian...,,
4,J A N U A R Y,10,Inherit the Viper,Lionsgate / Barry Films / Tycor International ...,Anthony Jerjen (director); Andrew Crabtree (sc...,[5],
...,...,...,...,...,...,...,...
272,D E C E M B E R,25,We Can Be Heroes,Netflix / Troublemaker Studios / Double R Prod...,Robert Rodriguez (director/screenplay); Priyan...,,[245]
273,D E C E M B E R,25,News of the World,Universal Pictures / Playtone / Perfect World ...,Paul Greengrass (director/screenplay); Luke Da...,,[246]
274,D E C E M B E R,25,One Night in Miami...,Amazon Studios,Regina King (director); Kemp Powers (screenpla...,,[247]
275,D E C E M B E R,25,Promising Young Woman,Focus Features / FilmNation Entertainment,Emerald Fennell (director/screenplay); Carey M...,,[248]


In [28]:
df_2020 = df[['Title','Cast and crew']]

In [29]:
df_2020

Unnamed: 0,Title,Cast and crew
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...
1,Underwater,"William Eubank (director); Brian Duffield, Ada..."
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col..."
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...
...,...,...
272,We Can Be Heroes,Robert Rodriguez (director/screenplay); Priyan...
273,News of the World,Paul Greengrass (director/screenplay); Luke Da...
274,One Night in Miami...,Regina King (director); Kemp Powers (screenpla...
275,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...


In [30]:
!pip install tmdbv3api



In [36]:
from tmdbv3api import TMDb,Movie
import json
import requests
tmdb = TMDb()
tmdb.api_key = 'c28b1759fd2337a50743113e8e107860'

In [37]:

movie = Movie()

def get_genre(title):
    try:
        # Search for the movie by title
        search_results = movie.search(title)
        
        if search_results:
            movie_id = search_results[0].id
            response = requests.get(f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={tmdb.api_key}')
            movie_data = response.json()
            
            # Extract genres from the movie data
            genres = [genre['name'] for genre in movie_data['genres']]
            return genres
        else:
            return np.NaN  # Return NaN if movie is not found
    except Exception as e:
        print(f"Error getting genre for {title}: {e}")
        return np.NaN  # Return NaN in case of any error


In [None]:
df_2020['genres'] = df_2020['Title'].map(lambda x: get_genre(str(x)))

In [53]:
df_2020

Unnamed: 0,movie_title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,"[Horror, Mystery, Thriller]",Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,Underwater,"William Eubank (director); Brian Duffield, Ada...","[Horror, Science Fiction, Action, Adventure]",William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",[Comedy],Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...,[Drama],Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,"[Crime, Thriller, Drama]",Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs
...,...,...,...,...,...,...,...
272,We Can Be Heroes,Robert Rodriguez (director/screenplay); Priyan...,"[Family, Action, Fantasy, Comedy]",Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin
273,News of the World,Paul Greengrass (director/screenplay); Luke Da...,"[Drama, Western, Adventure]",Paul Greengrass,Tom Hanks,Helena Zengel,
274,One Night in Miami...,Regina King (director); Kemp Powers (screenpla...,[Drama],Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge
275,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...,"[Thriller, Crime, Drama]",Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie


In [54]:
def get_director(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (directors)" in x:
        return x.split(" (directors)")[0]
    else:
        return x.split(" (director/screenplay)")[0]

In [55]:
df_2020['director_name'] = df_2020['Cast and crew'].map(lambda x: get_director(str(x)))

In [56]:
def get_actor1(x):
    return ((x.split("screenplay); ")[-1]).split(", ")[0])

In [57]:
df_2020['actor_1_name'] = df_2020['Cast and crew'].map(lambda x: get_actor1(str(x)))

In [58]:
def get_actor2(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 2:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[1])

In [59]:
df_2020['actor_2_name'] = df_2020['Cast and crew'].map(lambda x: get_actor2(str(x)))

In [60]:
def get_actor3(x):
    if len((x.split("screenplay); ")[-1]).split(", ")) < 3:
        return np.NaN
    else:
        return ((x.split("screenplay); ")[-1]).split(", ")[2])

In [61]:
df_2020['actor_3_name'] = df_2020['Cast and crew'].map(lambda x: get_actor3(str(x)))

In [62]:
df_2020

Unnamed: 0,movie_title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,The Grudge,Nicolas Pesce (director/screenplay); Andrea Ri...,"[Horror, Mystery, Thriller]",Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho
1,Underwater,"William Eubank (director); Brian Duffield, Ada...","[Horror, Science Fiction, Action, Adventure]",William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick
2,Like a Boss,"Miguel Arteta (director); Sam Pitman, Adam Col...",[Comedy],Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek
3,Three Christs,Jon Avnet (director/screenplay); Eric Nazarian...,[Drama],Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins
4,Inherit the Viper,Anthony Jerjen (director); Andrew Crabtree (sc...,"[Crime, Thriller, Drama]",Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs
...,...,...,...,...,...,...,...
272,We Can Be Heroes,Robert Rodriguez (director/screenplay); Priyan...,"[Family, Action, Fantasy, Comedy]",Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin
273,News of the World,Paul Greengrass (director/screenplay); Luke Da...,"[Drama, Western, Adventure]",Paul Greengrass,Tom Hanks,Helena Zengel,
274,One Night in Miami...,Regina King (director); Kemp Powers (screenpla...,[Drama],Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge
275,Promising Young Woman,Emerald Fennell (director/screenplay); Carey M...,"[Thriller, Crime, Drama]",Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie


In [63]:
df_2020 = df_2020.rename(columns={'Title':'movie_title'})

In [64]:
new_df20 = df_2020.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres','movie_title']]

In [65]:
new_df20

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title
0,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho,"[Horror, Mystery, Thriller]",The Grudge
1,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick,"[Horror, Science Fiction, Action, Adventure]",Underwater
2,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek,[Comedy],Like a Boss
3,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins,[Drama],Three Christs
4,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs,"[Crime, Thriller, Drama]",Inherit the Viper
...,...,...,...,...,...,...
272,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,"[Family, Action, Fantasy, Comedy]",We Can Be Heroes
273,Paul Greengrass,Tom Hanks,Helena Zengel,,"[Drama, Western, Adventure]",News of the World
274,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,[Drama],One Night in Miami...
275,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,"[Thriller, Crime, Drama]",Promising Young Woman


In [67]:
new_df20['comb'] = new_df20['actor_1_name'].astype(str) + ' ' + \
                    new_df20['actor_2_name'].astype(str) + ' ' + \
                    new_df20['actor_3_name'].astype(str) + ' ' + \
                    new_df20['director_name'].astype(str) + ' ' + \
                    new_df20['genres'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

In [68]:
new_df20.isna().sum()

director_name     0
actor_1_name      0
actor_2_name      5
actor_3_name     28
genres            2
movie_title       0
comb              0
dtype: int64

In [69]:
new_df20 = new_df20.dropna(how='any')

In [70]:
new_df20.isna().sum()

director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
genres           0
movie_title      0
comb             0
dtype: int64

In [71]:
new_df20['movie_title'] = new_df20['movie_title'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df20['movie_title'] = new_df20['movie_title'].str.lower()


In [72]:
new_df20

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Nicolas Pesce,Andrea Riseborough,Demián Bichir,John Cho,"[Horror, Mystery, Thriller]",the grudge,Andrea Riseborough Demián Bichir John Cho Nico...
1,William Eubank,Kristen Stewart,Vincent Cassel,Jessica Henwick,"[Horror, Science Fiction, Action, Adventure]",underwater,Kristen Stewart Vincent Cassel Jessica Henwick...
2,Miguel Arteta,Tiffany Haddish,Rose Byrne,Salma Hayek,[Comedy],like a boss,Tiffany Haddish Rose Byrne Salma Hayek Miguel ...
3,Jon Avnet,Richard Gere,Peter Dinklage,Walton Goggins,[Drama],three christs,Richard Gere Peter Dinklage Walton Goggins Jon...
4,Anthony Jerjen,Josh Hartnett,Margarita Levieva,Chandler Riggs,"[Crime, Thriller, Drama]",inherit the viper,Josh Hartnett Margarita Levieva Chandler Riggs...
...,...,...,...,...,...,...,...
271,Pete Docter,Jamie Foxx,Tina Fey,Graham Norton,"[Animation, Family, Comedy, Fantasy]",soul,Jamie Foxx Tina Fey Graham Norton Pete Docter ...
272,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,"[Family, Action, Fantasy, Comedy]",we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...
274,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,[Drama],one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...
275,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,"[Thriller, Crime, Drama]",promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...


In [73]:
old_df = pd.read_csv('final_data.csv')

In [74]:
old_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Adventure Action Fantasy Comedy,pirates of the caribbean: dead men tell no tales,Johnny Depp Javier Bardem Geoffrey Rush Joachi...
1,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Action Adventure Fantasy Sci-Fi,justice league,Ben Affleck Henry Cavill Gal Gadot Zack Snyder...
2,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Action Adventure Fantasy Sci-Fi,thor: ragnarok,Chris Hemsworth Tom Hiddleston Cate Blanchett ...
3,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,Action Adventure Comedy Sci-Fi,guardians of the galaxy vol. 2,Chris Pratt Zoe Saldana Dave Bautista James Gu...
4,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Fantasy Action Adventure,the king's daughter,Pierce Brosnan William Hurt Benjamin Walker Se...
...,...,...,...,...,...,...,...
952,"Nick Bruno, Troy Quane",Will Smith,Tom Holland,Rashida Jones,Animation Action Adventure Comedy Family,spies in disguise,Will Smith Tom Holland Rashida Jones Nick Brun...
953,Greta Gerwig,Saoirse Ronan,Emma Watson,Florence Pugh,Drama Romance History,little women,Saoirse Ronan Emma Watson Florence Pugh Greta ...
954,Sam Mendes,George MacKay,Dean-Charles Chapman,Mark Strong,War History Thriller Drama,1917,George MacKay Dean-Charles Chapman Mark Strong...
955,Destin Daniel Cretton,Michael B. Jordan,Jamie Foxx,Brie Larson,Drama Crime History,just mercy,Michael B. Jordan Jamie Foxx Brie Larson Desti...


In [76]:
final_df = pd.concat([old_df, new_df20], ignore_index=True)

In [77]:
final_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,Joachim Rønning Espen Sandberg,Johnny Depp,Javier Bardem,Geoffrey Rush,Adventure Action Fantasy Comedy,pirates of the caribbean: dead men tell no tales,Johnny Depp Javier Bardem Geoffrey Rush Joachi...
1,Zack Snyder,Ben Affleck,Henry Cavill,Gal Gadot,Action Adventure Fantasy Sci-Fi,justice league,Ben Affleck Henry Cavill Gal Gadot Zack Snyder...
2,Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Action Adventure Fantasy Sci-Fi,thor: ragnarok,Chris Hemsworth Tom Hiddleston Cate Blanchett ...
3,James Gunn,Chris Pratt,Zoe Saldana,Dave Bautista,Action Adventure Comedy Sci-Fi,guardians of the galaxy vol. 2,Chris Pratt Zoe Saldana Dave Bautista James Gu...
4,Sean McNamara,Pierce Brosnan,William Hurt,Benjamin Walker,Fantasy Action Adventure,the king's daughter,Pierce Brosnan William Hurt Benjamin Walker Se...
...,...,...,...,...,...,...,...
1200,Pete Docter,Jamie Foxx,Tina Fey,Graham Norton,"[Animation, Family, Comedy, Fantasy]",soul,Jamie Foxx Tina Fey Graham Norton Pete Docter ...
1201,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,"[Family, Action, Fantasy, Comedy]",we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...
1202,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,[Drama],one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...
1203,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,"[Thriller, Crime, Drama]",promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...


In [78]:
final_df.to_csv('main_data.csv',index=False)