In [1]:
# Dependencies
import requests
import json
from config import tmdb_key, omdb_api_key_1_yp, omdb_api_key_2_mt, omdb_api_key_3_al, omdb_api_key_4_aj
from pprint import pprint
import pandas as pd
from pathlib import Path
import numpy as np


In [2]:
# Set the number of results per page and the total number of desired results
total_result_per_page = 20
total_wanted_results = 2800
# Calculate the total number of pages needed to retrieve the desired results
total_pages = int(total_wanted_results / total_result_per_page) 
current_page = 1

# Function to fetch data from a specific page using the API
def fetch_page(page_number):
        # Construct the URL with the page number to fetch data from
    url = f"https://api.themoviedb.org/3/discover/movie?&include_video=false&language=en-US&sort_by=popularity.desc&page={page_number}&region=US&with_origin_country=US"
     # Set the required headers for the API request
    headers = {
        "accept": "application/json",
        "Authorization": tmdb_key
    }
    # Make a GET request to the API and retrieve the data in JSON format
    response = requests.get(url, headers=headers).json()
    # Return the results from the response
    return response["results"]

# Initialize an empty list to store all retrieved movies
all_movies = []
#Loop through the range of page numbers to fetch data from each page
for page_number in range(1, total_pages + 1):
    all_movies += fetch_page(page_number)

#Print the total number of movies retrieved
print(len(all_movies))    

#Please uncomment the following if you'd like to check the json outcome
# print(len(response["results"]))
# print(json.dumps(response, indent=4, sort_keys=True))

2800


In [3]:
#Here we created this code to split the all_movies variable call the omdb api 4 times since it has a limitation of 1000 calls per day.

factor = int(len(all_movies) / 4)
#splitted data 
all_movies_splited_0 = all_movies[0:factor] 
all_movies_splited_1 = all_movies[factor:factor * 2]
all_movies_splited_2 = all_movies[factor * 2: factor * 3]
all_movies_splited_3 = all_movies[factor * 3: factor * 4]


In [4]:
# # #collect titles for omdb fetch
# imdb_titles = []
# for imdb_id in all_movies:
#     imdb_titles.append(imdb_id['title'])
# print(len(all_movies))
# imdb_titles 
# # pprint(all_movies[0])

In [5]:
# Function to fetch details of a specific movie using the API, from this details we wanted the finantial information
def fetch_movie_details(movie_id):
    # Construct the URL with the movie ID to fetch details
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    headers = {
        "accept": "application/json",
        "Authorization": tmdb_key
    }
    # Make a GET request to the API and retrieve the movie details in JSON format
    response = requests.get(url, headers=headers).json()
    # Return the movie details from the response
    return response

In [6]:
# Function to fetch omdb data 
def fetch_movie_omdb(movie_title, omdb_api_key):
    url = "http://www.omdbapi.com/"
    # print(url + movie_title)
    # Make a request to the OMDb API for each movie in the list.
    params = {
        't': movie_title,
        'apikey': omdb_api_key
    }
    response = requests.get(url, params=params)
    omdb_data = response.json()
    # Return the omdb data from the response
    return omdb_data

# http://www.omdbapi.com/?i=tt3896198&apikey=cb04c8ea
# http://www.omdbapi.com/?apikey=cb04c8ea&t=Borderlands
# http://www.omdbapi.com/?t=Borderlands&apikey=cb04c8ea

In [None]:
# elements = [all_movies_splited_0,all_movies_splited_1,all_movies_splited_2,all_movies_splited_3]
# api_key_elements = [omdb_api_key_1_yp, omdb_api_key_2_mt, omdb_api_key_3_al, omdb_api_key_4_aj]

elements = [
    {'api_key': omdb_api_key_1_yp, "movies": all_movies_splited_0},
    {'api_key': omdb_api_key_2_mt, "movies": all_movies_splited_1},
    {'api_key': omdb_api_key_3_al, "movies": all_movies_splited_2},
    {'api_key': omdb_api_key_4_aj, "movies": all_movies_splited_3}
]

for element in elements:
    # Iterate through each movie in the list of all movies
    for movie in element['movies']:
        # Fetch details of the current movie using its ID
        movie_details = fetch_movie_details(movie["id"])
        ## Fetch additional movie details(ratings) from the OMDb API using the movie title
        movie_omdb = fetch_movie_omdb(movie['title'], element['api_key'])
        # Update the movie dictionary with the fetched movie details
        movie.update(movie_details)
        movie.update(movie_omdb)

        #Create a key,value pair to determine if movie is financially successful based on revenue and budget comparison
        movie["financial_success"] = movie["revenue"] > movie["budget"]


In [8]:
# Extract rating data one layer up
imdb_rating_name = "Internet Movie Database"
rotten_tomatoes_rating_name = "Rotten Tomatoes"
metacritic_rating_name = 'Metacritic'

for element in elements:
    # Search for ratings
    for movie in element['movies']:
        # Check if "Ratings" key exists
        ratings = movie.get("Ratings", [])
        
        for rating in ratings:
            if imdb_rating_name == rating["Source"]:
                movie["imdb_rating"] = float(rating["Value"].split("/")[0])
            elif rotten_tomatoes_rating_name == rating["Source"]:
                movie["rotten_tomatoes_rating"] = float(rating["Value"].split("%")[0])
            elif metacritic_rating_name == rating["Source"]:
                movie["metacritic_rating"] = float(rating["Value"].split("/")[0])


In [9]:
# checking how many movies we have per group / element
for element in elements:
    print(len(element['movies']))

700
700
700
700


In [10]:
# clear out movies without ratings

# this function filter each movie based in our desired rule
def filter_movies_ratings(movie):
    # here our rule is defined: a movie is only valid for the filter if it has ratings
    return True if len(movie.get("Ratings", [])) > 0 else False

# we go through our groups of movies / elements
for element in elements:
    movies = element['movies']
    # filter and assign/replace the filtered movies
    element['movies'] = list(filter(filter_movies_ratings, movies))

In [11]:
# audit over movies to notice the amount that is left after filtering out the ones that didn't meat the valid criteria.
for element in elements:
    print(len(element['movies']))

671
665
669
669


In [12]:
#Pretty printing our first outcome 
pprint(all_movies_splited_0[3])

{'Actors': 'Ryan Reynolds, Hugh Jackman, Emma Corrin',
 'Awards': '4 wins & 1 nomination',
 'BoxOffice': '$605,188,101',
 'Country': 'United States, United Kingdom, Australia, New Zealand, Canada',
 'DVD': 'N/A',
 'Director': 'Shawn Levy',
 'Genre': 'Action, Adventure, Comedy',
 'Language': 'English, French, Spanish',
 'Metascore': 'N/A',
 'Plot': 'Deadpool is offered a place in the Marvel Cinematic Universe by the '
         'Time Variance Authority, but instead recruits a variant of Wolverine '
         'to save his universe from extinction.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BZmQxZWM5MzgtY2EzZC00OGUxLWE0Y2EtMDIwOTFlNmQ5MWMyXkEyXkFqcGc@._V1_SX300.jpg',
 'Production': 'N/A',
 'Rated': 'R',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.0/10'},
             {'Source': 'Rotten Tomatoes', 'Value': '78%'}],
 'Released': '26 Jul 2024',
 'Response': 'True',
 'Runtime': '128 min',
 'Title': 'Deadpool & Wolverine',
 'Type': 'movie',
 'Website': 'N/A',
 'Write

In [13]:
#Creating a dataframe
def create_dataframe(all_movies_data):
    df = pd.DataFrame(all_movies_data)
    print(df.info())
    return df

all_movies_df_0 = create_dataframe(all_movies_splited_0)
all_movies_df_1 = create_dataframe(all_movies_splited_1)
all_movies_df_2 = create_dataframe(all_movies_splited_2)
all_movies_df_3 = create_dataframe(all_movies_splited_3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 58 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   adult                   700 non-null    bool   
 1   backdrop_path           698 non-null    object 
 2   genre_ids               700 non-null    object 
 3   id                      700 non-null    int64  
 4   original_language       700 non-null    object 
 5   original_title          700 non-null    object 
 6   overview                700 non-null    object 
 7   popularity              700 non-null    float64
 8   poster_path             700 non-null    object 
 9   release_date            700 non-null    object 
 10  title                   700 non-null    object 
 11  video                   700 non-null    bool   
 12  vote_average            700 non-null    float64
 13  vote_count              700 non-null    int64  
 14  belongs_to_collection   382 non-null    ob

In [14]:
#Displaying all columns 
print(all_movies_df_3.columns)

Index(['adult', 'backdrop_path', 'genre_ids', 'id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'title', 'video', 'vote_average', 'vote_count',
       'belongs_to_collection', 'budget', 'genres', 'homepage', 'imdb_id',
       'origin_country', 'production_companies', 'production_countries',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'Title',
       'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Writer',
       'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster', 'Ratings',
       'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type', 'DVD',
       'BoxOffice', 'Production', 'Website', 'Response', 'financial_success',
       'imdb_rating', 'rotten_tomatoes_rating', 'metacritic_rating',
       'totalSeasons', 'Error'],
      dtype='object')


In [15]:
#creating a function to selec specific columns from the dataframe
# Selecting specific columns from the DataFrame
def select_specific_columns(dataframe):
    dataframe = dataframe[["imdb_id", "title","release_date", "Runtime", "Genre","overview","Director", "Actors", "Rated","imdbVotes", "popularity", "imdb_rating","rotten_tomatoes_rating", "metacritic_rating", "budget", "revenue", "financial_success"]]
    # Drop rows with any missing values
    dataframe.dropna(inplace=True)
    # Display the length of the DataFrame after dropping rows with missing values
    print(dataframe.columns)
    return dataframe


In [16]:
#Putting previous function into action
# dataframes = [all_movies_df_0,all_movies_df_1,all_movies_df_2,all_movies_df_3]

# for dataframe in dataframes:
    # select_specific_columns(dataframe)
    # display(dataframe.head())
all_movies_df_0 = select_specific_columns(all_movies_df_0)
all_movies_df_1 = select_specific_columns(all_movies_df_1)
all_movies_df_2 = select_specific_columns(all_movies_df_2)
all_movies_df_3 = select_specific_columns(all_movies_df_3)

print(all_movies_df_3.columns)

Index(['imdb_id', 'title', 'release_date', 'Runtime', 'Genre', 'overview',
       'Director', 'Actors', 'Rated', 'imdbVotes', 'popularity', 'imdb_rating',
       'rotten_tomatoes_rating', 'metacritic_rating', 'budget', 'revenue',
       'financial_success'],
      dtype='object')
Index(['imdb_id', 'title', 'release_date', 'Runtime', 'Genre', 'overview',
       'Director', 'Actors', 'Rated', 'imdbVotes', 'popularity', 'imdb_rating',
       'rotten_tomatoes_rating', 'metacritic_rating', 'budget', 'revenue',
       'financial_success'],
      dtype='object')
Index(['imdb_id', 'title', 'release_date', 'Runtime', 'Genre', 'overview',
       'Director', 'Actors', 'Rated', 'imdbVotes', 'popularity', 'imdb_rating',
       'rotten_tomatoes_rating', 'metacritic_rating', 'budget', 'revenue',
       'financial_success'],
      dtype='object')
Index(['imdb_id', 'title', 'release_date', 'Runtime', 'Genre', 'overview',
       'Director', 'Actors', 'Rated', 'imdbVotes', 'popularity', 'imdb_rating',
  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.dropna(inplace=True)


In [17]:
#splitting some columns 
def splitting_columns(dataframe):
    dataframe[['star_1','star_2','star_3']] = dataframe['Actors'].str.split(',', n=2, expand=True)
    dataframe[['genre_1','genre_2','genre_3']] = dataframe['Genre'].str.split(',', n=2, expand=True)
    # Split the Director column
    split_directors = dataframe['Director'].str.split(',', n=1, expand=True)

    # Create new columns with default values
    dataframe['director_1'] = split_directors[0]  # This will always exist
    dataframe['director_2'] = ''  # Initialize with empty strings or NaN

    # If there is a second column, assign it to director_2
    if split_directors.shape[1] > 1:
        dataframe['director_2'] = split_directors[1]

    return(dataframe)

In [18]:

#Putting previous function into action
all_movies_df_0 = splitting_columns(all_movies_df_0)
all_movies_df_1 = splitting_columns(all_movies_df_1)
all_movies_df_2 = splitting_columns(all_movies_df_2)
all_movies_df_3 = splitting_columns(all_movies_df_3)

# all_movies_df_0['Director'].str.split(',', n=1, expand=True)
all_movies_df_1.head()

Unnamed: 0,imdb_id,title,release_date,Runtime,Genre,overview,Director,Actors,Rated,imdbVotes,...,revenue,financial_success,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
1,tt10838180,The Matrix Resurrections,2021-12-16,148 min,"Action, Sci-Fi","Plagued by strange memories, Neo's life takes ...",Lana Wachowski,"Keanu Reeves, Carrie-Anne Moss, Yahya Abdul-Ma...",R,284577,...,156497322,False,Keanu Reeves,Carrie-Anne Moss,Yahya Abdul-Mateen II,Action,Sci-Fi,,Lana Wachowski,
3,tt1192628,Rango,2011-03-02,107 min,"Animation, Action, Adventure","When Rango, a lost family pet, accidentally wi...",Gore Verbinski,"Johnny Depp, Isla Fisher, Timothy Olyphant",PG,296553,...,245700000,True,Johnny Depp,Isla Fisher,Timothy Olyphant,Animation,Action,Adventure,Gore Verbinski,
4,tt15679400,Knock at the Cabin,2023-02-01,100 min,"Horror, Mystery, Thriller","While vacationing at a remote cabin, a young g...",M. Night Shyamalan,"Dave Bautista, Jonathan Groff, Ben Aldridge",R,121112,...,54700000,True,Dave Bautista,Jonathan Groff,Ben Aldridge,Horror,Mystery,Thriller,M. Night Shyamalan,
5,tt1302011,Kung Fu Panda 2,2011-05-25,90 min,"Animation, Action, Adventure",Po and his friends fight to stop a peacock vil...,Jennifer Yuh Nelson,"Jack Black, Angelina Jolie, Jackie Chan",PG,326509,...,665692281,True,Jack Black,Angelina Jolie,Jackie Chan,Animation,Action,Adventure,Jennifer Yuh Nelson,
6,tt27155038,"Ruby Gillman, Teenage Kraken",2023-06-28,91 min,"Animation, Action, Comedy","Ruby Gillman, a sweet and awkward high school ...","Kirk DeMicco, Faryn Pearl","Jane Fonda, Lana Condor, Toni Collette",PG,11541,...,46247409,False,Jane Fonda,Lana Condor,Toni Collette,Animation,Action,Comedy,Kirk DeMicco,Faryn Pearl


In [19]:
# Rename the following columns to keep the same format
all_movies_df_0 = all_movies_df_0.rename(columns={ "Runtime": "runtime", "Genre": "genre","Director": "director", "Actors": "actors", "Rated": "rated", "imdbVotes": "imdb_votes"})
all_movies_df_1 = all_movies_df_1.rename(columns={ "Runtime": "runtime", "Genre": "genre","Director": "director", "Actors": "actors", "Rated": "rated", "imdbVotes": "imdb_votes"})
all_movies_df_2 = all_movies_df_2.rename(columns={ "Runtime": "runtime", "Genre": "genre","Director": "director", "Actors": "actors", "Rated": "rated", "imdbVotes": "imdb_votes"})
all_movies_df_3 = all_movies_df_3.rename(columns={ "Runtime": "runtime", "Genre": "genre","Director": "director", "Actors": "actors", "Rated": "rated", "imdbVotes": "imdb_votes"})
all_movies_df_0    

Unnamed: 0,imdb_id,title,release_date,runtime,genre,overview,director,actors,rated,imdb_votes,...,revenue,financial_success,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2
7,tt0094721,Beetlejuice,1988-03-30,92 min,"Comedy, Fantasy",A newly dead New England couple seeks help fro...,Tim Burton,"Alec Baldwin, Geena Davis, Michael Keaton",PG,348874,...,74849333,True,Alec Baldwin,Geena Davis,Michael Keaton,Comedy,Fantasy,,Tim Burton,
24,tt27682129,Prey,2024-03-15,100 min,"Action, Adventure, Horror",A young couple is compelled to leave their Chr...,Dan Trachtenberg,"Amber Midthunder, Dakota Beavers, Dane DiLiegro",R,233550,...,0,False,Amber Midthunder,Dakota Beavers,Dane DiLiegro,Action,Adventure,Horror,Dan Trachtenberg,
30,tt0295701,xXx,2002-08-09,124 min,"Action, Adventure, Thriller",Xander Cage is your standard adrenaline junkie...,Rob Cohen,"Vin Diesel, Asia Argento, Marton Csokas",PG-13,187525,...,277448382,True,Vin Diesel,Asia Argento,Marton Csokas,Action,Adventure,Thriller,Rob Cohen,
37,tt4154756,Avengers: Infinity War,2018-04-25,149 min,"Action, Adventure, Sci-Fi",As the Avengers and their allies have continue...,"Anthony Russo, Joe Russo","Robert Downey Jr., Chris Hemsworth, Mark Ruffalo",PG-13,1226533,...,2052415039,True,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,Action,Adventure,Sci-Fi,Anthony Russo,Joe Russo
38,tt23778968,Thelma,2024-06-21,116 min,"Drama, Fantasy, Horror",When 93-year-old Thelma Post gets duped by a p...,Joachim Trier,"Eili Harboe, Kaya Wilkins, Henrik Rafaelsen",Not Rated,37145,...,9818454,True,Eili Harboe,Kaya Wilkins,Henrik Rafaelsen,Drama,Fantasy,Horror,Joachim Trier,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693,tt0343660,50 First Dates,2004-02-13,99 min,"Comedy, Drama, Romance",Henry is a player skilled at seducing women. B...,Peter Segal,"Adam Sandler, Drew Barrymore, Rob Schneider",PG-13,387408,...,196482882,True,Adam Sandler,Drew Barrymore,Rob Schneider,Comedy,Drama,Romance,Peter Segal,
694,tt6133466,The First Purge,2018-07-04,97 min,"Action, Horror, Sci-Fi",To push the crime rate below one percent for t...,Gerard McMurray,"Y'lan Noel, Lex Scott Davis, Joivan Wade",R,72101,...,137056262,True,Y'lan Noel,Lex Scott Davis,Joivan Wade,Action,Horror,Sci-Fi,Gerard McMurray,
695,tt2543164,Arrival,2016-11-10,116 min,"Drama, Mystery, Sci-Fi",Taking place after alien crafts land around th...,Denis Villeneuve,"Amy Adams, Jeremy Renner, Forest Whitaker",PG-13,783788,...,203388186,True,Amy Adams,Jeremy Renner,Forest Whitaker,Drama,Mystery,Sci-Fi,Denis Villeneuve,
696,tt1528100,Exodus: Gods and Kings,2014-12-03,150 min,"Action, Adventure, Drama",The defiant leader Moses rises up against the ...,Ridley Scott,"Christian Bale, Joel Edgerton, Ben Kingsley",PG-13,180640,...,268031828,True,Christian Bale,Joel Edgerton,Ben Kingsley,Action,Adventure,Drama,Ridley Scott,


In [20]:
# Format the release_date column to datetime forma
dataframes = [all_movies_df_0,all_movies_df_1,all_movies_df_2,all_movies_df_3]

for dataframe in dataframes:
    dataframe['release_year'] = pd.to_datetime(dataframe['release_date']).dt.year
    dataframe['imdb_rating'] = dataframe['imdb_rating'].astype('float')
    dataframe['rotten_tomatoes_rating'] = dataframe['rotten_tomatoes_rating'].astype('float')
    dataframe['metacritic_rating'] = dataframe['metacritic_rating'].astype('float')
    dataframe['financial_success'] = dataframe['financial_success'].astype('bool')

all_movies_df_2.head()

Unnamed: 0,imdb_id,title,release_date,runtime,genre,overview,director,actors,rated,imdb_votes,...,financial_success,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2,release_year
2,tt2011159,No Good Deed,2014-09-10,84 min,"Crime, Drama, Horror","Terri is a devoted wife and mother of two, liv...",Sam Miller,"Taraji P. Henson, Idris Elba, Leslie Bibb",PG-13,22194,...,True,Taraji P. Henson,Idris Elba,Leslie Bibb,Crime,Drama,Horror,Sam Miller,,2014
6,tt3110958,Now You See Me 2,2016-06-02,129 min,"Action, Adventure, Comedy",One year after outwitting the FBI and winning ...,Jon M. Chu,"Jesse Eisenberg, Mark Ruffalo, Woody Harrelson",PG-13,325510,...,True,Jesse Eisenberg,Mark Ruffalo,Woody Harrelson,Action,Adventure,Comedy,Jon M. Chu,,2016
7,tt0493464,Wanted,2008-06-19,110 min,"Action, Crime, Thriller",Doormat Wesley Gibson is an office worker whos...,Timur Bekmambetov,"Angelina Jolie, James McAvoy, Morgan Freeman",R,411856,...,True,Angelina Jolie,James McAvoy,Morgan Freeman,Action,Crime,Thriller,Timur Bekmambetov,,2008
8,tt2024544,12 Years a Slave,2013-10-18,134 min,"Biography, Drama, History","In the pre-Civil War United States, Solomon No...",Steve McQueen,"Chiwetel Ejiofor, Michael Kenneth Williams, Mi...",R,747876,...,True,Chiwetel Ejiofor,Michael Kenneth Williams,Michael Fassbender,Biography,Drama,History,Steve McQueen,,2013
9,tt1615918,Alvin and the Chipmunks: Chipwrecked,2011-12-14,87 min,"Adventure, Comedy, Family","Playing around while aboard a cruise ship, the...",Mike Mitchell,"Justin Long, Matthew Gray Gubler, Jesse McCartney",G,40342,...,True,Justin Long,Matthew Gray Gubler,Jesse McCartney,Adventure,Comedy,Family,Mike Mitchell,,2011


In [21]:
#Dropping unnecesary columns
all_movies_df_0.drop(columns=['release_date','genre','director','actors'], inplace=True)
all_movies_df_1.drop(columns=['release_date','genre','director','actors'], inplace=True)
all_movies_df_2.drop(columns=['release_date','genre','director','actors'], inplace=True)
all_movies_df_3.drop(columns=['release_date','genre','director','actors'], inplace=True)


In [22]:
all_movies_df_2.dtypes

imdb_id                    object
title                      object
runtime                    object
overview                   object
rated                      object
imdb_votes                 object
popularity                float64
imdb_rating               float64
rotten_tomatoes_rating    float64
metacritic_rating         float64
budget                      int64
revenue                     int64
financial_success            bool
star_1                     object
star_2                     object
star_3                     object
genre_1                    object
genre_2                    object
genre_3                    object
director_1                 object
director_2                 object
release_year                int32
dtype: object

In [23]:
print(all_movies_df_2['financial_success'].dtype)

bool


In [24]:
# outcome
# rules: financial_success == true && imdb_rating > 7 && rotten_tomatoes_rating > 60 && metacritic_rating > 50

# Create a mask for each condition
mask_financial_success = all_movies_df_0['financial_success'] == True
mask_imdb_rating = all_movies_df_0['imdb_rating'] > 6.0
mask_rotten_tomatoes_rating = all_movies_df_0['rotten_tomatoes_rating'] > 60
mask_metacritic_rating = all_movies_df_0['metacritic_rating'] > 60
all_movies_df_0['outcome'] = mask_financial_success & mask_imdb_rating & mask_rotten_tomatoes_rating & mask_metacritic_rating

mask_financial_success = all_movies_df_1['financial_success'] == True
mask_imdb_rating = all_movies_df_1['imdb_rating'] > 6.0
mask_rotten_tomatoes_rating = all_movies_df_1['rotten_tomatoes_rating'] > 60
mask_metacritic_rating = all_movies_df_1['metacritic_rating'] > 60
all_movies_df_1['outcome'] = mask_financial_success & mask_imdb_rating & mask_rotten_tomatoes_rating & mask_metacritic_rating

mask_financial_success = all_movies_df_2['financial_success'] == True
mask_imdb_rating = all_movies_df_2['imdb_rating'] > 6.0
mask_rotten_tomatoes_rating = all_movies_df_2['rotten_tomatoes_rating'] > 60
mask_metacritic_rating = all_movies_df_2['metacritic_rating'] > 60
all_movies_df_2['outcome'] = mask_financial_success & mask_imdb_rating & mask_rotten_tomatoes_rating & mask_metacritic_rating

mask_financial_success = all_movies_df_3['financial_success'] == True
mask_imdb_rating = all_movies_df_3['imdb_rating'] > 6.0
mask_rotten_tomatoes_rating = all_movies_df_3['rotten_tomatoes_rating'] > 60
mask_metacritic_rating = all_movies_df_3['metacritic_rating'] > 60
all_movies_df_3['outcome'] = mask_financial_success & mask_imdb_rating & mask_rotten_tomatoes_rating & mask_metacritic_rating


# mask_financial_success = all_movies_df_2['financial_success'] == True
# mask_imdb_rating = all_movies_df_2['imdb_rating'] > 6.0
# mask_rotten_tomatoes_rating = all_movies_df_2['rotten_tomatoes_rating'] > 60
# mask_metacritic_rating = all_movies_df_2['metacritic_rating'] > 60
# mask_imdb_rating

In [25]:
# all_movies_df_2.drop(columns=['financial_success', 'imdb_rating', 'rotten_tomatoes_rating','metacritic_rating'], inplace=True)
# all_movies_df_2.head()

In [26]:
#display dataframe
all_movies_df_2.head()

Unnamed: 0,imdb_id,title,runtime,overview,rated,imdb_votes,popularity,imdb_rating,rotten_tomatoes_rating,metacritic_rating,...,star_1,star_2,star_3,genre_1,genre_2,genre_3,director_1,director_2,release_year,outcome
2,tt2011159,No Good Deed,84 min,"Terri is a devoted wife and mother of two, liv...",PG-13,22194,37.883,5.6,12.0,26.0,...,Taraji P. Henson,Idris Elba,Leslie Bibb,Crime,Drama,Horror,Sam Miller,,2014,False
6,tt3110958,Now You See Me 2,129 min,One year after outwitting the FBI and winning ...,PG-13,325510,33.684,6.4,33.0,46.0,...,Jesse Eisenberg,Mark Ruffalo,Woody Harrelson,Action,Adventure,Comedy,Jon M. Chu,,2016,False
7,tt0493464,Wanted,110 min,Doormat Wesley Gibson is an office worker whos...,R,411856,37.809,6.7,71.0,64.0,...,Angelina Jolie,James McAvoy,Morgan Freeman,Action,Crime,Thriller,Timur Bekmambetov,,2008,True
8,tt2024544,12 Years a Slave,134 min,"In the pre-Civil War United States, Solomon No...",R,747876,37.78,8.1,95.0,96.0,...,Chiwetel Ejiofor,Michael Kenneth Williams,Michael Fassbender,Biography,Drama,History,Steve McQueen,,2013,True
9,tt1615918,Alvin and the Chipmunks: Chipwrecked,87 min,"Playing around while aboard a cruise ship, the...",G,40342,37.673,4.4,10.0,24.0,...,Justin Long,Matthew Gray Gubler,Jesse McCartney,Adventure,Comedy,Family,Mike Mitchell,,2011,False


In [28]:
#Saving our DataFrame into 4 different csvs.
all_movies_df_0.to_csv("Resources/movies_dataset_0.csv", index=False)
all_movies_df_1.to_csv("Resources/movies_dataset_1.csv", index=False)
all_movies_df_2.to_csv("Resources/movies_dataset_2.csv", index=False)
all_movies_df_3.to_csv("Resources/movies_dataset_3.csv", index=False)