In [27]:
import pandas as pd
import requests
import warnings
warnings.filterwarnings('ignore')
from config import omdb_key
from config import tmdb_key
import datetime

In [2]:
# https://www.kaggle.com/abhimanyudasarwar/netflix-originals

file_path = 'Resources/netflix_originals.csv'

In [3]:
netflix_titles_df = pd.read_csv(file_path)

print(netflix_titles_df.columns)
netflix_titles_df.head(2)

Index(['Title', 'Genre', 'Original Network', 'Premiere', 'Seasons', 'Length',
       'Netflix Exclusive Regions', 'Status'],
      dtype='object')


Unnamed: 0,Title,Genre,Original Network,Premiere,Seasons,Length,Netflix Exclusive Regions,Status
0,House of Cards,Political drama,Netflix,"February 1, 2013","6 seasons, 73 episodes",42–59 min.,Worldwide,Ended
1,Hemlock Grove,Horror,Netflix,"April 19, 2013","3 seasons, 33 episodes",45–58 min.,Worldwide,Ended


In [39]:
netflix_titles_clean = netflix_titles_df[['Title', 'Genre', 'Premiere', 'Seasons']]

list_genre =('Making-of', 'Aftershow / Interview','Musical / Short')

# remove 1 rouge string value in year
netflix_titles_clean = netflix_titles_clean[netflix_titles_clean['Seasons'].isnull()]
netflix_titles_clean = netflix_titles_clean[netflix_titles_clean['Genre'] != list_genre]
netflix_titles_clean = netflix_titles_clean[~netflix_titles_clean['Premiere'].str.contains('2020', na=False)]
netflix_fix_titles = netflix_titles_clean.reset_index()

print(netflix_fix_titles.Title.count())
netflix_fix_titles.head(2)

568


Unnamed: 0,index,Title,Genre,Premiere,Seasons
0,497,Beasts of No Nation,War drama,"October 16, 2015",
1,498,The Ridiculous 6,Western,"December 11, 2015",


In [5]:
# variable cause starswith() wasn't happy with '#'
pound_sign = '#'

# make dataframe for pound sign = True (startswith() returns True/False)
replace_pound_df = netflix_fix_titles.iloc[:, 0:2]
replace_pound_df.Title = replace_pound_df.Title.str.startswith(pound_sign)

# make df for ONLY the True values + primaryTitle from movie_df
pound_true_df = replace_pound_df.loc[replace_pound_df.Title == True]
pound_true_df['TITLE'] = netflix_fix_titles['Title']

# Fix titles to not have # in the front & clean up columns
pound_true_df['TITLE'] = pound_true_df['TITLE'].str.replace(pound_sign, "")
pound_true_clean_df = pound_true_df.drop(columns=['Title'])
pound_true_clean_df = pound_true_clean_df.rename(columns={'TITLE': 'Title'})

# Merge 2 dfs, replace blank primaryTitle_y values with na so you can do fillna into a 
# nice new clean has correct info column & delete primaryTitle_y/x
titles_combined_df = pd.merge(netflix_fix_titles, pound_true_clean_df, how='outer', on='index')
titles_combined_df['Title_y'] = titles_combined_df['Title_y'].str.replace(" ", "nan")
titles_combined_df["Title"] = titles_combined_df["Title_y"].fillna(titles_combined_df["Title_x"])
titles_fixed_df = titles_combined_df.drop(columns=['Title_y', 'Title_x'])

print(titles_fixed_df.Title.count())
titles_fixed_df.head(2)

568


Unnamed: 0,index,Genre,Premiere,Seasons,Title
0,497,War drama,"October 16, 2015",,Beasts of No Nation
1,498,Western,"December 11, 2015",,The Ridiculous 6


In [8]:
beginning_number = titles_fixed_df['index'].count()

titles_fixed_df['Title'] = titles_fixed_df['Title'].str.replace(" ", "+")
            
print(f"\nTHIS IS HOW MANY MOVIES: {titles_fixed_df.Title.count()}")
titles_fixed_df.head(2)


THIS IS HOW MANY MOVIES: 568


Unnamed: 0,index,Genre,Premiere,Seasons,Title
0,497,War drama,"October 16, 2015",,Beasts+of+No+Nation
1,498,Western,"December 11, 2015",,The+Ridiculous+6


# OMDB

In [9]:
# make URL
url_omdb = "http://www.omdbapi.com/?apikey="+ omdb_key + "&t="

#make OMDB dataframe
OMDB_netflix_df = netflix_titles_clean.copy()
OMDB_netflix_df['Metascore'] = ''
OMDB_netflix_df['imdbRating'] = ''
OMDB_netflix_df['imdbVotes'] = ''
OMDB_netflix_df['Title_from_OMDB'] = ''
OMDB_netflix_df['imdbID'] = ''


error_count = 0

for index, row in OMDB_netflix_df.iterrows():           
    try:
        movie_data = requests.get(url_omdb + str(OMDB_netflix_df.Title[index])).json()
        try:
            OMDB_netflix_df.loc[index, 'Metascore'] = movie_data['Metascore']
            OMDB_netflix_df.loc[index, 'imdbRating'] = movie_data['imdbRating']
            OMDB_netflix_df.loc[index, 'imdbVotes'] = movie_data['imdbVotes']
            OMDB_netflix_df.loc[index, 'Title_from_OMDB'] = movie_data['Title']
            OMDB_netflix_df.loc[index, 'imdbID'] = movie_data['imdbID']

        except (IndexError, KeyError, ValueError):
            error_count +=1
    except(ValueError, KeyError, IndexError):
        error_count +=1      

In [10]:
OMDB_netflix_df.head(2)

Unnamed: 0,Title,Genre,Premiere,Seasons,Metascore,imdbRating,imdbVotes,Title_from_OMDB,imdbID
497,Beasts of No Nation,War drama,"October 16, 2015",,79,7.7,71861,Beasts of No Nation,tt1365050
498,The Ridiculous 6,Western,"December 11, 2015",,18,4.8,43592,The Ridiculous 6,tt2479478


#### Save results as a CSV

In [11]:
file_outpath = f"Resources/OMDB_pull_Netflix_error_count{error_count}.csv"

OMDB_netflix_df.to_csv(file_outpath)

# TMDB call 1 - get TMDB ID numbers

In [40]:
# Make movie titles into a list so you can run it in TMDB pull 1
movies = titles_fixed_df['Title'].tolist()

In [72]:
imdb_id_url =  "https://api.themoviedb.org/3/movie/"

response_tmdb_id = []

error_count_TMDB = 0

for index, row in OMDB_netflix_df.iterrows():  
    movie_data = requests.get(imdb_id_url + OMDB_netflix_df.imdbID[index] + "/external_ids?api_key=" + tmdb_key).json()
    try:
        response_tmdb_id.append(movie_data['id']) 
    except:
        error_count_TMDB += 1    
        

#### Save results as a CSV

In [74]:
file_outpath = f"Resources/TMDB_pull_1_Netflix_error_count{error_count_TMDB}.csv"

TMDB_df = pd.DataFrame(response_tmdb_id,columns=['ID'],dtype=object)
TMDB_df.to_csv(file_outpath)

# Now take ID #s and run through TMDB

In [79]:
#Make TMDB dataframe
TMDB_netflix_df = TMDB_df.copy()
TMDB_netflix_df.head(2)

Unnamed: 0,ID
0,283587
1,347969


In [80]:
url_tmdb_movie = "https://api.themoviedb.org/3/movie/"

# Make columns to import info into
TMDB_netflix_df['imdb_id'] = " "
TMDB_netflix_df['release_date'] = " "
TMDB_netflix_df['budget'] = " "
TMDB_netflix_df['revenue'] = " "
TMDB_netflix_df['genres'] = " "
TMDB_netflix_df['original_language'] = " "
TMDB_netflix_df['original_title'] = " "
TMDB_netflix_df['origin_country'] = " "
TMDB_netflix_df['production_countries name'] = " "
TMDB_netflix_df['spoken_languages name'] = " "

TMDB_netflix_df.head(2)

Unnamed: 0,ID,imdb_id,release_date,budget,revenue,genres,original_language,original_title,origin_country,production_countries name,spoken_languages name
0,283587,,,,,,,,,,
1,347969,,,,,,,,,,


In [81]:
error_count_info = 0

for index, row in TMDB_df.iterrows(): 
    movie_data = requests.get(url_tmdb_movie + str(TMDB_df.ID[index]) + "?api_key=" + tmdb_key).json()
    try:
        TMDB_netflix_df.loc[index, 'imdb_id'] = movie_data['imdb_id']
        TMDB_netflix_df.loc[index, 'release_date'] = movie_data['release_date']
        TMDB_netflix_df.loc[index, 'budget'] = movie_data['budget']
        TMDB_netflix_df.loc[index, 'revenue'] = movie_data['revenue']
        TMDB_netflix_df.loc[index, 'original_language'] = movie_data['spoken_languages'][0]['name']
        TMDB_netflix_df.loc[index, 'original_title'] = movie_data['original_title']
        TMDB_netflix_df.loc[index, 'origin_country'] = movie_data['production_countries'][0]['iso_3166_1']
        TMDB_netflix_df.loc[index, 'production_countries name'] = movie_data['production_countries'][0]['name']
        TMDB_netflix_df.loc[index, 'spoken_languages name'] = movie_data['spoken_languages'][0]['name']
        TMDB_netflix_df.loc[index, 'genres'] = movie_data['genres'][0]['name']    
    except (IndexError, KeyError):
        error_count_info +=1

#### Save results as a CSV

In [82]:
file_outpath_2 = f"Resources/TMDB_pull_2_Netflix_error_count{error_count_info}.csv"

TMDB_netflix_df.to_csv(file_outpath_2)

# CLEANING if budget = 0, revenue = 0, IMDB_id not found
* This is to help keep the file size down by dropping rows we cannot use or cannot match up

In [83]:
movie_info_pulled_df = TMDB_netflix_df.copy()
movie_info_pulled_df.head(2)

Unnamed: 0,ID,imdb_id,release_date,budget,revenue,genres,original_language,original_title,origin_country,production_countries name,spoken_languages name
0,283587,tt1365050,2015-09-11,6000000,9077700,Drama,English,Beasts of No Nation,GH,Ghana,English
1,347969,tt2479478,2015-12-11,60000000,0,Comedy,English,The Ridiculous 6,US,United States of America,English


In [84]:
movie_info_pulled_df = movie_info_pulled_df[movie_info_pulled_df.budget != 0]
movie_info_pulled_df = movie_info_pulled_df[movie_info_pulled_df.revenue != 0]
movie_info_pulled_df = movie_info_pulled_df.dropna(subset=['imdb_id'])

final_number = movie_info_pulled_df.imdb_id.count()

#### Save results as a CSV

In [85]:
total_errors = beginning_number - final_number

file_outpath_FINAL = f"Resources/TMDB_pull_FINAL_Netflix_dropped_movies_{total_errors}.csv"
movie_info_pulled_df.to_csv(file_outpath_FINAL)