### Import Dependencies

In [1]:
# Import the requests library and API key
import pandas as pd
import requests
from config import tmdb_api_key

### Create starting URLs for the APIs

In [2]:
# Starting URL for TMDB API discover method including the API key from config.py. Using the language (English),
# region (US), release date start (01-01-2000) and end (12-31-2022), sort by (release date descending) parameters.
# Not included are the "page" and "certification" parameters which will be included in the function later below.
url = "https://api.themoviedb.org/3/discover/movie?api_key=" + tmdb_api_key + "&language=en-US&region=US&sort_by=primary_release_date.desc&certification_country=US&include_adult=false&include_video=false&primary_release_date.gte=2000-01-01&primary_release_date.lte=2022-03-01&with_original_language=en"

In [3]:
# Create URL for movies tagged with keywords "LGBT, gay, lesbian, trans, transgender, and gay teen."
lgbt_url = url + "&with_keywords=158718%7C264384%7C264386%7C273637%7C290527%7C163037"

### Create Functions to Gather Info from APIs

In [4]:
# Create a function that can take different URLs that you input to iterate through multiple pages of 
# the API call and to constrain the search to particular certification ratings. Specifying the params 
# for params with a dictionary containing the key/value pair for "page" and "certification".

def get_movies(url_name, page_num, rating):
    movies = requests.get(
        url_name,
        params={
            "page": page_num,
            "certification": rating}
    )
    # Create a variable to hold the JSON text of the 'get' request
    movies_data = movies.json()
    # Create a Pandas Dataframe for 'results' from the JSON dictionary
    return pd.DataFrame(movies_data['results'])

In [5]:
# Create a function that adds ratings into each individual rating dataframe.
def isolate_ratings(rating_df, rating):
    rating_df['rating'] = rating
    rating_df = rating_df.loc[:, ('id', 'rating')]
    return rating_df

In [6]:
# Create a function to get the additional movie information for every individual movie ID in the movies dataframe.

def get_info(movie_df):
    def gather_info(movie_url,id_list):
    #     Create a function to input each individual movie ID into the movie_id_url.
        def get_id_url(id_index):
            movie_id_url = "https://api.themoviedb.org/3/movie/" + str(id_list[id_index]) + "?api_key=" + tmdb_api_key
            return movie_id_url
    #     Create a list of movie URLs based on each movie's ID.
        movie_id_urls = [get_id_url(n) for n in range(len(id_list))]
        info_request = requests.get(movie_id_urls[movie_url])
        info_data = info_request.json()
        return pd.json_normalize(info_data)
    
    # Create a list of just the movie IDs so it can be called upon.
    id_list = movie_df['id'].tolist()
    
    # Get each movie's additional info by calling the function where n is the index of the ids list
    # and lgbts_ids is each movie's individual id.
    info = [gather_info(n, id_list) for n in range(len(id_list))]

    # Combine each movie's additional info dataframe into one.
    info_df = pd.concat(info)
    return info_df.reset_index().drop(columns='index')

In [7]:
# Create a funtion to combine all previous API dataframes into one. This includes the ratings info, the movie's info
# and the genre names based on genre IDs.
def combine_clean_data(main_df, rating_df_list, info_df):
    
    # Create a nondestructive copy of the main_df
    movie_df = main_df.set_index('id')
    
    # Ensure that there are no "\r" substrings in the string of the "overview" column. This avoids formatting
    # issues when the dataframe is exported to a CSV
    movie_df['overview'] = movie_df['overview'].str.replace("\r", "")
    
    # Join the individual ratings dataframes from the "isolate_ratings" function into a single dataframe.
    ratings_df = pd.concat(rating_df_list).set_index('id')

    # Join the ratings dataframe into the movies dataframe to add the certification rating to the movies dataframe.
    movie_df = movie_df.join(ratings_df, on='id', how='left').reset_index()
    # Drop duplicates of any films. We keep the 'last' duplicate in order to ensure the film has a rating vs NR.
    movie_df = movie_df.drop_duplicates(subset=['id'], keep='last').reset_index().drop(columns='index')
    
    # Swap null values in ratings column for "NR-Manual" to indicate that these films did not have any certification
    # rating label -- not even a proper "NR".
    movie_df['rating'].fillna('NR-Manual', inplace=True)
    
    # Add genre names
    # Create genre_url variable.
    genre_url = "https://api.themoviedb.org/3/genre/movie/list?api_key=" + tmdb_api_key + "&language=en-US"
    
    # Request the genre JSON info into a list.
    genre_request = requests.get(genre_url)
    genre_data = genre_request.json()
    genre_info = genre_data['genres']
    
    # Transform the list of dictionaries of genre info into a individual lists of the genre IDs and genre names.
    ids = [i['id'] for i in genre_info]
    names = [n['name'] for n in genre_info]
    
    # Duplicate id and genre_ids column from the main movie_df into new dataframe.
    genres_df = pd.DataFrame()
    genres_df[['id', 'genres']] = movie_df[['id','genre_ids']]
    
    # Find the list that has the max length of the genres column in order to create the correct number of
    # columns in sep_genres_df.
    length_of_genres = pd.DataFrame(genres_df['genres'].str.len())
    len_genres = length_of_genres['genres'].to_list()
    len_genres = max(len_genres)
    
    # Create a nondestructive dataframe to separate the lists of genres in the genre column to hold one individual 
    # genre per column.
    sep_genres_df = pd.DataFrame(
        genres_df['genres'].to_list(), columns=[n for n in range(len_genres)])
    
    # Replace all genre integer values for "genre_ids" to their associated string in genres_df.
    sep_genres_df = sep_genres_df.replace(ids, names)
    # Create a new column that puts all genre names in every column of the dataframe into a list per movie.
    sep_genres_df['genres'] = sep_genres_df.values.tolist()
    
    # Remove the null values from the list.
    sep_genres_df['genres'] = sep_genres_df.genres.apply(lambda x: [name for name in x if not pd.isnull(name)])
    
    # Drop individual genre columns.
    sep_genres_df = sep_genres_df[['genres']]
    
    # Take the movie IDs from the genre_df and copy them into the nondestructive dataframe sep_genre_df.
    sep_genres_df['id'] = genres_df['id']
    # Set the index for sep_genre_df to the movie's id.
    sep_genres_df = sep_genres_df.set_index('id')
    
    # Join genres column from sep_genres_df into the main movie_df
    movie_df = movie_df.join(sep_genres_df, on='id', how='left')
    
    # Create a new dataframe that takes only the necessary information from the info_df gathered using 
    # the "get_info" function.
    add_info_df = info_df[['id', 'budget', 'imdb_id', 'revenue', 'runtime', 'status']].set_index('id')
    # Join the additional info to the main movie_df.
    movie_df = movie_df.join(add_info_df, on='id', how='left')
    
    # Convert release_date column to datetime
    movie_df['release_date'] = pd.to_datetime(movie_df['release_date'])

    # Sort movies_df by release date.
    movie_df = movie_df.sort_values(by='release_date',ascending=False).reset_index().drop(columns='index')
    
    # Reorder columns.
    new_columns = [
        'id', 'original_title', 'imdb_id', 'genre_ids', 'genres', 'rating', 'overview', 'popularity',
        'release_date','budget', 'revenue', 'runtime', 'status', 'title']
    return movie_df[new_columns]
    

# Part 1: LGBT Movies

### Create Dataframes of Only LGBT Movies

In [8]:
# The number of pages you want. (Note: the max number of pages for the ratings comes from the total amount of
# pages that the API call will iterate through to get all movies regardless of their certification rating).
pages_requested = 67

# Get all pages for each US rating as dataframes by calling the function "get_movies()" 
# where "page_num" is variable n + 1. The list comprehension will iterate through the range 
# of "pages_requested" which is 12, adding 1 to each iteration to make the count 1-12. The get_ratings function
# uses the second variable to specify the rating needed in the URL parameter.
pages_lgbt = [get_movies(lgbt_url, n + 1, "") for n in range(pages_requested)]
pages_nr_lgbt = [get_movies(lgbt_url, n + 1, "NR") for n in range(pages_requested)]
pages_g_lgbt = [get_movies(lgbt_url, n + 1, "G") for n in range(pages_requested)]
pages_pg_lgbt = [get_movies(lgbt_url, n + 1, "PG") for n in range(pages_requested)]
pages_pg13_lgbt = [get_movies(lgbt_url, n + 1, "PG-13") for n in range(pages_requested)]
pages_r_lgbt = [get_movies(lgbt_url, n + 1, "R") for n in range(pages_requested)]
pages_nc17_lgbt = [get_movies(lgbt_url, n + 1, "NC-17") for n in range(pages_requested)]

# Combine the pages for each dataframe to single dataframe using the concat function (aka concatenate).
lgbt_movies_df = pd.concat(pages_lgbt)
lgbt_nr_df = pd.concat(pages_nr_lgbt)
lgbt_g_df = pd.concat(pages_g_lgbt) 
lgbt_pg_df = pd.concat(pages_pg_lgbt)
lgbt_pg13_df = pd.concat(pages_pg13_lgbt)
lgbt_r_df = pd.concat(pages_r_lgbt)
lgbt_nc17_df = pd.concat(pages_nc17_lgbt)

In [9]:
# Verify the length of each dataframe and that it was created correctly.
# (Note: there are no films in the G rating dataframe, meaning there are no LGBT films with a G rating.
# Therefore, g_lgbt_df will not be included in subsequent data wrangling).
print(len(lgbt_movies_df))
print(len(lgbt_nr_df))
print(len(lgbt_g_df))
print(len(lgbt_pg_df))
print(len(lgbt_pg13_df))
print(len(lgbt_r_df))
print(len(lgbt_nc17_df))
lgbt_movies_df.head()

1302
227
0
12
47
109
3


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,,[18],793992,en,Three Months,"A darkly comedic, coming-of-age film, that tel...",4.802,/AoMOVVkuVctRLyjfzjrEnPploxp.jpg,2022-02-23,Three Months,False,0.0,0
1,False,/iB1Ghglr4kiFCJiwC4yxCvWaUJ6.jpg,[10749],929477,en,Heart Shot,Teenagers Nikki and Sam are in love and planni...,5.602,/ubaCN0FjWhxiu1uHhI9oXSgalVK.jpg,2022-02-17,Heart Shot,False,0.0,0
2,False,/qQFvUg92Uh1jRXCNgqo9LduBDFD.jpg,[18],920345,en,EMPATHY (Or: The Girl with The Pearled Hair),After running away from her emotionally abusiv...,1.77,/59C1l5dqtKuKuKy5qlE23PV4yC6.jpg,2022-01-25,EMPATHY (Or: The Girl with The Pearled Hair),False,0.0,0
3,False,/rzu9BVI9HqY5rG0Xq05ZhewdkYP.jpg,"[18, 35]",641934,en,Am I Ok?,Lucy and Jane have been best friends for most ...,5.42,,2022-01-24,Am I Ok?,False,7.8,2
4,False,/yADB6tXh5g2kXK111rNa8eYLgEq.jpg,[18],914281,en,Starfuckers,An intimate evening between a film director an...,1.515,/m9wG2LfRVogDQ9xcUuY32SHIGxi.jpg,2022-01-22,Starfuckers,False,0.0,0


### Create the Isolated LGBT Ratings Dataframes

In [10]:
lgbt_nr_df = isolate_ratings(lgbt_nr_df, "NR")
lgbt_pg_df = isolate_ratings(lgbt_pg_df, "PG")
lgbt_pg13_df = isolate_ratings(lgbt_pg13_df, "PG-13")
lgbt_r_df = isolate_ratings(lgbt_r_df, "R")
lgbt_nc17_df = isolate_ratings(lgbt_nc17_df, "NC-17")

In [11]:
# Verify with one dataframe that it was created correctly.
lgbt_r_df.head()

Unnamed: 0,id,rating
0,915164,R
1,857731,R
2,552269,R
3,591273,R
4,634544,R


In [12]:
# Create a list of the individual ratings dataframes that contain values.
ratings_df_list = [lgbt_nr_df, lgbt_pg_df, lgbt_pg13_df, lgbt_r_df, lgbt_nc17_df]

### Create the Additional Info for LGBT Movies Dataframe

In [13]:
# Create the dataframe.
lgbt_info_df = get_info(lgbt_movies_df)

In [14]:
# Verify the dataframe was created correctly.
lgbt_info_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,,,0,"[{'id': 18, 'name': 'Drama'}]",,793992,tt5322004,en,Three Months,...,Post Production,It's amazing what you can discover when life g...,Three Months,False,0.0,0,,,,
1,False,/iB1Ghglr4kiFCJiwC4yxCvWaUJ6.jpg,,0,"[{'id': 10749, 'name': 'Romance'}]",https://www.netflix.com/title/81343173,929477,tt17162546,en,Heart Shot,...,Post Production,,Heart Shot,False,0.0,0,,,,
2,False,/qQFvUg92Uh1jRXCNgqo9LduBDFD.jpg,,20000,"[{'id': 18, 'name': 'Drama'}]",,920345,,en,EMPATHY (Or: The Girl with The Pearled Hair),...,Released,work in symmetry,EMPATHY (Or: The Girl with The Pearled Hair),False,0.0,0,,,,
3,False,/rzu9BVI9HqY5rG0Xq05ZhewdkYP.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,641934,tt11225626,en,Am I Ok?,...,Released,,Am I Ok?,False,7.8,3,,,,
4,False,/yADB6tXh5g2kXK111rNa8eYLgEq.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,914281,tt16409584,en,Starfuckers,...,Released,,Starfuckers,False,0.0,0,,,,


### Create the Cleaned and Combined LGBT Movies Dataframe

In [15]:
lgbt_movies_df = combine_clean_data(lgbt_movies_df, ratings_df_list, lgbt_info_df)

In [16]:
# Check to make the function ran and the dataframes joined correctly. (Calling a specific movie to verify)
print(len(lgbt_movies_df))
lgbt_movies_df.loc[lgbt_movies_df['title'] == "Rent"]

1302


Unnamed: 0,id,original_title,imdb_id,genre_ids,genres,rating,overview,popularity,release_date,budget,revenue,runtime,status,title
261,557648,Rent,tt6881910,"[10402, 10770, 18]","[Music, TV Movie, Drama]",NR-Manual,The story of several friends in New York City ...,7.861,2019-01-27,0,0,135,Released,Rent
1122,1833,Rent,tt0294870,"[18, 10749]","[Drama, Romance]",PG-13,This rock opera tells the story of one year in...,15.092,2005-11-17,40000000,31670620,135,Released,Rent


In [17]:
# Check another section of the dataframe.
lgbt_movies_df.loc[724:730]

Unnamed: 0,id,original_title,imdb_id,genre_ids,genres,rating,overview,popularity,release_date,budget,revenue,runtime,status,title
724,230058,Tumbledown,tt2732210,"[18, 9648, 10749]","[Drama, Mystery, Romance]",NR-Manual,"Inspired by true events, Todd Verow’s Tumbledo...",2.16,2013-10-25,0,0,80,Released,Tumbledown
725,712751,Any Given Tuesday,tt3621396,[18],[Drama],NR-Manual,Any Given Tuesday is an awareness short film a...,1.345,2013-10-17,0,0,7,Released,Any Given Tuesday
726,157370,Kill Your Darlings,tt1311071,"[18, 10749, 53]","[Drama, Romance, Thriller]",R,A murder in 1944 draws together the great poet...,14.396,2013-10-16,1030064,0,104,Released,Kill Your Darlings
727,270698,I'm a Porn Star,tt3076982,"[99, 35]","[Documentary, Comedy]",NR-Manual,I'm a Porn Star follows the lives of guys in t...,19.205,2013-10-04,0,0,81,Released,I'm a Porn Star
728,254924,Lost Angel,tt2796584,"[35, 18]","[Comedy, Drama]",NR-Manual,When a young Dutch tourist gets stranded at th...,0.715,2013-10-01,0,0,17,Released,Lost Angel
729,223093,Dead Woman's Hollow,tt1934229,"[27, 53, 80]","[Horror, Thriller, Crime]",NR-Manual,,1.791,2013-09-24,0,0,105,Released,Dead Woman's Hollow
730,211462,Ambrosia,tt1918669,[18],[Drama],NR-Manual,Ambrosia is the story of an Iranian-Canadian c...,3.64,2013-09-23,0,0,79,Released,Ambrosia


### Create the lgbt_movies CSV

In [18]:
# Export the movies_df into a CSV file.
lgbt_movies_df.to_csv("../CSVs/lgbt_movies.csv")

# Part 2: All Movies (2000-2022)

In [20]:
# Create a second URL that will allow the NR API to call through the remaining 56 pages of the call.
# Note: "NR" films are the only films that have more than 500 pages in the API call and the max number of pages the 
# call will return is 500 pages.

url2 = "https://api.themoviedb.org/3/discover/movie?api_key=" + tmdb_api_key + "&language=en-US&region=US&sort_by=primary_release_date.desc&certification_country=US&include_adult=false&include_video=false&primary_release_date.gte=2000-01-01&primary_release_date.lte=2004-09-21&with_original_language=en"

### Create Dataframes of All Movies

In [21]:
# The number of pages you want. (Note: the API can only parse through pages 500 at a time; the total amount of
# pages that the API call will iterate through to get all movies regardless of their certification rating).
pages_requested = 500

# Get all pages for each US rating as dataframes by calling the function "get_movies()" 
# where "page_num" is variable n + 1. The list comprehension will iterate through the range 
# of "pages_requested" which is 12, adding 1 to each iteration to make the count 1-12. The get_ratings function
# uses the second variable to specify the rating needed in the URL parameter.

# Note: there is no initial call for "all movies" as there are countless films that are counted as "movies", such 
# as music videos, that would be included in this API call. These films will not have received any sort of rating, 
# not even a true "NR" label. As such, the movies called will exclusively be movies that have a rating. They will be
# combined into a single dataframe below.

pages_nr1 = [get_movies(url, n + 1, "NR") for n in range(pages_requested)]
pages_nr2 = [get_movies(url2, n + 1, "NR") for n in range(pages_requested)]
pages_g = [get_movies(url, n + 1, "G") for n in range(pages_requested)]
pages_pg = [get_movies(url, n + 1, "PG") for n in range(pages_requested)]
pages_pg13 = [get_movies(url, n + 1, "PG-13") for n in range(pages_requested)]
pages_r = [get_movies(url, n + 1, "R") for n in range(pages_requested)]
pages_nc17 = [get_movies(url, n + 1, "NC-17") for n in range(pages_requested)]

# Combine the pages for each dataframe to single dataframe using the concat function (aka concatenate).
# Note: All individual ratings dataframes will be concatenated into a single dataframe titled "movies_df"
# Note: The "NR" pages dataframes must be concatenated into their own dataframes per 500 pages first
# and then into a new dataframe that contains all films that are not rated.
nr1_df = pd.concat(pages_nr1)
nr2_df = pd.concat(pages_nr2)
nr_df = pd.concat([nr1_df, nr2_df])

g_df = pd.concat(pages_g) 
pg_df = pd.concat(pages_pg)
pg13_df = pd.concat(pages_pg13)
r_df = pd.concat(pages_r)
nc17_df = pd.concat(pages_nc17)

In [22]:
# Concat all individual ratings dataframes into a single dataframe titled "movies_df"
movies_df = pd.concat([nr_df, g_df, pg_df, pg13_df, r_df, nc17_df], ignore_index=True)

In [23]:
# Verify the length of each dataframe and that it was created correctly.
print(len(movies_df))
print(len(nr_df))
print(len(g_df))
print(len(pg_df))
print(len(pg13_df))
print(len(r_df))
print(len(nc17_df))
movies_df.tail()

28640
11122
2509
3068
4382
7351
208


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
28635,False,,"[28, 53]",67932,en,Doomsdayer,"While seconds tick away, the faith of the worl...",2.276,/u2g68hWL8s7a02aVrDFjUPNBQ83.jpg,2001-01-01,Doomsdayer,False,4.5,7
28636,False,,"[35, 27, 9648, 28]",892259,en,Hunks on Haunted Hill,Nine young hardbodies are trapped together for...,1.208,/qBQ245aX4i7rEqw7sL1hgKUS7Ld.jpg,2000-10-31,Hunks on Haunted Hill,False,10.0,1
28637,False,,[9648],433111,en,Sex Files: Ancient Desires,A tomb raider reawakens an ancient female mumm...,1.828,,2000-10-14,Sex Files: Ancient Desires,False,4.0,2
28638,False,,"[35, 53]",9892,en,Stranger than Fiction,Lives spiral out of control when four friends ...,2.237,,2000-10-17,Stranger than Fiction,False,5.7,11
28639,False,,[53],592696,en,"Sex, Secrets & Betrayals",Maggie takes matters into her hands when her b...,1.4,/KfXqBOT0OG7cs0hCY7QrinZwLQ.jpg,2000-01-01,"Sex, Secrets & Betrayals",False,1.0,1


In [24]:
# Check for duplicates in the movies_df.
len(movies_df[movies_df.id.duplicated()])

71

In [25]:
# Drop duplicated movies that happened during the concatenation.
movies_df = movies_df.drop_duplicates(subset=['id'], keep='last').reset_index().drop(columns='index')

In [26]:
print(len(movies_df))
movies_df.tail()

28569


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
28564,False,,"[28, 53]",67932,en,Doomsdayer,"While seconds tick away, the faith of the worl...",2.276,/u2g68hWL8s7a02aVrDFjUPNBQ83.jpg,2001-01-01,Doomsdayer,False,4.5,7
28565,False,,"[35, 27, 9648, 28]",892259,en,Hunks on Haunted Hill,Nine young hardbodies are trapped together for...,1.208,/qBQ245aX4i7rEqw7sL1hgKUS7Ld.jpg,2000-10-31,Hunks on Haunted Hill,False,10.0,1
28566,False,,[9648],433111,en,Sex Files: Ancient Desires,A tomb raider reawakens an ancient female mumm...,1.828,,2000-10-14,Sex Files: Ancient Desires,False,4.0,2
28567,False,,"[35, 53]",9892,en,Stranger than Fiction,Lives spiral out of control when four friends ...,2.237,,2000-10-17,Stranger than Fiction,False,5.7,11
28568,False,,[53],592696,en,"Sex, Secrets & Betrayals",Maggie takes matters into her hands when her b...,1.4,/KfXqBOT0OG7cs0hCY7QrinZwLQ.jpg,2000-01-01,"Sex, Secrets & Betrayals",False,1.0,1


### Create the Isolated Ratings Dataframes

In [27]:
nr_df = isolate_ratings(nr_df, "NR")
g_df = isolate_ratings(g_df, "G")
pg_df = isolate_ratings(pg_df, "PG")
pg13_df = isolate_ratings(pg13_df, "PG-13")
r_df = isolate_ratings(r_df, "R")
nc17_df = isolate_ratings(nc17_df, "NC-17")

In [28]:
# Verify with one dataframe that it was created correctly.
r_df.head()

Unnamed: 0,id,rating
0,916821,R
1,895744,R
2,853408,R
3,845404,R
4,833425,R


In [29]:
# Create a list of the individual ratings dataframes that contain values.
ratings_df_list = [nr_df, g_df, pg_df, pg13_df, r_df, nc17_df]

### Create the Additional Info Dataframe

In [31]:
# Create the dataframe.
# Note: In order to not have the call timeout I split it into several calls becuse of how large the data set is.
info_df1 = get_info(movies_df.loc[0:2500])
info_df2 = get_info(movies_df.loc[2501:5000])
info_df3 = get_info(movies_df.loc[5001:7500])
info_df4 = get_info(movies_df.loc[7501:10000])

In [34]:
info_df5 = get_info(movies_df.loc[10001:12500])
info_df6 = get_info(movies_df.loc[12501:15000])
info_df7 = get_info(movies_df.loc[15001:17500])
info_df8 = get_info(movies_df.loc[17501:20000])

In [36]:
info_df9 = get_info(movies_df.loc[20001:22500])
info_df10 = get_info(movies_df.loc[22501:25000])
info_df11 = get_info(movies_df.loc[25001:27500])
info_df12 = get_info(movies_df.loc[27501:28569])

In [37]:
info_df = pd.concat(
    [info_df1, info_df2, info_df3, info_df4,info_df5,
    info_df6, info_df7, info_df8, info_df9, info_df10, info_df11, info_df12],
    ignore_index=True
)

In [38]:
# Verify the dataframe was created correctly.
info_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,success,status_code,status_message
0,False,,,0.0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",,925899.0,,en,"Oh, Detective!",...,False,0.0,0.0,,,,,,,
1,False,,,0.0,[],,933726.0,tt14972090,en,Weredeer,...,False,0.0,0.0,,,,,,,
2,False,/sYmXL2iSUbQLzRcFgsePOYgpFyg.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,865370.0,,en,The Last Mark,...,False,0.0,0.0,,,,,,,
3,False,/h4NnYzsl4MenvhOfm3DHD495ruX.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,624798.0,tt10308878,en,Big Gold Brick,...,False,0.0,0.0,,,,,,,
4,False,/wVTtQ2ZcfsNwDmTlT4GrrDRpS8b.jpg,,0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 10749...",https://lotawanamovie.com,879531.0,tt4664346,en,Lotawana,...,False,0.0,0.0,,,,,,,


### Create the Cleaned and Combined Movies Dataframe

In [39]:
movies_df = combine_clean_data(movies_df, ratings_df_list, info_df)

In [40]:
# Check to make the function ran and the dataframes joined correctly. (Calling a specific movie to verify)
print(len(movies_df))
movies_df.loc[movies_df['title'] == "Rent"]

28569


Unnamed: 0,id,original_title,imdb_id,genre_ids,genres,rating,overview,popularity,release_date,budget,revenue,runtime,status,title
23554,1833,Rent,tt0294870,"[18, 10749]","[Drama, Romance]",PG-13,This rock opera tells the story of one year in...,15.092,2005-11-17,40000000.0,31670620.0,135,Released,Rent


In [41]:
# Check another section of the dataframe.
movies_df.loc[12560:12565]

Unnamed: 0,id,original_title,imdb_id,genre_ids,genres,rating,overview,popularity,release_date,budget,revenue,runtime,status,title
12560,187596,Walk of Shame,tt2463288,[35],[Comedy],R,A reporter's dream of becoming a news anchor i...,26.595,2014-05-02,0.0,8122990.0,95,Released,Walk of Shame
12561,177047,Decoding Annie Parker,tt1464191,[18],[Drama],R,The lives of a breast-cancer patient and a res...,5.76,2014-05-02,2000000.0,48390.0,91,Released,Decoding Annie Parker
12562,550952,Having Fun Up There,tt3457798,"[35, 18]","[Comedy, Drama]",NR,"Having Fun Up There follows Mark, a 37 year ol...",1.497,2014-05-02,0.0,0.0,66,Released,Having Fun Up There
12563,102382,The Amazing Spider-Man 2,tt1872181,"[28, 12, 14]","[Action, Adventure, Fantasy]",PG-13,"For Peter Parker, life is busy. Between taking...",342.284,2014-05-02,200000000.0,708962323.0,142,Released,The Amazing Spider-Man 2
12564,248688,A Night in Old Mexico,tt2308260,[18],[Drama],NR,"Forced to give up his land and home, Texas ran...",6.095,2014-05-01,2000000.0,0.0,103,Released,A Night in Old Mexico
12565,285264,Steve Byrne: Champion,tt3526542,[35],[Comedy],NR,"Steve Byrne, the star of Sullivan &amp; Son, b...",2.602,2014-05-01,0.0,0.0,69,Released,Steve Byrne: Champion


### Create the movies CSV

In [43]:
# Export the movies_df into a CSV file.
movies_df.to_csv("../CSVs/movies.csv")