### Import Dependencies

In [1]:
# Import the requests library and API key
import pandas as pd
import requests
from config import tmdb_api_key

### Create starting URLs for the APIs

In [2]:
# Starting URL for TMDB API discover method including the API key from config.py. Using the language (English),
# region (US), release date start (01-01-2000) and end (12-31-2022), sort by (release date descending) parameters.
# Not included are the "page" and "certification" parameters which will be included in the function later below.
url = "https://api.themoviedb.org/3/discover/movie?api_key=" + tmdb_api_key + "&language=en-US&region=US&sort_by=primary_release_date.desc&certification_country=US&include_adult=false&include_video=false&primary_release_date.gte=2000-01-01&primary_release_date.lte=2022-03-01&with_original_language=en"

In [3]:
# Create URL for movies tagged with keywords "LGBT, gay, lesbian, trans, transgender, and gay teen."
lgbt_url = url + "&with_keywords=158718%7C264384%7C264386%7C273637%7C290527%7C163037"

### Create Functions to Gather Info from APIs

In [4]:
# Create a function that can take different URLs that you input to iterate through multiple pages of 
# the API call and to constrain the search to particular certification ratings. Specifying the params 
# for params with a dictionary containing the key/value pair for "page" and "certification".

def get_movies(url_name, page_num, rating):
    movies = requests.get(
        url_name,
        params={
            "page": page_num,
            "certification": rating}
    )
    # Create a variable to hold the JSON text of the 'get' request
    movies_data = movies.json()
    # Create a Pandas Dataframe for 'results' from the JSON dictionary
    return pd.DataFrame(movies_data['results'])

In [5]:
# Create a function that adds ratings into each individual rating dataframe.
def isolate_ratings(rating_df, rating):
    rating_df['rating'] = rating
    rating_df = rating_df.loc[:, ('id', 'rating')]
    return rating_df

In [6]:
# Create a function to get the additional movie information for every individual movie ID in the movies dataframe.

def get_info(movie_df):
    def gather_info(movie_url,id_list):
    #     Create a function to input each individual movie ID into the movie_id_url.
        def get_id_url(id_index):
            movie_id_url = "https://api.themoviedb.org/3/movie/" + str(id_list[id_index]) + "?api_key=" + tmdb_api_key
            return movie_id_url
    #     Create a list of movie URLs based on each movie's ID.
        movie_id_urls = [get_id_url(n) for n in range(len(id_list))]
        info_request = requests.get(movie_id_urls[movie_url])
        info_data = info_request.json()
        return pd.json_normalize(info_data)
    
    # Create a list of just the movie IDs so it can be called upon.
    id_list = movie_df['id'].tolist()
    
    # Get each movie's additional info by calling the function where n is the index of the ids list
    # and lgbts_ids is each movie's individual id.
    info = [gather_info(n, id_list) for n in range(len(id_list))]

    # Combine each movie's additional info dataframe into one.
    info_df = pd.concat(info)
    
    return info_df.reset_index().drop(columns='index')

In [34]:
print("https://api.themoviedb.org/3/genre/movie/list?api_key=" + tmdb_api_key + "&language=en-US")

https://api.themoviedb.org/3/genre/movie/list?api_key=203d32539797d91b7d9372a88e7b93bc&language=en-US


In [21]:
# Get genre inforomation
def get_genre(main_df):
    # Add genre names
    # Create genre_url variable.
    genre_url = "https://api.themoviedb.org/3/genre/movie/list?api_key=" + tmdb_api_key + "&language=en-US"
    
    # Request the genre JSON info into a list.
    genre_request = requests.get(genre_url)
    genre_data = genre_request.json()
    genre_info = genre_data['genres']
    
    # Transform the list of dictionaries of genre info into a individual lists of the genre IDs and genre names.
    ids = [i['id'] for i in genre_info]
    names = [n['name'] for n in genre_info]
    
    # Duplicate id and genre_ids column from the main movie_df into new dataframe.
    genres_df = pd.DataFrame()
    genres_df[['id', 'genres']] = main_df[['id','genre_ids']]
    
    # Find the list that has the max length of the genres column in order to create the correct number of
    # columns in sep_genres_df.
    length_of_genres = pd.DataFrame(genres_df['genres'].str.len())
    len_genres = length_of_genres['genres'].to_list()
    len_genres = max(len_genres)
    
    # Create a nondestructive dataframe to separate the lists of genres in the genre column to hold one individual 
    # genre per column.
    sep_genres_df = pd.DataFrame(
        genres_df['genres'].to_list(), columns=[n for n in range(len_genres)])
    
    # Replace all genre integer values for "genre_ids" to their associated string in genres_df.
    sep_genres_df = sep_genres_df.replace(ids, names)
    # Create a new column that puts all genre names in every column of the dataframe into a list per movie.
    sep_genres_df['genres'] = sep_genres_df.values.tolist()
    
    # Remove the null values from the list.
    sep_genres_df['genres'] = sep_genres_df.genres.apply(lambda x: [name for name in x if not pd.isnull(name)])
    
#     # Drop individual genre columns.
#     genre_names = genre_names[['genres']]
    
#     # Add genre ids back to the sep_genres_df
#     genre_names['genre_ids'] = genres_df['genres']
    
#     # Create another dataframe to hold the separated IDs so to add the list of names and have separate genre IDs
#     sep_genres_df = pd.DataFrame(
#         genres_df['genres'].to_list(), columns=[n for n in range(len_genres)])
    
    # Take the movie IDs from the genre_df and copy them into the nondestructive dataframe sep_genre_df.
#     sep_genres_df['id'] = genres_df['id']
#     # Set the index for sep_genre_df to the movie's id.
#     sep_genres_df = sep_genres_df.set_index('id')
    
    return sep_genres_df, genres_df
    
    # Join genres column from sep_genres_df into the main movie_df
#     movie_df = movie_df.join(sep_genres_df, on='id', how='left')

In [27]:
# Create a funtion to combine all previous API dataframes into one. This includes the ratings info, the movie's info
# and the genre names based on genre IDs.
def combine_clean_data(main_df, rating_df_list, info_df):
    
    # Create a nondestructive copy of the main_df
    movie_df = main_df.set_index('id').drop(columns='overview')
    
    # Join the individual ratings dataframes from the "isolate_ratings" function into a single dataframe.
    ratings_df = pd.concat(rating_df_list).set_index('id')

    # Join the ratings dataframe into the movies dataframe to add the certification rating to the movies dataframe.
    movie_df = movie_df.join(ratings_df, on='id', how='left').reset_index()
    # Drop duplicates of any films. We keep the 'last' duplicate in order to ensure the film has a rating vs NR.
    movie_df = movie_df.drop_duplicates(subset=['id'], keep='last').reset_index().drop(columns='index')
    
    # Swap null values in ratings column for "NR-Manual" to indicate that these films did not have any certification
    # rating label -- not even a proper "NR".
    movie_df['rating'].fillna('NR-Manual', inplace=True)
    
    # Create a new dataframe that takes only the necessary information from the info_df gathered using 
    # the "get_info" function.
    add_info_df = info_df[['id', 'budget', 'imdb_id', 'revenue', 'runtime', 'status']].set_index('id')
    
    # Convert budget and revenue columns into int
    add_info_df['budget'] = add_info_df['budget'].astype(int)
    add_info_df['revenue']= add_info_df['revenue'].astype(int)
    
    # Join the additional info to the main movie_df.
    movie_df = movie_df.join(add_info_df, on='id', how='left')
    
    # Convert release_date column to datetime
    movie_df['release_date'] = pd.to_datetime(movie_df['release_date'])
    # Replace N/A values in budget and revenue with 0
    movie_df['budget'] = movie_df['budget'].fillna(0)
    movie_df['revenue']= movie_df['revenue'].fillna(0)
    # Convert budget and revenue columns into int
    movie_df['budget'] = movie_df['budget'].astype(int)
    movie_df['revenue']= movie_df['revenue'].astype(int)

    # Sort movies_df by release date.
    movie_df = movie_df.sort_values(by='release_date',ascending=False).reset_index().drop(columns='index')
    
    # Reorder columns.
    new_columns = [
        'id', 'original_title', 'imdb_id', 'genre_ids', 'rating', 'popularity',
        'release_date','budget', 'revenue', 'runtime', 'status', 'title']
    return movie_df[new_columns]
    

# Part 1: LGBT Movies

### Create Dataframes of Only LGBT Movies

In [9]:
# The number of pages you want. (Note: the max number of pages for the ratings comes from the total amount of
# pages that the API call will iterate through to get all movies regardless of their certification rating).
pages_requested = 67

# Get all pages for each US rating as dataframes by calling the function "get_movies()" 
# where "page_num" is variable n + 1. The list comprehension will iterate through the range 
# of "pages_requested" which is 12, adding 1 to each iteration to make the count 1-12. The get_ratings function
# uses the second variable to specify the rating needed in the URL parameter.
pages_lgbt = [get_movies(lgbt_url, n + 1, "") for n in range(pages_requested)]
pages_nr_lgbt = [get_movies(lgbt_url, n + 1, "NR") for n in range(pages_requested)]
pages_g_lgbt = [get_movies(lgbt_url, n + 1, "G") for n in range(pages_requested)]
pages_pg_lgbt = [get_movies(lgbt_url, n + 1, "PG") for n in range(pages_requested)]
pages_pg13_lgbt = [get_movies(lgbt_url, n + 1, "PG-13") for n in range(pages_requested)]
pages_r_lgbt = [get_movies(lgbt_url, n + 1, "R") for n in range(pages_requested)]
pages_nc17_lgbt = [get_movies(lgbt_url, n + 1, "NC-17") for n in range(pages_requested)]

# Combine the pages for each dataframe to single dataframe using the concat function (aka concatenate).
lgbt_movies_df = pd.concat(pages_lgbt)
lgbt_nr_df = pd.concat(pages_nr_lgbt)
lgbt_g_df = pd.concat(pages_g_lgbt) 
lgbt_pg_df = pd.concat(pages_pg_lgbt)
lgbt_pg13_df = pd.concat(pages_pg13_lgbt)
lgbt_r_df = pd.concat(pages_r_lgbt)
lgbt_nc17_df = pd.concat(pages_nc17_lgbt)

In [10]:
# Verify the length of each dataframe and that it was created correctly.
# (Note: there are no films in the G rating dataframe, meaning there are no LGBT films with a G rating.
# Therefore, g_lgbt_df will not be included in subsequent data wrangling).
print(len(lgbt_movies_df))
print(len(lgbt_nr_df))
print(len(lgbt_g_df))
print(len(lgbt_pg_df))
print(len(lgbt_pg13_df))
print(len(lgbt_r_df))
print(len(lgbt_nc17_df))
lgbt_movies_df.head()

1302
228
0
12
47
109
3


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/tRQDhEq23XuF0GRM5yItJFnlSTl.jpg,[18],793992,en,Three Months,"A darkly comedic, coming-of-age film, that tel...",6.566,/AoMOVVkuVctRLyjfzjrEnPploxp.jpg,2022-02-23,Three Months,False,0.0,0
1,False,/iB1Ghglr4kiFCJiwC4yxCvWaUJ6.jpg,[10749],929477,en,Heart Shot,Teenagers Nikki and Sam are in love and planni...,15.03,/ubaCN0FjWhxiu1uHhI9oXSgalVK.jpg,2022-02-17,Heart Shot,False,0.0,0
2,False,/kA81FJCNRH8AnGbjNAyuOvfUgmx.jpg,"[9648, 10749, 18]",937688,en,Temporal Fate,A time traveling trans woman turns back time t...,2.744,/uij3RuNSUNNPHgH9uqdd7bsER8L.jpg,2022-02-03,Temporal Fate,False,0.0,0
3,False,/qQFvUg92Uh1jRXCNgqo9LduBDFD.jpg,[18],920345,en,EMPATHY (Or: The Girl with The Pearled Hair),After running away from her emotionally abusiv...,1.988,/59C1l5dqtKuKuKy5qlE23PV4yC6.jpg,2022-01-25,EMPATHY (Or: The Girl with The Pearled Hair),False,0.0,0
4,False,/rzu9BVI9HqY5rG0Xq05ZhewdkYP.jpg,"[18, 35]",641934,en,Am I Ok?,Lucy and Jane have been best friends for most ...,5.086,,2022-01-24,Am I Ok?,False,7.8,3


### Create the Isolated LGBT Ratings Dataframes

In [11]:
lgbt_nr_df = isolate_ratings(lgbt_nr_df, "NR")
lgbt_pg_df = isolate_ratings(lgbt_pg_df, "PG")
lgbt_pg13_df = isolate_ratings(lgbt_pg13_df, "PG-13")
lgbt_r_df = isolate_ratings(lgbt_r_df, "R")
lgbt_nc17_df = isolate_ratings(lgbt_nc17_df, "NC-17")

# Verify with one dataframe that it was created correctly.
lgbt_r_df.head()

Unnamed: 0,id,rating
0,915164,R
1,857731,R
2,552269,R
3,591273,R
4,634544,R


In [12]:
# Create a list of the individual ratings dataframes that contain values.
ratings_df_list = [lgbt_nr_df, lgbt_pg_df, lgbt_pg13_df, lgbt_r_df, lgbt_nc17_df]

### Create the Additional Info for LGBT Movies Dataframe

In [13]:
# Create the dataframe.
lgbt_info_df = get_info(lgbt_movies_df)
# Verify the dataframe was created correctly.
lgbt_info_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/tRQDhEq23XuF0GRM5yItJFnlSTl.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,793992,tt5322004,en,Three Months,...,Post Production,It's amazing what you can discover when life g...,Three Months,False,0.0,0,,,,
1,False,/iB1Ghglr4kiFCJiwC4yxCvWaUJ6.jpg,,0,"[{'id': 10749, 'name': 'Romance'}]",https://www.netflix.com/title/81343173,929477,tt17162546,en,Heart Shot,...,Released,,Heart Shot,False,0.0,0,,,,
2,False,/kA81FJCNRH8AnGbjNAyuOvfUgmx.jpg,,100,"[{'id': 9648, 'name': 'Mystery'}, {'id': 10749...",http://youtu.be/t1WRlGT9OCM,937688,,en,Temporal Fate,...,Released,Everything's going to be okay.,Temporal Fate,False,0.0,0,,,,
3,False,/qQFvUg92Uh1jRXCNgqo9LduBDFD.jpg,,20000,"[{'id': 18, 'name': 'Drama'}]",,920345,,en,EMPATHY (Or: The Girl with The Pearled Hair),...,Released,work in symmetry,EMPATHY (Or: The Girl with The Pearled Hair),False,0.0,0,,,,
4,False,/rzu9BVI9HqY5rG0Xq05ZhewdkYP.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,641934,tt11225626,en,Am I Ok?,...,Released,,Am I Ok?,False,7.8,3,,,,


### Create the LGBT Genres Dataframe

In [44]:
lgbt_genres_df = get_genre(lgbt_movies_df)
lgbt_genres_df[1].head()

Unnamed: 0,id,genres
0,793992,[18]
1,929477,[10749]
2,937688,"[9648, 10749, 18]"
3,920345,[18]
4,641934,"[18, 35]"


In [48]:
lgbt_genres_df[0].columns = ['genre1', 'genre2', 'genre3', 'genre4', 'genre5', 'genre6', 'genres']
genre_test = lgbt_genres_df[0]

In [69]:
genre_test[['genre1', 'genres']]

Unnamed: 0,genre1,genres
0,Drama,[Drama]
1,Romance,[Romance]
2,Mystery,"[Mystery, Romance, Drama]"
3,Drama,[Drama]
4,Drama,"[Drama, Comedy]"
...,...,...
1297,Comedy,"[Comedy, Romance]"
1298,Drama,[Drama]
1299,,[]
1300,Documentary,[Documentary]


In [68]:
len(genre_test[genre_test.ne("Documentary").all(axis=1)])

1082

### Create the Cleaned and Combined LGBT Movies Dataframe

In [28]:
lgbt_movies_df = combine_clean_data(lgbt_movies_df, ratings_df_list, lgbt_info_df)

In [29]:
# Check to make the function ran and the dataframes joined correctly. (Calling a specific movie to verify)
print(len(lgbt_movies_df))
lgbt_movies_df.loc[lgbt_movies_df['title'] == "Rent"]

1302


Unnamed: 0,id,original_title,imdb_id,genre_ids,rating,popularity,release_date,budget,revenue,runtime,status,title
261,557648,Rent,tt6881910,"[10402, 10770, 18]",NR-Manual,7.783,2019-01-27,0,0,135,Released,Rent
1122,1833,Rent,tt0294870,"[18, 10749]",PG-13,17.697,2005-11-17,40000000,31670620,135,Released,Rent


In [30]:
# Check another section of the dataframe.
lgbt_movies_df.tail()

Unnamed: 0,id,original_title,imdb_id,genre_ids,rating,popularity,release_date,budget,revenue,runtime,status,title
1297,248757,Punks,tt0160710,"[35, 10749]",PG-13,1.102,2000-01-24,0,0,91,Released,Punks
1298,515728,Hitch,tt0191181,[18],NR-Manual,0.84,2000-01-01,0,0,18,Released,Hitch
1299,306484,Meet Joe Gay,tt1063334,[],NR-Manual,1.337,2000-01-01,0,0,25,Released,Meet Joe Gay
1300,262942,A Boy Named Sue,tt0297034,[99],NR-Manual,1.008,2000-01-01,0,0,69,Released,A Boy Named Sue
1301,49156,Take-out,tt0283033,[],NR-Manual,0.6,2000-01-01,0,0,37,Released,Take-out


In [67]:
lgbt_movies_df[lgbt_movies_df['genre_ids'] == [99]]

ValueError: ('Lengths must match to compare', (1302,), (1,))

### Create the lgbt CSVs

In [32]:
# Export the movies_df into a CSV file.
lgbt_movies_df.to_csv("../CSVs/lgbt_movies2.csv")
# lgbt_genres_df.to_csv("../CSVs/lgbt_genres.csv")

# Part 2: All Movies (2000-2022)

In [19]:
# Create a second URL that will allow the NR API to call through the remaining 56 pages of the call.
# Note: "NR" films are the only films that have more than 500 pages in the API call and the max number of pages the 
# call will return is 500 pages.

url2 = "https://api.themoviedb.org/3/discover/movie?api_key=" + tmdb_api_key + "&language=en-US&region=US&sort_by=primary_release_date.desc&certification_country=US&include_adult=false&include_video=false&primary_release_date.gte=2000-01-01&primary_release_date.lte=2004-09-21&with_original_language=en"

### Create Dataframes of All Movies

In [21]:
# The number of pages you want. (Note: the API can only parse through pages 500 at a time; the total amount of
# pages that the API call will iterate through to get all movies regardless of their certification rating).
pages_requested = 500

# Get all pages for each US rating as dataframes by calling the function "get_movies()" 
# where "page_num" is variable n + 1. The list comprehension will iterate through the range 
# of "pages_requested" which is 12, adding 1 to each iteration to make the count 1-12. The get_ratings function
# uses the second variable to specify the rating needed in the URL parameter.

# Note: there is no initial call for "all movies" as there are countless films that are counted as "movies", such 
# as music videos, that would be included in this API call. These films will not have received any sort of rating, 
# not even a true "NR" label. As such, the movies called will exclusively be movies that have a rating. They will be
# combined into a single dataframe below.

pages_nr1 = [get_movies(url, n + 1, "NR") for n in range(pages_requested)]
pages_nr2 = [get_movies(url2, n + 1, "NR") for n in range(pages_requested)]
pages_g = [get_movies(url, n + 1, "G") for n in range(pages_requested)]
pages_pg = [get_movies(url, n + 1, "PG") for n in range(pages_requested)]
pages_pg13 = [get_movies(url, n + 1, "PG-13") for n in range(pages_requested)]
pages_r = [get_movies(url, n + 1, "R") for n in range(pages_requested)]
pages_nc17 = [get_movies(url, n + 1, "NC-17") for n in range(pages_requested)]

# Combine the pages for each dataframe to single dataframe using the concat function (aka concatenate).
# Note: All individual ratings dataframes will be concatenated into a single dataframe titled "movies_df"
# Note: The "NR" pages dataframes must be concatenated into their own dataframes per 500 pages first
# and then into a new dataframe that contains all films that are not rated.
nr1_df = pd.concat(pages_nr1)
nr2_df = pd.concat(pages_nr2)
nr_df = pd.concat([nr1_df, nr2_df])

g_df = pd.concat(pages_g) 
pg_df = pd.concat(pages_pg)
pg13_df = pd.concat(pages_pg13)
r_df = pd.concat(pages_r)
nc17_df = pd.concat(pages_nc17)

In [22]:
# Concat all individual ratings dataframes into a single dataframe titled "movies_df"
movies_df = pd.concat([nr_df, g_df, pg_df, pg13_df, r_df, nc17_df], ignore_index=True)

In [23]:
# Verify the length of each dataframe and that it was created correctly.
print(len(movies_df))
print(len(nr_df))
print(len(g_df))
print(len(pg_df))
print(len(pg13_df))
print(len(r_df))
print(len(nc17_df))
movies_df.tail()

28673
11122
2507
3072
4383
7352
237


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
28668,False,,"[28, 53]",67932,en,Doomsdayer,"While seconds tick away, the faith of the worl...",1.731,/u2g68hWL8s7a02aVrDFjUPNBQ83.jpg,2001-01-01,Doomsdayer,False,4.5,7
28669,False,,"[35, 27, 9648, 28]",892259,en,Hunks on Haunted Hill,Nine young hardbodies are trapped together for...,1.463,/qBQ245aX4i7rEqw7sL1hgKUS7Ld.jpg,2000-10-31,Hunks on Haunted Hill,False,10.0,1
28670,False,,[9648],433111,en,Sex Files: Ancient Desires,A tomb raider reawakens an ancient female mumm...,1.336,,2000-10-14,Sex Files: Ancient Desires,False,4.0,2
28671,False,,"[35, 53]",9892,en,Stranger than Fiction,Lives spiral out of control when four friends ...,3.019,,2000-10-17,Stranger than Fiction,False,5.7,11
28672,False,,[53],592696,en,"Sex, Secrets & Betrayals",Maggie takes matters into her hands when her b...,1.005,/KfXqBOT0OG7cs0hCY7QrinZwLQ.jpg,2000-01-01,"Sex, Secrets & Betrayals",False,1.0,1


In [24]:
# Check for duplicates in the movies_df.
len(movies_df[movies_df.id.duplicated()])

70

In [25]:
# Drop duplicated movies that happened during the concatenation.
movies_df = movies_df.drop_duplicates(subset=['id'], keep='last').reset_index().drop(columns='index')

print(len(movies_df))
movies_df.tail()

### Create the Isolated Ratings Dataframes

In [27]:
nr_df = isolate_ratings(nr_df, "NR")
g_df = isolate_ratings(g_df, "G")
pg_df = isolate_ratings(pg_df, "PG")
pg13_df = isolate_ratings(pg13_df, "PG-13")
r_df = isolate_ratings(r_df, "R")
nc17_df = isolate_ratings(nc17_df, "NC-17")

# Verify with one dataframe that it was created correctly.
r_df.head()

In [29]:
# Create a list of the individual ratings dataframes that contain values.
ratings_df_list = [nr_df, g_df, pg_df, pg13_df, r_df, nc17_df]

### Create the Additional Info Dataframe

In [30]:
# Create the dataframe.
# Note: In order to not have the call timeout I split it into several calls becuse of how large the data set is.
info_df1 = get_info(movies_df.loc[0:2500])
info_df2 = get_info(movies_df.loc[2501:5000])
info_df3 = get_info(movies_df.loc[5001:7500])
info_df4 = get_info(movies_df.loc[7501:10000])

In [32]:
info_df5 = get_info(movies_df.loc[10001:12500])
info_df6 = get_info(movies_df.loc[12501:15000])
info_df7 = get_info(movies_df.loc[15001:17500])
info_df8 = get_info(movies_df.loc[17501:20000])

In [34]:
info_df9 = get_info(movies_df.loc[20001:22500])
info_df10 = get_info(movies_df.loc[22501:25000])
info_df11 = get_info(movies_df.loc[25001:27500])
info_df12 = get_info(movies_df.loc[27501:28575])

In [35]:
info_df = pd.concat(
    [info_df1, info_df2, info_df3, info_df4,info_df5,
    info_df6, info_df7, info_df8, info_df9, info_df10, info_df11, info_df12],
    ignore_index=True
)

In [37]:
# Verify the dataframe was created correctly.
info_df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,,,0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",,925899,,en,"Oh, Detective!",...,In Production,March 17th. Triple Homicide.,"Oh, Detective!",False,0.0,0,,,,
1,False,,,0,[],,933726,tt14972090,en,Weredeer,...,In Production,,Weredeer,False,0.0,0,,,,
2,False,/sYmXL2iSUbQLzRcFgsePOYgpFyg.jpg,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,865370,,en,The Last Mark,...,Released,Killing is simple. Until it gets complicated.,The Last Mark,False,0.0,0,,,,
3,False,/h4NnYzsl4MenvhOfm3DHD495ruX.jpg,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,624798,tt10308878,en,Big Gold Brick,...,Post Production,A 'cerebral' comedy.,Big Gold Brick,False,0.0,0,,,,
4,False,/wVTtQ2ZcfsNwDmTlT4GrrDRpS8b.jpg,,0,"[{'id': 12, 'name': 'Adventure'}, {'id': 10749...",https://lotawanamovie.com,879531,tt4664346,en,Lotawana,...,Post Production,Let go.,Lotawana,False,0.0,0,,,,


### Create the Cleaned and Combined Movies Dataframe

In [38]:
movies_df = combine_clean_data(movies_df, ratings_df_list, info_df)

In [39]:
# Check to make the function ran and the dataframes joined correctly. (Calling a specific movie to verify)
print(len(movies_df))
movies_df.loc[movies_df['title'] == "Rent"]

28603


Unnamed: 0,id,original_title,imdb_id,genre_ids,genres,rating,popularity,release_date,budget,revenue,runtime,status,title
23597,1833,Rent,tt0294870,"[18, 10749]","[Drama, Romance]",PG-13,19.054,2005-11-17,40000000,31670620,135,Released,Rent


In [40]:
# Check another section of the dataframe.
movies_df.loc[12560:12565]

Unnamed: 0,id,original_title,imdb_id,genre_ids,genres,rating,popularity,release_date,budget,revenue,runtime,status,title
12560,268321,Squatters,tt2359307,[18],[Drama],R,6.194,2014-05-14,0,0,102,Released,Squatters
12561,304547,Leave it to Beavers,,"[10751, 99]","[Family, Documentary]",G,1.525,2014-05-14,0,0,0,Released,Leave it to Beavers
12562,367403,Billy Zabka Saves The Universe,,"[878, 35]","[Science Fiction, Comedy]",NR,0.6,2014-05-14,0,0,25,Released,Billy Zabka Saves The Universe
12563,903091,Monkey,tt4063572,"[18, 9648, 878, 53]","[Drama, Mystery, Science Fiction, Thriller]",PG-13,0.625,2014-05-13,500,0,30,Released,Monkey
12564,96631,Piggy,tt1951218,[53],[Thriller],R,4.348,2014-05-13,900000,0,106,Released,Piggy
12565,297102,Scooby-Doo! Ghastly Goals,tt3807092,"[10751, 9648, 16, 35]","[Family, Mystery, Animation, Comedy]",NR,9.363,2014-05-13,0,0,22,Released,Scooby-Doo! Ghastly Goals


In [41]:
len(movies_df.columns)

13

### Create the Genres Dataframe

In [115]:
genres_df = get_genre(movies_df)
genres_df.head()

In [119]:
lgbt_genres_df.columns = ['genre1', 'genre2', 'genre3', 'genre4', 'genre5', 'genre6', 'genre7', 'genre8', 'genre9', 'genres']
genres_df.head()

In [53]:
# movies_df.loc[movies_df['title'] == "'Silent Running': A Conversation with Bruce Dern, 'Lowell Freeman'"]

Unnamed: 0,id,original_title,imdb_id,genre_ids,genres,rating,popularity,release_date,budget,revenue,runtime,status,title
26862,861081,'Silent Running': A Conversation with Bruce De...,tt0795533,[99],[Documentary],NR,0.6,2002-05-21,0,0,11,Released,'Silent Running': A Conversation with Bruce De...


### Create the movies CSV

In [44]:
# Export the movies_df into a CSV file.
movies_df.to_csv("../CSVs/movies.csv")
genres_df.to_csv("../CSVs/genres.csv")