In [1]:
#!/usr/bin/env python
# coding: utf-8

# importing modules

import pandas as pd
import requests
import json
import config

## Extract

In [2]:
# For this exercise, we request 6 movies with movie_id ranging from 550 to 555.
# Each movie is requested one at a time and its JSON record is appended to a list.

response_list = []
API_KEY = config.api_key
MOVIE_URL = 'https://api.themoviedb.org/3/movie/{}'
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection can hang the notebook

# Send one GET request per movie_id. The API answers with a JSON record for that movie.
for movie_id in range(550, 556):
    # Passing the key through `params` lets requests build and URL-encode the query string.
    response = requests.get(
        MOVIE_URL.format(movie_id),
        params={'api_key': API_KEY},
        timeout=REQUEST_TIMEOUT,
    )
    # Fail fast on HTTP errors (4xx/5xx): otherwise the error payload would be
    # stored in response_list as if it were a movie record.
    response.raise_for_status()
    response_list.append(response.json())

In [3]:
# The API responses are now collected in a plain Python list of long, unwieldy JSON records.
print(type(response_list))

# Build a pandas DataFrame straight from the list of records
# (each record becomes a row, each JSON key a column).
df = pd.DataFrame(response_list)

<class 'list'>


In [None]:
# Now we have the extracted data
df

## Transform

The genres column is a column of lists of JSON records, which is hard to read or quickly understand in this format. We want to expand this column out so we can easily see and make use of the internal records.

In [5]:
# Pull the genres column out of df as a plain Python list (one entry per row of df).
genres_list = df['genres'].tolist()

# It is a list of lists of dictionaries
#genres_list

# Example 1
# three dictionaries in one list
# List1[{Dic1},
#       {Dic2},
#       {Dic3}]
# So, it is a list of dictionaries

# Example 2 (our case)
# three sub-lists in one outer list, each holding three dictionaries
# List1[
#   SubList1[{Dic1},
#            {Dic2},
#            {Dic3}],
#   SubList2[{Dic1},
#            {Dic2},
#            {Dic3}],
#   SubList3[{Dic1},
#            {Dic2},
#            {Dic3}]]

# So, it is a list of lists of dictionaries
# (each inner list holds the genre dictionaries of one row)

In [7]:

# List-comprehension refresher
# (https://www.w3schools.com/python/python_lists_comprehension.asp):
# fruits = ["apple", "banana", "cherry", "kiwi", "mango"]
# newlist = [x for x in fruits if "a" in x]
# is shorthand for:
# newlist = []
# for x in fruits:
#     if "a" in x:
#         newlist.append(x)


# genres_list is a regular list of lists (a 2D list), so it can be flattened
# into one list of genre dictionaries by walking through each sub-list in turn.
flat_list = []
for movie_genres in genres_list:
    flat_list.extend(movie_genres)

# The loop above is equivalent to the nested list comprehension:
# flat_list = [item for sublist in genres_list for item in sublist]
# and to the fully spelled-out version:
# newlist = []
# for sublist in genres_list:
#     # newlist.append(sublist) here would keep the nesting (Example 2, our case)
#     for item in sublist:
#         newlist.append(item)  # appending each item gives a flat list (Example 1)
# newlist
# keep in mind that flat_list is equal to newlist

#flat_list

# To learn more about what a flat list is:
# https://stackabuse.com/python-how-to-flatten-list-of-lists/

In [None]:
#list(flat_list[0].values())[1]
# dict.values() returns a view that cannot be indexed directly, so wrap it in list() first: list(d.values())[index]

In [8]:
# Build a separate table for genres: first one row per genre dictionary in
# flat_list, then keep a single row for each distinct genre.
genre_rows = pd.DataFrame.from_records(flat_list)
df_genres = genre_rows.drop_duplicates()
df_genres

# The result is a table of the genre properties name and id

Unnamed: 0,id,name
0,18,Drama
1,53,Thriller
2,35,Comedy
3,28,Action
4,12,Adventure
7,10749,Romance
8,80,Crime
12,36,History


Creating a column of lists to explode out. 


In [9]:
# Collect the genre names of every entry in genres_list: one inner list of
# names per movie, in the same order as genres_list.
result = [[genre['name'] for genre in movie_genres] for movie_genres in genres_list]

# Store them in a temporary column, genres_all (a list of genre names per row),
# that can later be expanded out into a separate column for each genre.
# DataFrame.assign() returns a new object (a copy) with the new column added to the original ones.
df = df.assign(genres_all=result)

In [None]:
#result

In [None]:
#df['genres_all']

In [None]:
# for l in genres_list:
#     r = []
#     for d in l:
#         r.append(d['name'])
#     result.append(r)

#flat_list = [item for sublist in genres_list for item in sublist]

#res_new = [d['name'] for l in genres_list for d  in l]   
#res_new 
# Unlike the loop above, this builds one flat list of names instead of one list of names per movie

In [None]:
#df

In [10]:
# Genre names taken from df_genres; each one is the name of a genre column
df_genre_columns = df_genres['name'].to_list()

# Columns to select from the main dataframe: the fixed movie attributes
# followed by the genre columns
df_columns = [
    'budget',
    'id',
    'imdb_id',
    'original_title',
    'release_date',
    'revenue',
    'runtime',
] + df_genre_columns

In [18]:
# Explode the per-movie genre lists into one row per (movie, genre) pair.
# The movie's index label is repeated for each of its genres.
s = df['genres_all'].explode()

# One-hot encode: count the genre occurrences per movie index label.
# A movie with an empty genre list explodes to NaN and would be missing from the
# crosstab; reindexing on df.index keeps it as an all-zero row so the join
# below does not fill its genre columns with NaN.
genre_flags = pd.crosstab(s.index, s).reindex(df.index, fill_value=0)

# Join the genre columns onto the main table. Genre columns left over from a
# previous run of this cell are dropped first; without that, re-running the cell
# fails because the column names overlap.
# NOTE(review): this assumes no genre name equals an existing non-genre column name.
df = df.drop(columns=genre_flags.columns, errors='ignore').join(genre_flags)

In [28]:
# to understand it better

# it is a Series Object
#print(s) 

# crosstab - by default computes a frequency table of the factors 
#pd.crosstab(index = s.index, columns = s)

0        Drama
0     Thriller
0       Comedy
1       Action
1    Adventure
1     Thriller
2       Comedy
2      Romance
3        Crime
3        Drama
3     Thriller
4        Drama
4      History
4      Romance
4       Comedy
5     Thriller
Name: genres_all, dtype: object


genres_all,Action,Adventure,Comedy,Crime,Drama,History,Romance,Thriller
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,1,0,1,0,0,1
1,1,1,0,0,0,0,0,1
2,0,0,1,0,0,0,1,0
3,0,0,0,1,1,0,0,1
4,0,0,1,0,1,1,1,0
5,0,0,0,0,0,0,0,1


One-hot encoding: the column of lists was exploded out into one-hot categorical columns. This was done by creating a single column for each categorical value (genre) and setting the row value to 1 if the movie belongs to that category and 0 if it doesn’t. 

In [29]:
# Show the movies table: the selected movie attributes followed by the genre columns
df.loc[:, df_columns]

Unnamed: 0,budget,id,imdb_id,original_title,release_date,revenue,runtime,Drama,Thriller,Comedy,Action,Adventure,Romance,Crime,History
0,63000000,550,tt0137523,Fight Club,1999-10-15,100853753,139,1,1,1,0,0,0,0,0
1,5000000,551,tt0069113,The Poseidon Adventure,1972-12-13,84563118,117,0,1,0,1,1,0,0,0
2,0,552,tt0237539,Pane e tulipani,2000-03-03,8478434,114,0,0,1,0,0,1,0,0
3,10000000,553,tt0276919,Dogville,2003-05-19,16680836,178,1,1,0,0,0,0,1,0
4,0,554,tt0308476,Кукушка,2002-01-01,0,100,1,0,1,0,0,1,0,1
5,0,555,tt0442896,Absolut,2005-04-20,0,94,0,1,0,0,0,0,0,0


Notice the genre columns to the right. If a movie belongs to a genre, the row value is 1, and if not, the value is 0. Now it’s easy for us to filter on specific genres and to quickly tell if a movie belongs to a genre or not.

### Working with datetimes
Finally we’ll expand out the datetime column into a table. Pandas has built-in functions to extract specific parts of a datetime. Notice we need to convert the release_date column into a datetime first.

In [34]:
# Parse release_date into datetimes once, then derive the calendar parts from it.
release_dates = pd.to_datetime(df['release_date'])

# assign() replaces release_date with the parsed values and adds the
# datetime-related columns after the existing ones.
df = df.assign(
    release_date=release_dates,
    day=release_dates.dt.day,
    month=release_dates.dt.month,
    year=release_dates.dt.year,
    day_of_week=release_dates.dt.day_name(),
)

In [36]:
# Columns that make up the datetimes table: the movie id plus the date parts
df_time_columns = [
    'id',
    'release_date',
    'day',
    'month',
    'year',
    'day_of_week',
]

df.loc[:, df_time_columns]

Unnamed: 0,id,release_date,day,month,year,day_of_week
0,550,1999-10-15,15,10,1999,Friday
1,551,1972-12-13,13,12,1972,Wednesday
2,552,2000-03-03,3,3,2000,Friday
3,553,2003-05-19,19,5,2003,Monday
4,554,2002-01-01,1,1,2002,Tuesday
5,555,2005-04-20,20,4,2005,Wednesday


## Load
We ended up creating 3 tables for the tmdb schema that we’ll call movies, genres, and datetimes. 
We export our tables by writing them to file. This will create 3 .csv files in the same directory that our script is in.

In [None]:
# Write each of the three tables (movies, genres, datetimes) to its own CSV file.
# The file names are relative, so the files land in the current working directory.
# index=False leaves the DataFrame index out of the files.
df[df_columns].to_csv('tmdb_movies.csv', index=False)
df_genres.to_csv('tmdb_genres.csv', index=False)
df[df_time_columns].to_csv('tmdb_datetimes.csv', index=False)

In [38]:
# df

That’s it! We’ve created our first ETL pipeline. <br><br> We built a structured schema from a list of JSON records and transformed our data into a clean, usable format. <br>You can experiment with many different ways to explore and structure this data, as we have only scratched the surface here and used a small part of what we had available.

# Source

https://towardsdev.com/create-an-etl-pipeline-in-python-with-pandas-in-10-minutes-6be436483ec9

https://github.com/habibdraft/tmdb/blob/main/tmdb.py

<br>


## Relevant comments 
- This is not really an ETL, but you may improve your work using the pipe() function from pandas.