In [1]:
import pandas as pd
import numpy as np
import requests
#import sys
import os
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Load variables from a local .env file into the process environment so that
# Movie_ETL.__init__ can read TMDB_API_KEY through os.getenv.
load_dotenv()

True

In [47]:
class Movie_ETL:
    """ETL pipeline for TMDB trending-movie data.

    Extract: downloads trending movies and the movie genre list from the
    TMDB REST API.
    Intermediate load: stores the raw frames as parquet under ``data_lake/``.
    Transform: reformats the release date, drops the image path columns and
    produces one row per (movie, genre) pair with the genre name attached.
    Final load: stores the transformed frame under ``data_warehouse/``.
    """

    def __init__(self) -> None:
        # API credential read from the environment; sent as a Bearer token.
        self._api_key = os.getenv("TMDB_API_KEY")
        self._urlsystem = 'PASTA/'
        self.url = "https://api.themoviedb.org/3/trending/movie/day?language=en-US"
        self.url_genre = "https://api.themoviedb.org/3/genre/movie/list?language=en"
        self.url_people = "https://api.themoviedb.org/3/trending/person/day?language=en-US"

    def _get(self, url, params=None):
        """Send an authenticated GET request and return the raw response.

        Args:
            url: Endpoint to call (may already contain a query string).
            params: Optional extra query parameters; ``requests`` appends
                them to any query string already present in ``url``.
        """
        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {self._api_key}",
        }
        # A timeout keeps the notebook from hanging forever on a stalled socket.
        return requests.get(url, headers=headers, params=params, timeout=30)

    def _fetch_pages(self, url, pages=range(1, 5)):
        """Collect the ``results`` entries of several pages of one endpoint.

        Stops at the first page whose status code is not 200 (after printing
        the error) and returns whatever was gathered up to that point.

        Args:
            url: Paginated endpoint.
            pages: Iterable of page numbers to request, in order.

        Returns:
            list: The concatenated ``results`` records of the fetched pages.
        """
        records = []
        for page in pages:
            # The page number goes through ``params`` so it is joined to the
            # existing ``?language=...`` query with ``&`` instead of a second ``?``.
            response = self._get(url, params={"page": page})

            if response.status_code != 200:
                print(f"Erro na página {page}: {response.status_code}")
                break

            records.extend(response.json()['results'])
        return records

    def _write_parquet(self, frame, path):
        """Write ``frame`` to ``path`` as parquet, creating the folder if needed."""
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        frame.to_parquet(path)

    def extract(self):
        """Download pages 1-4 of the trending-movies endpoint.

        Returns:
            pandas.DataFrame: One row per movie record returned by the API;
            empty when the first page already failed.
        """
        return pd.DataFrame(self._fetch_pages(self.url))

    def extract_genre(self):
        """Download the movie genre list.

        Returns:
            pandas.DataFrame: The records of the ``genres`` key of the
            response. When the request does not return status 200 the error
            is printed and an empty frame with columns ``id`` and ``name``
            is returned instead of failing on the missing key.
        """
        response = self._get(self.url_genre)

        if response.status_code != 200:
            print(f"Erro na página = {response.status_code}")
            return pd.DataFrame(columns=['id', 'name'])

        return pd.DataFrame(response.json()['genres'])

    def extract_popularity(self):
        """Download trending movies reduced to id, title and popularity.

        Returns:
            pandas.DataFrame: Columns ``id``, ``title`` and ``popularity``
            (present even when nothing could be fetched).
        """
        # NOTE(review): this reads the movie endpoint; ``self.url_people`` is
        # defined but unused. Person records may not carry a ``title`` field,
        # so confirm which endpoint is intended before switching.
        records = self._fetch_pages(self.url)
        # Passing ``columns`` keeps the three fields and still yields a
        # well-formed frame when ``records`` is empty.
        return pd.DataFrame(records, columns=['id', 'title', 'popularity'])

    def intermediate_load_extract(self):
        """Store the raw trending-movies frame in the data lake."""
        self._write_parquet(self.extract(), 'data_lake/Movies_sem_tratamento.parquet')

    def intermediate_load_genre(self):
        """Store the raw genre frame in the data lake."""
        self._write_parquet(self.extract_genre(), 'data_lake/Genres_sem_tratamento.parquet')

    def intermediate_load_popularity(self):
        """Store the id/title/popularity frame in the data lake."""
        # Uses extract_popularity(); calling extract() here would just write a
        # second copy of the full movie frame.
        self._write_parquet(self.extract_popularity(), 'data_lake/Popularity_sem_tratamento.parquet')

    def transform(self):
        """Build the treated movie table from the data-lake parquet files.

        Returns:
            pandas.DataFrame: One row per (movie, genre) pair. ``release_date``
            is a ``dd/mm/YYYY`` string, ``backdrop_path`` and ``poster_path``
            are removed, the movie id is named ``id_movie`` and the genre
            name is named ``genre``.
        """
        # Read the raw parquet files written by the intermediate loads.
        movies = pd.read_parquet("data_lake/Movies_sem_tratamento.parquet")
        genres = pd.read_parquet("data_lake/Genres_sem_tratamento.parquet")

        movies['release_date'] = pd.to_datetime(movies['release_date']).dt.strftime('%d/%m/%Y')

        movies = movies.drop(columns=['backdrop_path', 'poster_path'])
        movies = movies.explode(column='genre_ids')
        # A movie with an empty genre list explodes to a missing value; such a
        # row cannot match any genre in the inner join below, so drop it up
        # front and give the key the same integer dtype as the genre id.
        movies = movies.dropna(subset=['genre_ids']).astype({'genre_ids': 'int64'})

        # Both frames have an ``id`` column, hence the ``_x`` / ``_y`` suffixes.
        merged = movies.merge(genres, left_on='genre_ids', right_on='id')
        merged = merged.drop(columns='id_y')
        return merged.rename(columns={'name': 'genre', 'id_x': 'id_movie'})

    def final_load(self):
        """Transform the data-lake files and store the result in the warehouse."""
        self._write_parquet(self.transform(), 'data_warehouse/movies_tratados.parquet')

    def run(self):
        """Run the whole pipeline: three intermediate loads, then the final load.

        Each load method performs its own extraction (and ``final_load`` its
        own transform), so those steps are not invoked separately here.
        """
        self.intermediate_load_extract()
        self.intermediate_load_genre()
        self.intermediate_load_popularity()
        self.final_load()


In [48]:
# Build the pipeline object and execute every stage end to end.
movie1 = Movie_ETL()
# Movie_ETL.run has no return statement, so `run` is bound to None here.
run = movie1.run()