Neste notebook, extrairemos informações de uma lista de vídeos do YouTube utilizando pandas, a API do YouTube e a biblioteca pytube. O DataFrame resultante será salvo em formato Parquet e enviado para um bucket no S3.


Fase inicial: importações e autenticação

In [None]:
#Installing libraries: pytube, pandas, pytest and boto3
#NOTE(review): this comment used to list deltalake, but the package installed below is pytest; deltalake is never installed in this cell — confirm which one was intended.

!pip install pytube
!pip install pandas
!pip install pytest
!pip install boto3


Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0
Collecting boto3
  Downloading boto3-1.28.0-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.7/135.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.32.0,>=1.31.0 (from boto3)
  Downloading botocore-1.31.0-py3-none-any.whl (11.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0 (from boto3)
  Downloading s3transfer-0.6.1-py3

In [None]:
#Importações

from googleapiclient.discovery import build

import pandas as pd

from IPython.display import JSON

import os

from pytube import YouTube, Search, Channel, Playlist, extract

from getpass import getpass

import boto3

from getpass import getpass



Autenticação

In [None]:

#Basic API settings.
#The API key is a credential and must not be stored in the notebook: it is read from
#the YOUTUBE_API_KEY environment variable, falling back to a hidden interactive prompt.
#NOTE(review): the key that used to be hardcoded on this line was exposed to anyone
#with access to the notebook and should be revoked/rotated.
api_key = os.environ.get('YOUTUBE_API_KEY') or getpass('Enter your YouTube Data API key:')
api_service_name = "youtube"
api_version = "v3"

#Build the API client used by every request in the rest of the notebook
youtube_client_obj = build(api_service_name, api_version, developerKey=api_key)


Fase de execução: classe e métodos

Construindo os dados do projeto

In [None]:
from typing import List
import pandas as pd

class YouTubeVideoDetails:
    """Collect per-video metadata through a YouTube Data API client.

    The constructor receives an already-built API client (the authentication
    object created earlier in the notebook) and keeps it for later requests.
    """

    # Output columns, in order. Declared once so the DataFrame has the same
    # schema even when no video is returned (e.g. an empty id list).
    _COLUMNS = [
        'video_id',
        'channel_title',
        'title',
        'description',
        'tags',
        'published_at',
        'view_count',
        'like_count',
        'favorite_count',
        'comment_count',
        'duration',
        'definition',
        'caption',
    ]

    def __init__(self, youtube_client):
        """Store the API client used to issue ``videos().list`` requests."""
        self.youtube = youtube_client

    def _fetch_video_details(self, video_ids: List[str], batch_size: int = 50) -> List[dict]:
        """Request the details of the given videos, one batch of ids at a time.

        Parameters:
          video_ids: list of strings holding the video ids.
          batch_size: how many ids are sent per request. Defaults to 50, the
            step the notebook has always used (the documented per-request limit
            of ``videos.list`` — see the API reference before raising it).

        Returns:
          A list with one dictionary per video returned by the API, holding the
          fields listed in ``_COLUMNS``. Fields absent from the response are None.

        Raises:
          ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Materialise the ids so that slicing works for any iterable input.
        video_ids = list(video_ids)
        video_info_list = []

        for start in range(0, len(video_ids), batch_size):
            request = self.youtube.videos().list(
                part="snippet,contentDetails,statistics",
                id=','.join(video_ids[start:start + batch_size])
            )
            response = request.execute()

            # A response without 'items' is treated as "no videos" instead of
            # raising KeyError and discarding everything fetched so far.
            for video in response.get('items', []):
                # Any of the requested parts may be missing for a given item;
                # fall back to an empty mapping so the remaining fields survive.
                snippet = video.get('snippet') or {}
                statistics = video.get('statistics') or {}
                content_details = video.get('contentDetails') or {}

                video_info_list.append({
                    'video_id': video['id'],
                    'channel_title': snippet.get('channelTitle'),
                    'title': snippet.get('title'),
                    'description': snippet.get('description'),
                    'tags': snippet.get('tags'),
                    'published_at': snippet.get('publishedAt'),
                    'view_count': statistics.get('viewCount'),
                    'like_count': statistics.get('likeCount'),
                    'favorite_count': statistics.get('favoriteCount'),
                    'comment_count': statistics.get('commentCount'),
                    'duration': content_details.get('duration'),
                    'definition': content_details.get('definition'),
                    'caption': content_details.get('caption'),
                })

        return video_info_list

    def get_video_details(self, video_ids: List[str]) -> pd.DataFrame:
        """Return the details of the given videos as a pandas DataFrame.

        Parameters:
          video_ids: list of strings holding the video ids.

        Returns:
          A DataFrame with one row per video returned by the API and the
          columns listed in ``_COLUMNS`` (also present when there are no rows).
        """
        video_info_list = self._fetch_video_details(video_ids)
        return pd.DataFrame(video_info_list, columns=self._COLUMNS)



Fase final: gerando ids e salvando o dataframe pandas como delta lake

Gerando video ids e retornando o dataframe pandas

In [None]:
#Playlists whose videos will be inspected
play_complete = [
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3qQB5Pz6ZSTTDLu0BjAJYNf',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3oNcr8es3ov4_4DF8K0Ps6-',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3oeP7PBttxM7esceVXD63_v',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3pXjOUhfPVH3EhW4WMHVYPh',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3rJk6UKP_npByDuE7v1WSdt',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3pQwRz1FORZdChMaNZaR3pu',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3oZGs7Z_sCtp4ND_FLqTssn',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3pdP0S8WTmL5tKgPSZb-rME',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3pKAZxzSqWTfWRyPFHmSS5e',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3pXDttKKp8mlVKDitxsYDAp',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3rYbRYgJbM1ifITbNkqaTsM',
    'https://www.youtube.com/playlist?list=PL1v8zpldgH3pR7LPuidEZK68kS6AaU1y7',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6Ec-XTbcX1uRg2_u4xOEky0',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6Hn0vK8co82zjQtt3T2Nkqc',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6E7jZ9sN_xHwSHOdjUxUW_b',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6FcbHlDzbVzf3TVgxzxK7lr',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6FIVXnB3nj6razI_m4PKoBC',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6Gqf52H_kkdKTRdn3JMBmDo',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6F6wUI9tvS_Gw1vaFAx6rd6',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6Gl29AoE31iwdVwSG-KnDzF',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6Hmo-Hbqp00dRCrDcOV5AYr',
    'https://www.youtube.com/playlist?list=PLkDaE6sCZn6GMoA0wbpJLi3t34Gd8l0aK',
]

video_id = []

#Walk every playlist once. The list above contains a repeated URL, so it is
#de-duplicated (keeping first-seen order) to avoid fetching that playlist twice.
for playlist in dict.fromkeys(play_complete):
    p = Playlist(playlist)
    for url in p.video_urls:
        #Append directly instead of binding a variable named `id`, which would
        #shadow the builtin for the rest of the notebook session.
        video_id.append(extract.video_id(url))

#A video can belong to more than one playlist; keep each id once (order preserved)
#so the resulting DataFrame has no duplicated rows.
video_id = list(dict.fromkeys(video_id))

#Instantiate the class with the authenticated client
youtube_client = youtube_client_obj
youtube_video_details = YouTubeVideoDetails(youtube_client)

df_videos = youtube_video_details.get_video_details(video_id)
print(df_videos.shape[0])




787


Montando o bucket S3 e salvando como delta lake

In [None]:
    #Prompt for the AWS credentials (hidden input, so they are not stored in the notebook)
    aws_access_key_id = getpass('Enter your AWS Access Key ID:')
    aws_secret_access_key = getpass('Enter your AWS Secret Access Key:')

    #Expose the credentials as environment variables before the S3 client is created
    os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
    os.environ['AWS_SECRET_ACCESS_KEY'] = aws_secret_access_key

    #Local staging directory. It is created when missing: to_parquet does not create
    #parent directories, so on a fresh runtime the write below would otherwise fail.
    local_parquet_dir = '/content/parquet'
    os.makedirs(local_parquet_dir, exist_ok=True)

    #S3 client and local Parquet export of the DataFrame
    #NOTE(review): this writes a plain Parquet file, not a Delta Lake table.
    s3_client = boto3.client('s3')
    df_videos.to_parquet(f'{local_parquet_dir}/video_details.parquet')

    #Destination bucket (constant, so it is set once outside the loop)
    bucket_name = 'youtube-video-details'

    #Upload every Parquet file found in the staging directory
    for file in os.listdir(local_parquet_dir):
      if file.endswith('.parquet'):
        #Object key inside the bucket, then the upload itself
        file_path = f'video_data/raw/parquet/{file}'
        s3_client.upload_file(f'{local_parquet_dir}/{file}', bucket_name, file_path)





Enter your AWS Access Key ID:··········
Enter your AWS Secret Access Key:··········
