In [1]:
!pip install yt_dlp pydub git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-3rv76up0
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-3rv76up0
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting yt_dlp
  Downloading yt_dlp-2025.2.19-py3-none-any.whl.metadata (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.9/171.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting triton>=2 (from openai-whisper==20240930)
  Downloading triton-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading yt_dlp-2025.2.19-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [2]:
import os
import yt_dlp
import pandas as pd
import whisper
import cv2
import numpy as np
from pydub import AudioSegment
from moviepy.editor import VideoFileClip
from pathlib import Path

In [4]:
# Caminho do dataset
DATASET_PATH = "/kaggle/input/fakenwes-dataset/b5-correlaction-main"
OUTPUT_DIR = "preprocessed_data"

# Criar diretório de saída
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

def extract_frames(video_path, output_folder, fps=1):
    """Extrai frames do vídeo a cada 'fps' segundos usando OpenCV."""
    os.makedirs(output_folder, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(frame_rate / fps) if frame_rate > 0 else 1
    count = 0
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        if count % frame_interval == 0:
            frame_filename = os.path.join(output_folder, f'frame_{count:04d}.jpg')
            cv2.imwrite(frame_filename, frame)
        count += 1
    
    cap.release()
    cv2.destroyAllWindows()

def extract_audio(video_path, audio_path):
    """Extrai o áudio do vídeo e converte para WAV usando pydub."""
    video = VideoFileClip(video_path)
    audio = video.audio
    audio.write_audiofile(audio_path)
    sound = AudioSegment.from_file(audio_path, format="mp4")
    sound.export(audio_path, format="wav")

def transcribe_audio(audio_path):
    """Transcreve o áudio usando Whisper."""
    try:
        model = whisper.load_model("base")
        result = model.transcribe(audio_path)
        return result["text"]
    except AttributeError:
        raise ImportError("Erro ao carregar o modelo Whisper. Verifique a instalação com 'pip install openai-whisper'.")

In [5]:
if __name__ == "__main__":
    video_files = [f for f in os.listdir(DATASET_PATH) if f.endswith(".mp4")]
    
    for video_file in video_files:
        video_path = os.path.join(DATASET_PATH, video_file)
        video_name = os.path.splitext(video_file)[0]
        
        output_video_folder = os.path.join(OUTPUT_DIR, video_name)
        os.makedirs(output_video_folder, exist_ok=True)
        
        audio_path = os.path.join(output_video_folder, "audio.wav")
        frames_folder = os.path.join(output_video_folder, "frames")
        transcription_path = os.path.join(output_video_folder, "transcription.txt")
        
        print(f"Processando vídeo: {video_file}")
        print("Extraindo frames...")
        extract_frames(video_path, frames_folder)
        
        print("Extraindo áudio...")
        extract_audio(video_path, audio_path)
        
        print("Transcrevendo áudio...")
        transcription = transcribe_audio(audio_path)
        with open(transcription_path, "w") as f:
            f.write(transcription)
        
        print(f"Finalizado: {video_file}\n")
    
    print("Pré-processamento concluído!")

Processando vídeo: Video12.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video12/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...


100%|███████████████████████████████████████| 139M/139M [00:08<00:00, 17.6MiB/s]


Finalizado: Video12.mp4

Processando vídeo: Video15.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video15/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video15.mp4

Processando vídeo: Video7.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video7/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video7.mp4

Processando vídeo: Video13.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video13/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video13.mp4

Processando vídeo: Video4.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video4/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video4.mp4

Processando vídeo: Video1.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video1/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video1.mp4

Processando vídeo: Video6.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video6/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video6.mp4

Processando vídeo: Video5.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video5/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video5.mp4

Processando vídeo: Video11.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video11/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video11.mp4

Processando vídeo: Video3.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video3/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video3.mp4

Processando vídeo: Video14.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video14/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video14.mp4

Processando vídeo: Video10.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video10/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video10.mp4

Processando vídeo: Video8.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video8/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video8.mp4

Processando vídeo: Video9.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video9/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video9.mp4

Processando vídeo: Video2.mp4
Extraindo frames...
Extraindo áudio...
MoviePy - Writing audio in preprocessed_data/Video2/audio.wav


                                                                    

MoviePy - Done.
Transcrevendo áudio...
Finalizado: Video2.mp4

Pré-processamento concluído!
