In [14]:
!pip install yt_dlp pydub git+https://github.com/openai/whisper.git git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-w_55qod0
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-w_55qod0
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-eghtq4ya
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-eghtq4ya
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadat

In [17]:
import os
import cv2
import whisper
import numpy as np
from moviepy.editor import VideoFileClip
from pathlib import Path
from pydub import AudioSegment
import torch
from transformers import BertTokenizer, BertModel
from PIL import Image
import clip

In [12]:
# Input and output locations
data_path = "/kaggle/input/dataset-correlation/videos"  # directory holding the source videos
output_path = "dataset_preprocessado"
frames_dir, audios_dir, transcripts_dir = (
    os.path.join(output_path, subfolder)
    for subfolder in ("frames", "audios", "transcricoes")
)

# Make sure every output directory exists before anything is written
for directory in (frames_dir, audios_dir, transcripts_dir):
    os.makedirs(directory, exist_ok=True)

def extract_frames(video_path, output_folder, frame_rate=1):
    """Save one frame of the video every ``frame_rate`` seconds as a JPEG.

    Frames are written to ``output_folder`` as ``frame_000.jpg``,
    ``frame_001.jpg``, ... in the order they are sampled.

    Args:
        video_path: Path of the video handed to ``cv2.VideoCapture``.
        output_folder: Directory that receives the JPEG files (this
            function does not create it).
        frame_rate: Sampling period in seconds between saved frames.
            Despite the name, this is not a frames-per-second value.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps > 0:
            # Round instead of truncating so fractional rates such as
            # 29.97 fps map to the nearest whole-frame step, and clamp to 1
            # so a sub-frame period can never yield a step of 0.
            interval = max(1, round(fps * frame_rate))
        else:
            # No usable FPS was reported; keep every frame rather than
            # failing on a modulo by zero below.
            interval = 1

        count = 0
        frame_id = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if count % interval == 0:
                frame_filename = os.path.join(output_folder, f"frame_{frame_id:03d}.jpg")
                cv2.imwrite(frame_filename, frame)
                frame_id += 1

            count += 1
    finally:
        # Release the capture even if reading or writing raised.
        cap.release()
    print(f"Frames extraídos e salvos em {output_folder}")

def extract_audio(video_path, audio_path):
    """Extract the audio track of a video and save it at ``audio_path``.

    The track is first written with MoviePy, then re-read with pydub and
    re-exported as WAV to the same path.

    Args:
        video_path: Path of the source video.
        audio_path: Destination path of the audio file (callers in this
            notebook pass a ``.wav`` path).

    Raises:
        ValueError: If the video has no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"O vídeo não possui faixa de áudio: {video_path}")
        audio.write_audiofile(audio_path)
    finally:
        # Always close the clip so its underlying readers are released.
        video.close()

    # The file written above is the extracted audio, not an MP4 container,
    # so no input format is forced here.
    sound = AudioSegment.from_file(audio_path)
    sound.export(audio_path, format="wav")

# Whisper models already loaded in this session, keyed by model name.
_WHISPER_MODEL_CACHE = {}


def _load_whisper_model(name="base"):
    """Return the Whisper model ``name``, loading it only on first use."""
    if name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[name] = whisper.load_model(name)
    return _WHISPER_MODEL_CACHE[name]


def transcribe_audio(audio_path, transcript_path, model=None):
    """Transcribe an audio file with Whisper and save the text.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.
        transcript_path: Path of the UTF-8 text file to write.
        model: Optional already-loaded Whisper model. When omitted, the
            ``"base"`` model is loaded once and reused on later calls
            instead of being reloaded for every file.
    """
    if model is None:
        model = _load_whisper_model("base")
    result = model.transcribe(audio_path)
    with open(transcript_path, "w", encoding="utf-8") as f:
        f.write(result["text"])
    print(f"Transcrição salva em {transcript_path}")

# Run the full preprocessing pipeline on every .mp4 in the input directory
video_files = [entry for entry in os.listdir(data_path) if entry.endswith(".mp4")]

for video_file in video_files:
    video_path = os.path.join(data_path, video_file)
    video_name = os.path.splitext(video_file)[0]

    # Each video gets its own sub-folder of frames
    video_frames_dir = os.path.join(frames_dir, video_name)
    os.makedirs(video_frames_dir, exist_ok=True)

    # Output files are named after the video
    audio_path = os.path.join(audios_dir, f"{video_name}.wav")
    transcript_path = os.path.join(transcripts_dir, f"{video_name}.txt")

    print(f"Processando {video_file}...")
    # Frames first, then the audio track, then its transcript
    extract_frames(video_path, video_frames_dir)
    extract_audio(video_path, audio_path)
    transcribe_audio(audio_path, transcript_path)

print("Pré-processamento concluído!")

Processando Video12.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video12
MoviePy - Writing audio in dataset_preprocessado/audios/Video12.wav


                                                                    

MoviePy - Done.


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 70.8MiB/s]


Transcrição salva em dataset_preprocessado/transcricoes/Video12.txt
Processando Video15.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video15
MoviePy - Writing audio in dataset_preprocessado/audios/Video15.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video15.txt
Processando Video7.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video7
MoviePy - Writing audio in dataset_preprocessado/audios/Video7.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video7.txt
Processando Video13.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video13
MoviePy - Writing audio in dataset_preprocessado/audios/Video13.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video13.txt
Processando Video4.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video4
MoviePy - Writing audio in dataset_preprocessado/audios/Video4.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video4.txt
Processando Video6.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video6
MoviePy - Writing audio in dataset_preprocessado/audios/Video6.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video6.txt
Processando modified_Video1.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/modified_Video1
MoviePy - Writing audio in dataset_preprocessado/audios/modified_Video1.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/modified_Video1.txt
Processando modified_Video8.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/modified_Video8
MoviePy - Writing audio in dataset_preprocessado/audios/modified_Video8.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/modified_Video8.txt
Processando Video5.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video5
MoviePy - Writing audio in dataset_preprocessado/audios/Video5.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video5.txt
Processando Video11.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video11
MoviePy - Writing audio in dataset_preprocessado/audios/Video11.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video11.txt
Processando Video3.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video3
MoviePy - Writing audio in dataset_preprocessado/audios/Video3.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video3.txt
Processando Video14.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video14
MoviePy - Writing audio in dataset_preprocessado/audios/Video14.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video14.txt
Processando Video10.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video10
MoviePy - Writing audio in dataset_preprocessado/audios/Video10.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video10.txt
Processando Video9.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video9
MoviePy - Writing audio in dataset_preprocessado/audios/Video9.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video9.txt
Processando Video2.mp4...
Frames extraídos e salvos em dataset_preprocessado/frames/Video2
MoviePy - Writing audio in dataset_preprocessado/audios/Video2.wav


                                                                    

MoviePy - Done.
Transcrição salva em dataset_preprocessado/transcricoes/Video2.txt
Pré-processamento concluído!


In [19]:
# Locations of the preprocessed data
TRANSCRIPTIONS_PATH = "/kaggle/working/dataset_preprocessado/transcricoes"
FRAMES_PATH = "/kaggle/working/dataset_preprocessado/frames"
OUTPUT_EMBEDDINGS_PATH = "/kaggle/working/dataset_preprocessado/embeddings"

# Make sure the directory that receives the embeddings exists
os.makedirs(OUTPUT_EMBEDDINGS_PATH, exist_ok=True)

# Use CUDA when torch reports it as available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained models onto the selected device
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(device).eval()

def extract_text_embedding(text, model, tokenizer):
    """Return the embedding of ``text`` as a NumPy array.

    The text is tokenized (padded, truncated to at most 512 tokens), moved
    to the module-level ``device`` and run through ``model`` without
    gradient tracking. The result is the hidden state at sequence
    position 0 of ``last_hidden_state``, squeezed and copied to the CPU.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of the sequence axis is kept as the text representation
    first_token = hidden_states[:, 0, :]
    return first_token.squeeze().cpu().numpy()

def extract_image_embedding(image_path, model):
    """Return the image embedding of one image file as a NumPy array.

    The image is opened with PIL, converted to RGB, passed through the
    module-level ``clip_preprocess`` transform, given a leading batch axis
    and moved to the module-level ``device``. ``model.encode_image`` is
    then run without gradient tracking and its output is squeezed and
    copied to the CPU.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    batch = clip_preprocess(rgb_image).unsqueeze(0)
    batch = batch.to(device)

    with torch.no_grad():
        features = model.encode_image(batch)

    return features.squeeze().cpu().numpy()

# Embed every transcript and save one vector per video
text_embeddings = {}
for file in os.listdir(TRANSCRIPTIONS_PATH):
    if file.endswith(".txt"):
        # Strip only the trailing extension; str.replace would also remove
        # any earlier ".txt" occurrence inside the file name.
        video_name = file[: -len(".txt")]
        with open(os.path.join(TRANSCRIPTIONS_PATH, file), "r", encoding="utf-8") as f:
            text = f.read()
        embedding = extract_text_embedding(text, bert_model, tokenizer)
        np.save(os.path.join(OUTPUT_EMBEDDINGS_PATH, f"{video_name}_text.npy"), embedding)
        text_embeddings[video_name] = embedding

# Embed every frame of each video and average them into one vector per video
image_embeddings = {}
for video_folder in os.listdir(FRAMES_PATH):
    video_path = os.path.join(FRAMES_PATH, video_folder)
    if not os.path.isdir(video_path):
        continue

    embeddings_list = [
        extract_image_embedding(os.path.join(video_path, img_file), clip_model)
        for img_file in sorted(os.listdir(video_path))
        if img_file.endswith(".jpg")
    ]

    if not embeddings_list:
        # Averaging an empty list yields NaN, which would be saved silently
        # as this video's embedding; skip the folder instead.
        print(f"Aviso: nenhum frame encontrado em {video_path}; vídeo ignorado.")
        continue

    video_embedding = np.mean(embeddings_list, axis=0)  # mean of the video's frame embeddings
    np.save(os.path.join(OUTPUT_EMBEDDINGS_PATH, f"{video_folder}_image.npy"), video_embedding)
    image_embeddings[video_folder] = video_embedding

print("Extração de embeddings concluída!")

Extração de embeddings concluída!


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

In [21]:
# Directory holding the saved embeddings
EMBEDDINGS_PATH = "/kaggle/working/dataset_preprocessado/embeddings"

# Embeddings keyed by video name
text_embeddings = {}
image_embeddings = {}

# Load the text embeddings (files ending in "_text.npy")
for file in os.listdir(EMBEDDINGS_PATH):
    if not file.endswith("_text.npy"):
        continue
    video_name = file.replace("_text.npy", "")
    text_embeddings[video_name] = np.load(os.path.join(EMBEDDINGS_PATH, file))

# Load the image embeddings (files ending in "_image.npy")
for file in os.listdir(EMBEDDINGS_PATH):
    if not file.endswith("_image.npy"):
        continue
    video_name = file.replace("_image.npy", "")
    image_embeddings[video_name] = np.load(os.path.join(EMBEDDINGS_PATH, file))

# Report how many embeddings of each kind were found
print(f"Carregados {len(text_embeddings)} embeddings textuais e {len(image_embeddings)} embeddings visuais.")

Carregados 15 embeddings textuais e 15 embeddings visuais.
