## Importação das bibliotecas

In [1]:
!pip install git+https://github.com/openai/whisper.git git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-eh4tdjcv
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-eh4tdjcv
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-_opani86
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-_opani86
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting triton>=2 (from openai-whisper==20240930)
  Downloading triton-3.2.0-cp

In [15]:
import os
import cv2
from PIL import Image
import whisper
import torch
import clip
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score

In [3]:
# Configurações globais
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
VIDEO_DIR = "/kaggle/input/dataset-correlation/videos"
OUTPUT_DIR = "/kaggle/working"
METADATA_PATH = os.path.join(OUTPUT_DIR, "metadata.csv")

In [4]:
# Carregar modelos globalmente
WHISPER_MODEL = whisper.load_model("base", device=DEVICE)
CLIP_MODEL, CLIP_PREPROCESS = clip.load("ViT-B/32", device=DEVICE)
BERT_TOKENIZER = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
BERT_MODEL = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased').to(DEVICE)

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 102MiB/s]
  checkpoint = torch.load(fp, map_location=device)
100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 140MiB/s]


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

## Processamento de Vídeo

In [8]:
def extract_frames(video_path, frame_rate=1):
    """Extrai frames de um vídeo a uma taxa específica"""
    cap = cv2.VideoCapture(video_path)
    frames = []
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps / frame_rate)
    frame_count = 0
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            frames.append(frame)
        frame_count += 1
    cap.release()
    return frames

def transcribe_audio(video_path):
    """Transcreve o áudio do vídeo usando Whisper"""
    result = WHISPER_MODEL.transcribe(video_path)
    return result["text"]

## Extração de Embeddings

In [9]:
def get_visual_embeddings(frames):
    """Gera embeddings visuais com CLIP (média dos frames)"""
    embeddings = []
    
    for frame in frames:
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(frame_rgb)
        image_tensor = CLIP_PREPROCESS(pil_image).unsqueeze(0).to(DEVICE)
        
        with torch.no_grad():
            embedding = CLIP_MODEL.encode_image(image_tensor)
            embeddings.append(embedding.cpu().numpy().squeeze())
    
    return np.mean(embeddings, axis=0)  # Dimensão 512

def get_text_embeddings_bert(text):
    """Gera embeddings textuais com BERT (média dos tokens)"""
    inputs = BERT_TOKENIZER(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512
    ).to(DEVICE)
    
    with torch.no_grad():
        outputs = BERT_MODEL(**inputs)
    
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()  # Dimensão 768

In [11]:
# Criar metadados (caso ainda não exista)
videos = [
    ("modified_Video1.mp4", 0),
    ("modified_Video8.mp4", 0)
]
for i in range(2, 16):
    if i != 8:
        videos.append((f"Video{i}.mp4", 1))

metadata = pd.DataFrame(videos, columns=["video_path", "target"])
metadata.to_csv(METADATA_PATH, index=False)

# Carregar metadados
metadata = pd.read_csv(METADATA_PATH)

# Processar cada vídeo
all_data = []

for idx, row in metadata.iterrows():
    video_path = os.path.join(VIDEO_DIR, row["video_path"])
    
    if not os.path.exists(video_path):
        print(f"Vídeo não encontrado: {video_path}")
        continue
    
    # Processamento de vídeo
    frames = extract_frames(video_path)
    visual_embedding = get_visual_embeddings(frames)
    
    # Processamento de áudio
    transcription = transcribe_audio(video_path)
    text_embedding = get_text_embeddings_bert(transcription)
    
    # Combina embeddings
    combined = np.concatenate([visual_embedding, text_embedding])  # 512 + 768 = 1280 dimensões
    all_data.append({
        "features": combined,
        "target": row["target"],
        "video_path": row["video_path"]
    })
    
    print(f"Processando: {row['video_path']}")

# Criar DataFrame final
df = pd.DataFrame(all_data)

# Duplicar vídeos falsos até ter pelo menos 5 amostras (se necessário)
false_samples = df[df['target'] == 0]
if len(false_samples) < 5:
    df = pd.concat([df, false_samples.sample(5 - len(false_samples), replace=True)])

# Preparar features e rótulos
X = np.vstack(df["features"])
y = df["target"]

# Balanceamento com SMOTE, ajustando k_neighbors
smote = SMOTE(k_neighbors=4, random_state=42)  # Ajustado para 4 vizinhos
X_res, y_res = smote.fit_resample(X, y)

# Divisão Treino-Teste
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

print("Balanceamento concluído com sucesso!")
print(f"Tamanho do conjunto balanceado: {X_res.shape[0]} amostras")

Processando: modified_Video1.mp4
Processando: modified_Video8.mp4
Processando: Video2.mp4
Processando: Video3.mp4
Processando: Video4.mp4
Processando: Video5.mp4
Processando: Video6.mp4
Processando: Video7.mp4
Processando: Video9.mp4
Processando: Video10.mp4
Processando: Video11.mp4
Processando: Video12.mp4
Processando: Video13.mp4
Processando: Video14.mp4
Processando: Video15.mp4
Balanceamento concluído com sucesso!
Tamanho do conjunto balanceado: 26 amostras


In [12]:
# Treinar XGBoost
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    max_depth=5,
    learning_rate=0.01,
    random_state=42
)
model.fit(X_train, y_train)

# Avaliação
y_pred = model.predict(X_test)
print("\n=== Relatório de Classificação ===")
print(classification_report(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_pred))

# Ajuste de Threshold (opcional)
y_probs = model.predict_proba(X_test)[:, 1]
optimal_threshold = 0.4  # Ajuste conforme necessário (ex.: via curva ROC)
y_pred_adj = (y_probs > optimal_threshold).astype(int)
print("\n=== Relatório com Threshold Ajustado (0.4) ===")
print(classification_report(y_test, y_pred_adj))
print("AUC-ROC (ajustado):", roc_auc_score(y_test, y_pred_adj))

# Salvar resultados
results_df = pd.DataFrame({
    "video_path": df["video_path"],
    "target": df["target"],
    "features": list(X)  # embeddings combinados
})
results_df.to_csv(os.path.join(OUTPUT_DIR, "results_with_features.csv"), index=False)


=== Relatório de Classificação ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

AUC-ROC: 1.0

=== Relatório com Threshold Ajustado (0.4) ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

AUC-ROC (ajustado): 1.0


In [13]:
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print(f"Threshold ótimo: {optimal_threshold}")

Threshold ótimo: 0.8626664280891418


## Carregamento de metadados com target (Luan analise essa parte para combinar com a extracão de embbedings)

In [14]:
from collections import Counter
print("Distribuição antes do SMOTE:", Counter(y))

NameError: name 'y' is not defined

In [None]:
df = pd.DataFrame(X)
df['target'] = y

# Duplica os vídeos falsos até ter pelo menos 5 amostras
false_samples = df[df['target'] == 0]
df = pd.concat([df, false_samples.sample(3, replace=True)])  # Adiciona 3 cópias

X_augmented = df.drop(columns=['target'])
y_augmented = df['target']

smote = SMOTE(random_state=42, k_neighbors=1)
X_res, y_res = smote.fit_resample(X_augmented, y_augmented)

In [9]:
# Carrega metadados (rótulos manuais)
metadata = pd.read_csv("data/dataset/metadata.csv")  # Arquivo com colunas: [video_path, target]

# Processa cada vídeo
all_data = []

for idx, row in metadata.iterrows():
    video_path = os.path.join(VIDEO_DIR, row["video_path"])
    
    # Processamento de vídeo
    frames = extract_frames(video_path)
    visual_embedding = get_visual_embeddings(frames)
    
    # Processamento de áudio
    transcription = transcribe_audio(video_path)
    text_embedding = get_text_embeddings_bert(transcription)
    
    # Combina embeddings
    combined = np.concatenate([visual_embedding, text_embedding])
    all_data.append({
        "features": combined,
        "target": row["target"]
    })

    print("Em processamento")

# Cria DataFrame final
df = pd.DataFrame(all_data)
false_samples = df[df['target'] == 0]
df = pd.concat([df, false_samples.sample(5, replace=True)])
X = np.vstack(df["features"])
y = df["target"]

# Balanceamento com SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Treino-Teste Split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2)

# Treina XGBoost
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    max_depth=5,
    learning_rate=0.01
)
model.fit(X_train, y_train)

# Avaliação
y_pred = model.predict(X_test)
print("Relatório de Classificação:\n", classification_report(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_pred))

# Ajuste de Threshold (Opcional)
y_probs = model.predict_proba(X_test)[:, 1]
optimal_threshold = 0.4  # Definir via curva ROC
y_pred_adj = (y_probs > optimal_threshold).astype(int)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      0.67      0.80         3

    accuracy                           0.83         6
   macro avg       0.88      0.83      0.83         6
weighted avg       0.88      0.83      0.83         6

AUC-ROC: 0.8333333333333333
