In [2]:
pip install opencv-python moviepy openai-whisper torch torchvision transformers xgboost imbalanced-learn pandas numpy scikit-learn

Note: you may need to restart the kernel to use updated packages.


## Importação das bibliotecas

In [10]:
import os
import cv2
import whisper
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from transformers import AutoTokenizer, AutoModel

# Configurações
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
VIDEO_DIR = "data/dataset/videos"
DATASET_PATH = "data/dataset_final.csv"

SyntaxError: invalid syntax (1751603475.py, line 12)

## Processamento de Vídeo

In [4]:
import cv2
from PIL import Image
import torch
import clip

def extract_frames(video_path, frame_rate=1):
    """Extrai frames de um vídeo usando OpenCV"""
    cap = cv2.VideoCapture(video_path)
    frames = []
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps / frame_rate)
    
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if len(frames) % frame_interval == 0:
            frames.append(frame)
    cap.release()
    return frames

def transcribe_audio(video_path):
    """Transcreve áudio com Whisper"""
    model = whisper.load_model("base", device=DEVICE)
    result = model.transcribe(video_path)
    return result["text"]

## Extração de Embeddings

In [5]:
import torch.nn as nn

def get_visual_embeddings(frames):
    """Gera embeddings visuais com CLIP e projeta para o espaço de 768 dimensões"""
    model, preprocess = clip.load("ViT-B/32", device=DEVICE)
    embeddings = []
    
    for frame in frames:
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(frame_rgb)
        image_tensor = preprocess(pil_image).unsqueeze(0).to(DEVICE)
        
        with torch.no_grad():
            embedding = model.encode_image(image_tensor)
            embeddings.append(embedding.cpu().numpy().squeeze())
    
    # Média dos embeddings dos frames
    visual_embedding = np.mean(embeddings, axis=0)
    
    # Projeção para 768 dimensões
    projection_layer = nn.Linear(512, 768)
    visual_embedding = torch.tensor(visual_embedding).unsqueeze(0).to(DEVICE)
    visual_embedding = projection_layer(visual_embedding).squeeze()
    
    # Desanexa o tensor e move para a CPU antes de converter para NumPy
    visual_embedding_np = visual_embedding.detach().cpu().numpy()
    
    return visual_embedding_np



def get_text_embeddings_bert(text, model_name='neuralmind/bert-base-portuguese-cased'):
    """Gera embeddings textuais com BERT"""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=512
    )
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [6]:
import pandas as pd

# Lista de vídeos
videos = [
    ("modified_Video1.mp4", 0),
    ("modified_Video8.mp4", 0)
]

# Adiciona os 13 vídeos originais (target 1)
for i in range(2, 16):
    if i != 8:  # Pula o vídeo 8, já que ele está na lista de modificados
        videos.append((f"Video{i}.mp4", 1))

# Cria DataFrame
df = pd.DataFrame(videos, columns=["video_path", "target"])

# Salva o CSV
df.to_csv("data/dataset/metadata.csv", index=False)

## Carregamento de metadados com target (Luan analise essa parte para combinar com a extracão de embbedings)

In [9]:
# Carrega metadados (rótulos manuais)
metadata = pd.read_csv("data/dataset/metadata.csv")  # Arquivo com colunas: [video_path, target]

# Processa cada vídeo
all_data = []

for idx, row in metadata.iterrows():
    video_path = os.path.join(VIDEO_DIR, row["video_path"])
    
    # Processamento de vídeo
    frames = extract_frames(video_path)
    visual_embedding = get_visual_embeddings(frames)
    
    # Processamento de áudio
    transcription = transcribe_audio(video_path)
    text_embedding = get_text_embeddings_bert(transcription)
    
    # Combina embeddings
    combined = np.concatenate([visual_embedding, text_embedding])
    all_data.append({
        "features": combined,
        "target": row["target"]
    })

    print("Em processamento")

# Cria DataFrame final
df = pd.DataFrame(all_data)

# Duplica os vídeos falsos até ter pelo menos 5 amostras
false_samples = df[df['target'] == 0]
df = pd.concat([df, false_samples.sample(5, replace=True)])


X = np.vstack(df["features"])
y = df["target"]

# Balanceamento com SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Treino-Teste Split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2)

# Treina XGBoost
model = XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    max_depth=5,
    learning_rate=0.01
)
model.fit(X_train, y_train)

# Avaliação
y_pred = model.predict(X_test)
print("Relatório de Classificação:\n", classification_report(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_pred))

# Ajuste de Threshold (Opcional)
y_probs = model.predict_proba(X_test)[:, 1]
optimal_threshold = 0.4  # Definir via curva ROC
y_pred_adj = (y_probs > optimal_threshold).astype(int)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Em processamento


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       1.00      0.67      0.80         3

    accuracy                           0.83         6
   macro avg       0.88      0.83      0.83         6
weighted avg       0.88      0.83      0.83         6

AUC-ROC: 0.8333333333333333
