In [None]:
import librosa
import os

import numpy as np
import matplotlib.pyplot as plt

import cv2
import numpy as np

import subprocess

import whisper

: 

In [None]:
def get_video(name):
    """Return the relative path of a video in the uploads directory.

    Args:
        name: File name of the video (string).

    Returns:
        The string ``"../uploads/"`` followed by ``name``.
    """
    uploads_prefix = "../uploads/"
    return uploads_prefix + name

# data creation
def extract_audio(name):
    """Plot the audio level of an uploaded video over time.

    The audio is loaded at its native sample rate, the RMS level is computed
    over 100 ms frames (hop length equal to the frame length), and the result
    is plotted on a dB scale.

    Args:
        name: File name of the video; resolved to a path with ``get_video``.

    Returns:
        None. The plot is shown with ``plt.show()``.
    """
    y, sr = librosa.load(get_video(name), sr=None)  # sr=None keeps the native rate
    frame_length = int(sr * 0.1)  # 100ms frames
    hop_length = frame_length

    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
    times = librosa.frames_to_time(range(len(rms)), sr=sr, hop_length=hop_length)

    # Digitally silent frames have rms == 0, and log10(0) is -inf (plus a
    # RuntimeWarning and a gap in the curve). Floor the level at -100 dB.
    min_rms = 1e-5
    volume_db = 20 * np.log10(np.maximum(rms, min_rms))

    # Plot
    plt.plot(times, volume_db)  # dB scale
    plt.xlabel("Time (s)")
    plt.ylabel("Volume (dB)")
    plt.title("Audio Levels Over Time")
    plt.show()
    
# Directory that extract_frames writes PNG frames into and that
# get_frame_brightness reads them back from.
frame_dir = "frames"
# Create it up front; exist_ok=True keeps re-running this cell harmless.
os.makedirs(frame_dir, exist_ok=True)

def extract_frames(name):
    """Dump frames of a video into ``frame_dir`` as PNG images at 10 fps.

    Output files are named ``<basename of name>_frame_NNNN.png``.

    Args:
        name: Path to the video file, passed to ffmpeg unchanged.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # ffmpeg interprets '%' in the output name as a sequence pattern, so a
    # literal '%' coming from the video's file name has to be doubled.
    prefix = os.path.basename(name).replace("%", "%%")
    output_pattern = os.path.join(frame_dir, f"{prefix}_frame_%04d.png")

    subprocess.run(
        [
            "ffmpeg", "-y",  # overwrite frames left by an earlier run
            "-i", name,
            "-vf", "fps=10",
            output_pattern,
        ],
        check=True,  # surface ffmpeg failures instead of continuing silently
    )

def get_frame_brightness(name):
    """Return the mean grayscale brightness of each extracted frame of a video.

    Only files in ``frame_dir`` named ``<basename of name>_frame_<digits>.png``
    (the naming used by ``extract_frames``) are considered, so frames that
    belong to other videos are not mixed in. Frames are ordered by their
    numeric index rather than by file name: with the ``%04d`` pattern, a
    lexicographic sort misplaces frame 10000 and above.

    Args:
        name: Name or path of the video; only its basename is used.

    Returns:
        A list with one brightness value per frame, in frame order. A frame
        that cannot be decoded contributes ``nan`` so that list positions
        still line up with frame indices.
    """
    prefix = f"{os.path.basename(name)}_frame_"
    suffix = ".png"

    indexed_frames = []
    for fname in os.listdir(frame_dir):
        if not (fname.startswith(prefix) and fname.endswith(suffix)):
            continue
        index_text = fname[len(prefix):-len(suffix)]
        if index_text.isdigit():
            indexed_frames.append((int(index_text), fname))
    indexed_frames.sort()

    brightness_data = []
    for _, fname in indexed_frames:
        img = cv2.imread(os.path.join(frame_dir, fname))
        if img is None:
            # cv2.imread returns None (it does not raise) for unreadable files.
            brightness_data.append(np.nan)
            continue
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        brightness_data.append(np.mean(gray))

    return brightness_data
        
# Video analysed by this cell. Assigned here so the cell runs on a fresh
# kernel instead of relying on a `name` left over from earlier execution.
name = "[Kayoanime] Solo Leveling - S02E06.mkv"

# Read the frames once and reuse the result for both axes. The frames must
# already be present in frame_dir (they are produced by extract_frames).
brightness = get_frame_brightness(name)
# extract_frames samples at fps=10, so consecutive frames are 0.1 s apart.
timestamps = [i * 0.1 for i in range(len(brightness))]
plt.plot(timestamps, brightness)
plt.xlabel("Time (s)")
plt.ylabel("Brightness")
plt.title("Frame Brightness Over Time")
plt.grid(True)
plt.show()


extract_audio(name)

In [None]:
# Dialogue detection and anger detection
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import datetime
import csv

input_video = "input.mkv"
output_audio = "audio.wav"
anger_threshold = 0.3  # a segment is kept when its anger probability exceeds this
merge_gap = 3  # segments at most this many seconds apart are merged into one section
csv_filename = "angry_sections.csv"

# Extract audio as 16 kHz mono WAV for Whisper.
# check=True raises CalledProcessError when ffmpeg fails; without it a stale
# audio.wav from an earlier run would be transcribed silently.
subprocess.run(
    [
        "ffmpeg", "-y", "-i", input_video,
        "-vn",                      # No video
        "-acodec", "pcm_s16le",     # 16-bit PCM audio
        "-ar", "16000",             # 16kHz sampling rate
        "-ac", "1",                 # Mono
        output_audio,
    ],
    check=True,
)

# Whisper model: transcribe the extracted audio track
model = whisper.load_model("base")  # change model size when tuning
result = model.transcribe(output_audio)

print("🔍 Loading emotion classifier...")
# Tokenizer and classifier are loaded from the same checkpoint.
emotion_checkpoint = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)
# Assumed to match the order of the classifier's output classes —
# TODO confirm against the model card.
labels = ["anger", "joy", "optimism", "sadness"]

# Score every transcribed segment and keep those classified as angry
angry_segments = []
anger_index = labels.index("anger")  # loop-invariant, so look it up once

# The Whisper output is bound to `result`; the name `transcription` that this
# loop used to iterate was never defined and raised NameError.
for seg in result["segments"]:
    text = seg["text"]
    start = seg["start"]
    end = seg["end"]

    # A blank segment carries no dialogue, so a score for it is meaningless.
    if not text.strip():
        continue

    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        logits = emotion_model(**inputs).logits
        probs = F.softmax(logits, dim=1)[0]

    anger_score = probs[anger_index].item()

    if anger_score > anger_threshold:
        angry_segments.append({
            "start": start,
            "end": end,
            "text": text.strip(),
            "anger_score": anger_score
        })

# Group angry segments: a segment joins the current group when it starts at
# most `merge_gap` after the end of that group's last segment; otherwise it
# opens a new group.
grouped_angry_sections = []

for candidate in angry_segments:
    if grouped_angry_sections:
        last_in_group = grouped_angry_sections[-1][-1]
        if candidate["start"] - last_in_group["end"] <= merge_gap:
            grouped_angry_sections[-1].append(candidate)
            continue
    grouped_angry_sections.append([candidate])

# Rank groups and print
def sec_to_mmss(sec):
    """Format a number of seconds as a ``datetime.timedelta`` string.

    The value is truncated to whole seconds with ``int`` first, so the result
    has the form ``H:MM:SS`` (for example ``65.9`` becomes ``"0:01:05"``).

    Args:
        sec: Duration in seconds (int or float).

    Returns:
        The string form of the corresponding ``timedelta``.
    """
    whole_seconds = int(sec)
    elapsed = datetime.timedelta(seconds=whole_seconds)
    return str(elapsed)

# Summarise each group (first start, last end, mean anger score) and order
# the summaries from the highest to the lowest average anger.
ranked_sections = sorted(
    (
        {
            "start": group[0]["start"],
            "end": group[-1]["end"],
            "avg_anger": sum(s["anger_score"] for s in group) / len(group),
            "segments": group,
        }
        for group in grouped_angry_sections
    ),
    key=lambda section: section["avg_anger"],
    reverse=True,
)

# Report the highest-ranked sections together with their member segments
print("\n🔥 Angriest Sections in the Video:\n")
top_sections = ranked_sections[:5]  # Get top 5 section, hopefully enough?
for rank, section in enumerate(top_sections, start=1):
    section_start = sec_to_mmss(section['start'])
    section_end = sec_to_mmss(section['end'])
    print(f"🔴 Section {rank}")
    print(f"[{section_start} → {section_end}] | Avg Anger Score: {section['avg_anger']:.2f}\n")

    for seg in section["segments"]:
        seg_start = sec_to_mmss(seg['start'])
        seg_end = sec_to_mmss(seg['end'])
        print(f"  🕒 [{seg_start} → {seg_end}] | Anger: {seg['anger_score']:.2f}")
        print(f"  🗣️  {seg['text']}\n")

    print("------------------------------------------------------------\n")

# Export the top five sections to CSV, one row per segment; the section-level
# columns are repeated on every row of that section.
csv_header = [
    "Section Number",
    "Section Start",
    "Section End",
    "Section Avg Anger",
    "Segment Start",
    "Segment End",
    "Segment Anger Score",
    "Segment Text",
]

with open(csv_filename, mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(csv_header)

    for section_number, section in enumerate(ranked_sections[:5], start=1):
        section_cells = [
            section_number,
            sec_to_mmss(section["start"]),
            sec_to_mmss(section["end"]),
            f"{section['avg_anger']:.2f}",
        ]
        writer.writerows(
            section_cells + [
                sec_to_mmss(seg["start"]),
                sec_to_mmss(seg["end"]),
                f"{seg['anger_score']:.2f}",
                seg["text"].replace('\n', ' ').strip(),
            ]
            for seg in section["segments"]
        )