# Pipeline
This pipeline downloads the audio of a Dutch YouTube video, transcribes it, translates the transcript to English, classifies the emotion of each sentence, and saves the results to a CSV file with per-sentence timestamps.

In [10]:
# Import necessary libraries
import os
import csv
import json
import torch
import whisper
import requests
import pandas as pd
from transformers import (
    MarianMTModel,
    MarianTokenizer
)
from datetime import datetime
from pytubefix import YouTube

In [11]:
# Connect to the API
# The bearer token is a secret, so it is read from the environment instead of being
# stored in the notebook. Set EMOTION_API_TOKEN before starting the kernel.
# NOTE: a key that was previously committed in this cell should be treated as
# exposed and rotated.
TOKEN = os.environ.get("EMOTION_API_TOKEN", "")
# Base URL without a trailing slash: request paths are appended as "/api/...",
# so a trailing slash here would produce "//api/..." in the final URL.
# EMOTION_API_BASE optionally overrides the default server.
API_BASE = os.environ.get("EMOTION_API_BASE", "http://194.171.191.228:30080").rstrip("/")
if not TOKEN:
    print("Warning: EMOTION_API_TOKEN is not set; emotion API requests will not be authorized.")

In [12]:
# Define emotion mapping
# The seven main labels the classifier is instructed to choose from.
emotions = ["neutral", "happiness", "anger", "fear", "sadness", "disgust", "surprise"]
# Plain-text lookup (a string, not a dict) that lists finer-grained emotion words
# under each main label; it is interpolated verbatim into EMO_PROMPT below.
emotion_mapping = """
    happiness: "joy", "optimism", "approval", "pride", "gratitude", "confidence",
                "satisfaction", "hope", "love", "excitement", "caring", "relief", "admiration",
                "amusement", "anticipation", "encouragement", "desire", "happiness"
    sadness: "disappointment", "nostalgia", "remorse", "pain", "stress", "regret",
              "resignation", "despair", "confusion", "uncertainty", "sadness"
    anger: "anger", "annoyance", "disapproval", "frustration", "disbelief",
            "warning", "rejection"
    disgust: "disgust"
    fear: "fear", "nervousness", "worry", "anxiety", "doubt", "insecurity", 
           "urgency", "panic"
    surprise: "surprise", "realization", "shock", "amazement", "wonder"
    neutral: "neutral", "mixed", "trust", "agreement", "instruction", "suggestion",
              "request", "confirmation", "acceptance", "reassurance", "clarification",
              "understanding", "certainty", "curiosity"
"""

# Instruction text used as the system message of each classification request.
# This is an f-string evaluated once, here: `emotions` is rendered with its Python
# list repr and `emotion_mapping` is inserted as-is, so later changes to either
# variable do not affect the prompt unless this cell is re-run.
EMO_PROMPT = f"""You will analyze sentences and classify emotions that best relate to the sentence. Please follow these steps:
    1. Identify the primary emotion expressed in the sentence. 
    2. Classify the sentences into the main emotions: {emotions}
    3. If trouble classifying refer to this mapping: {emotion_mapping}
    4. Only provide the MAIN emotion as the answer.
    5. You are NOT allowed to use more than one word as the answer.
    6. Do not give me the reasoning.
    7. If the sentence is unclear or the emotion is not strong, return "neutral".
    8. Only these 7 types of outputs are allowed: {emotions}

    Here is the sentence to analyze:
"""

In [None]:
def youtube_to_audio(youtube_url, output_path="."):
    """
    Download the audio track of a YouTube video and store it with an .mp3 extension.

    The stream is saved as downloaded and then only renamed; no re-encoding is
    performed, so the file keeps the container format of the downloaded stream
    even though its name ends in ".mp3".

    Args:
        youtube_url (str): YouTube video URL.
        output_path (str): Directory in which the file is stored (default: ".").
    Returns:
        str: Path to the renamed .mp3 file.
    Raises:
        ValueError: If the video offers no audio-only stream.
    """
    youtube = YouTube(youtube_url)
    audio_stream = youtube.streams.filter(only_audio=True).first()
    if audio_stream is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise ValueError(f"No audio-only stream found for {youtube_url}")

    downloaded_file = audio_stream.download(output_path=output_path)
    mp3_file = os.path.splitext(downloaded_file)[0] + ".mp3"
    # os.replace overwrites a file left behind by an earlier run on every platform,
    # whereas os.rename raises on Windows when the destination already exists.
    os.replace(downloaded_file, mp3_file)
    # Report the path that actually exists after the rename.
    print(f"Audio downloaded to {mp3_file}")
    return mp3_file

In [None]:
def format_timestamp(seconds: float) -> str:
    """
    Convert seconds to SRT timestamp format (HH:MM:SS,mmm).

    The value is rounded to whole microseconds and the sub-millisecond part is
    then dropped. Hours are not wrapped at 24, so long recordings keep counting
    (e.g. 90000 s -> "25:00:00,000"). Negative input is clamped to zero.

    Args:
        seconds (float): Timestamp in seconds.
    Returns:
        str: Formatted timestamp (e.g., "00:00:02,000").
    """
    # Integer arithmetic avoids the deprecated datetime.utcfromtimestamp and its
    # wrap-around after 24 hours.
    total_ms = max(0, round(float(seconds) * 1_000_000) // 1000)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

def audio_transcription(input_file: str,
                        output_path: str = "transcription.csv",
                        language: str = "nl",
                        model_name: str = "large-v3"):
    """
    Transcribe an audio file with Whisper and save the segments with timestamps.

    Args:
        input_file (str): Path to the input audio file.
        output_path (str): Path to the output CSV file (default: "transcription.csv").
        language (str): Language code passed to Whisper (e.g., "nl" for Dutch).
        model_name (str): Whisper checkpoint to load (default: "large-v3"). A smaller
            checkpoint can be passed for quick test runs.
    Returns:
        dict: Full transcription result from Whisper.
    """
    model = whisper.load_model(model_name)
    transcription = model.transcribe(input_file, language=language)

    # Write one CSV row per Whisper segment: start, end, stripped text.
    # newline='' lets the csv module control line endings itself.
    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Start Time", "End Time", "Sentence"])

        for segment in transcription['segments']:
            start_time = format_timestamp(segment['start'])
            end_time = format_timestamp(segment['end'])
            sentence = segment['text'].strip()
            writer.writerow([start_time, end_time, sentence])
    print(f"Transcription with timestamps saved to {output_path}")
    return transcription

In [None]:
def load_model(model_path):
    """
    Build the MarianMT tokenizer/model pair used for translation.

    The model is moved to the GPU when torch reports one as available and is
    kept on the CPU otherwise.

    Args:
        model_path (str): Path or name of the translation model.
    Returns:
        tuple: (tokenizer, model)
    """
    tokenizer = MarianTokenizer.from_pretrained(model_path)
    translator = MarianMTModel.from_pretrained(model_path)

    # Pick the execution device once, then place the weights on it.
    if torch.cuda.is_available():
        target = torch.device("cuda")
    else:
        target = torch.device("cpu")
    translator.to(target)

    return tokenizer, translator
    
def audio_translation(text, tokenizer, model):
    """
    Translate text using MarianMT model.

    Empty or whitespace-only input is returned as an empty string without
    calling the model, so blank transcript segments do not produce invented
    translations.

    Args:
        text (str): Text to translate.
        tokenizer: MarianMT tokenizer.
        model: MarianMT model.
    Returns:
        str: Translated text ("" for empty input).
    """
    if not text or not text.strip():
        return ""

    # truncation=True cuts input that exceeds the tokenizer's maximum length.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        translated_ids = model.generate(**inputs)
    return tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    
def emotion_classification(sentence, timeout=60):
    """
    Classify emotion of a sentence using the API.

    The sentence is sent as the user message with EMO_PROMPT as the system
    message. The reply is normalised (lower-cased, surrounding punctuation
    removed) and accepted only when it names exactly one of the labels in
    `emotions`; anything else counts as a failed classification.

    Args:
        sentence (str): Sentence to classify.
        timeout (float): Seconds to wait for the API before giving up (default: 60).
    Returns:
        str: Classified emotion. If classification fails, returns 'neutral'.
    """
    # API_BASE may or may not end with a slash; make sure the path has exactly one.
    url = f"{API_BASE.rstrip('/')}/api/chat/completions"
    headers = {
        'Authorization': f'Bearer {TOKEN}',
        'Content-Type': 'application/json'
    }
    payload = {
        "model": "llama3.2:3b",
        "messages": [{"role": "system", "content": EMO_PROMPT},
                     {"role": "user", "content": sentence}],
    }

    try:
        # Without a timeout a stalled server would block the pipeline indefinitely.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as error:
        print(f"Emotion API request failed ({error}); falling back to 'neutral'")
        return 'neutral'

    if response.status_code != 200:
        print(f"Emotion API returned HTTP {response.status_code}; falling back to 'neutral'")
        return 'neutral'

    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, LookupError, TypeError):
        # Body was not JSON, or did not have the expected choices/message/content shape.
        return 'neutral'
    if not isinstance(content, str):
        return 'neutral'

    # Accept answers such as "Happiness." or "The emotion is happiness", but only
    # when exactly one allowed label is mentioned.
    words = {word.strip("\"'`*.,:;!?()[]") for word in content.lower().split()}
    mentioned = [label for label in emotions if label in words]
    return mentioned[0] if len(mentioned) == 1 else 'neutral'

def save_transcription(sentences, translations, emotions, output_file="Pipeline.csv"):
    """
    Write the per-sentence results to a CSV file.

    The resulting table has the columns Start_Time, End_Time, Sentence,
    Translation and Emotion, in that order, and is written without an index.

    Args:
        sentences (list): List of tuples (start, end, sentence).
        translations (list): List of translated sentences.
        emotions (list): List of classified emotions.
        output_file (str): Output CSV file path (default: "Pipeline.csv").
    Returns:
        str: Path to the saved CSV file.
    """
    base_columns = ["Start_Time", "End_Time", "Sentence"]
    table = pd.DataFrame(sentences, columns=base_columns).assign(
        Translation=translations,
        Emotion=emotions,
    )
    table.to_csv(output_file, index=False)
    return output_file

In [None]:
def pipeline(youtube_url: str, model_path: str, output_csv: str = "pipeline_output.csv"):
    """
    Run the complete YouTube-to-CSV workflow for a single video.

    Stages, in order:
    - download the audio track of the video
    - transcribe it with Whisper (this also writes the transcription CSV)
    - translate every transcribed sentence
    - classify the emotion of every sentence
    - write sentences, translations and emotions to one CSV file

    Args:
        youtube_url (str): YouTube video URL.
        model_path (str): Path to translation model.
        output_csv (str): Final output CSV path.
    """
    # Stage 1: fetch the audio
    audio_path = youtube_to_audio(youtube_url)

    # Stage 2: speech-to-text
    whisper_result = audio_transcription(audio_path)

    # Stage 3: one (start, end, text) tuple per Whisper segment
    sentences = []
    for segment in whisper_result['segments']:
        start = format_timestamp(segment['start'])
        end = format_timestamp(segment['end'])
        sentences.append((start, end, segment['text'].strip()))

    # Stage 4: translation model
    tokenizer, model = load_model(model_path)

    # Stage 5: translate everything first, then classify everything
    translations = []
    for row in sentences:
        translations.append(audio_translation(row[2], tokenizer, model))

    # NOTE(review): the classifier receives the original transcribed sentence,
    # not its translation - confirm this is intended.
    emotion_labels = []
    for row in sentences:
        emotion_labels.append(emotion_classification(row[2]))

    # Stage 6: persist the combined table
    saved_path = save_transcription(sentences, translations, emotion_labels, output_csv)
    print(f"Transcription and classification saved to file {saved_path}")

In [17]:
# Usage
youtube_url = "https://youtu.be/TWIgWhJZJgo?si=JI6F7-VQyhIpxkHr"
# Handed to load_model() as the MarianMT model path or name
model_path = "translater"

# Run the full pipeline: download, transcribe, translate, classify, save to CSV
pipeline(youtube_url, model_path)

Audio downloaded to /home/y3/'ZEIKENDE' buurvrouw 'probeert PARKEERTUIG AAN TE RIJDEN!' | Mr. Frank Visser doet uitspraak #AFL123.m4a
Transcription with timestamps saved to transcription.csv
Transcription and classification saved to file pipeline_output.csv
