In [12]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import sys
import assemblyai as aai
import csv
import os
from datetime import datetime
import logging

# Suppress HTTP request logs
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("requests").setLevel(logging.WARNING)

sys.path.append(r'home/y2c/2024-25c-fai2-adsai-group-group_11_2c/Task_2')

In [19]:
### HELPER FUNCTIONS ###

# Loading model and tokenizer
def loading_model_tokenizer(model_path='../Task_5/DistilBERT/dbert_iter4', tokenizer_name="distilbert-base-uncased"):
    # I had to load the model with tensorflow since I had saved it using tf and not transformers
    model = tf.keras.models.load_model(model_path)

    # Since I did not change the tokenizer in the training by adding custom tokens or changing the vocabulary size, it is fine to use the base one from Transformers.
    tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_name)
    
    return model, tokenizer

# Changed task 2 for returning a dataframe without saving it into a csv file
def transcribe_audio_to_df(audio_file_path):
    # Set your API key
    aai.settings.api_key = "fb2df8accbcb4f38ba02666862cd6216"

    # Transcribe audio
    transcriber = aai.Transcriber()
    transcript = transcriber.transcribe(audio_file_path)

    # Extract sentences into a list of dicts
    data = []
    for i, sentence in enumerate(transcript.get_sentences(), 1):
        data.append({
            "Text": sentence.text,
            "Start Time (s)": round(sentence.start / 1000, 2),
            "End Time (s)": round(sentence.end / 1000, 2),
        })

    # Convert to DataFrame
    df = pd.DataFrame(data)
    return df

df = transcribe_audio_to_df('test.mp3')

def tokenize_texts(texts):
    return tokenizer(list(texts), truncation=True, padding=True, return_tensors="tf", max_length=128)

In [62]:
def pipeline(model, audio_file_path):

    valid_models = ['dbert', 'droberta', 'lstm', 'rnn', 'naive_bayes', 'logistic_regression']
    unique_labels = ['neutral', 'surprise', 'disgust', 'sadness', 'happiness', 'anger', 'fear']

    # Initialize and fit the LabelEncoder
    label_encoder = LabelEncoder()
    label_encoder.fit(unique_labels)
    
    # Check if the passed model is valid
    if model not in valid_models:
        raise ValueError(f"Invalid model! Expected one of {valid_models}, but got '{model}'")

    ### 2. TRANSCRIBING AUDIO DATA (TASK 2) ###
    print("Exctracting the transcript of the audio...")
    #df = transcribe_audio_to_df(audio_file_data)

    ### 3. Loading the models based on input
    if model == 'dbert':
        print(f"Initializing {model}...")
        model, tokenizer = loading_model_tokenizer('../Task_5/DistilBERT/dbert_iter4')

        df_tokenized = tokenize_texts(df['Text'].tolist())

        # Prepare the tokenized input as a dictionary for the model
        inputs = {
            'input_ids': df_tokenized['input_ids'],
            'attention_mask': df_tokenized['attention_mask']
        }

        print('Getting the predictions...')

        # Get predictions from the model
        predictions = model.predict(inputs)
        
        # Extract logits from predictions
        logits = predictions['logits']  # Now we can directly access the logits

        # Get the predicted class by finding the maximum logit in each row
        results = np.argmax(logits, axis=-1)

        # Assuming 'results' is the array of predicted labels from your model
        decoded_emotions = label_encoder.inverse_transform(results)
        
        # Map the results to emotion names
        df['Emotion'] = decoded_emotions
        return df
        
    elif model == 'droberta':
        print(f"Initializing {model}...")
        model, tokenizer = loading_model_tokenizer('../Task_5/DistillRoberta/droberta_iter2')
        model = tf.keras.models.load_model('../Task_5/DistillRoberta/droberta_iter2')

        df_tokenized = tokenize_texts(df['Text'].tolist())

        # Prepare the tokenized input as a dictionary for the model
        inputs = {
            'input_ids': df_tokenized['input_ids'],
            'attention_mask': df_tokenized['attention_mask'],
            'token_type_ids': None  # or tf.zeros_like(inputs['input_ids']) if required
        }
    
        print('Getting the predictions...')
        
        # Get predictions from the model
        predictions = model.predict(inputs)
        
        # Extract logits from predictions
        logits = predictions['logits']  # Now we can directly access the logits

        # Get the predicted class by finding the maximum logit in each row
        results = np.argmax(logits, axis=-1)

        # Assuming 'results' is the array of predicted labels from your model
        decoded_emotions = label_encoder.inverse_transform(results)
        
        # Map the results to emotion names
        df['Emotion'] = decoded_emotions
        return df

    elif model == 'lstm':
        print(f"Initializing {model}...")
        model, tokenizer = loading_model_tokenizer(model)

    elif model == 'rnn':
        print(f"Initializing {model}...")
        model, tokenizer = loading_model_tokenizer(model)

    elif model == 'naive_bayes':
        print(f"Initializing {model}...")
        model, tokenizer = loading_model_tokenizer(model)

    else:
        print(f"Initializing {model}...")
        model, tokenizer = loading_model_tokenizer(model)


In [63]:
df = pipeline('droberta', 'test.mp3')
print(df)

Exctracting the transcript of the audio...
Initializing droberta...
Getting the predictions...


ValueError: Failed to find data adapter that can handle input: (<class 'dict'> containing {"<class 'str'>"} keys and {"<class 'NoneType'>", "<class 'tensorflow.python.framework.ops.EagerTensor'>"} values), <class 'NoneType'>