In [1]:
import os
import torch
import pandas as pd
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
from diffusers import StableDiffusionPipeline
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import librosa

In [2]:
stemmer = PorterStemmer()

In [3]:
def preprocess_text_with_stemming(text):
    """
    Preprocesses the text by applying stemming.
    Parameters:
    - text (str): Input text for stemming.
    
    Returns:
    - stemmed_text (str): Text after stemming.
    """
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

In [4]:
def analyze_sentiment(text, sentiment_pipeline):
    """
    Analyzes the sentiment of the given text.
    Parameters:
    - text (str): Input text for sentiment analysis.
    - sentiment_pipeline (pipeline): Hugging Face sentiment analysis pipeline.
    
    Returns:
    - sentiment (str): Sentiment label (e.g., "positive", "neutral", "negative").
    """
    sentiment_result = sentiment_pipeline(text)
    sentiment = sentiment_result[0]["label"]
    return sentiment

In [5]:
def process_audio_to_image_with_dataset(
    dataset_path,
    whisper_model_path,
    sd_model_path,
    audio_files_directory
):
    """
    Processes an audio dataset to generate transcriptions, analyze sentiments, and optionally create images.
    
    Parameters:
    - dataset_path (str): Path to the dataset CSV file containing 'File_name' and 'phrase' columns.
    - whisper_model_path (str): Path to the fine-tuned Whisper ASR model.
    - sd_model_path (str): Path or Hugging Face model ID for the Stable Diffusion model.
    - audio_files_directory (str): Directory where audio files are located.
    
    Returns:
    - results (list of dicts): A list of results containing transcription, sentiment, and optionally generated images.
    """
    # Load the dataset
    dataset = pd.read_csv(dataset_path)  # Ensure the dataset is structured correctly

    # Step 1: Load the fine-tuned Whisper model using Wav2Vec2ForCTC and Wav2Vec2Processor
    processor = Wav2Vec2Processor.from_pretrained(whisper_model_path)
    model = Wav2Vec2ForCTC.from_pretrained(whisper_model_path)
    
    # Step 2: Load the sentiment analysis model
    sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"  # Pre-trained model
    sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model_name)
    
    # Step 3: Load the Stable Diffusion model
    sd_pipeline = StableDiffusionPipeline.from_pretrained(sd_model_path, torch_dtype=torch.float32)  # CPU mode
    sd_pipeline.to("cpu")  # Ensure it runs on CPU

    results = []

    for _, row in dataset.iterrows():
        audio_file_name = row['File_name']  # Get the file name directly from the dataset
        
        # Construct the full audio file path using the directory
        audio_file_path = os.path.join(audio_files_directory, audio_file_name)
        
        # Check if the file exists before processing
        if os.path.exists(audio_file_path):
            # Transcribe the audio
            audio_data, _ = librosa.load(audio_file_path, sr=16000)  # Load audio file with librosa
            input_values = processor(audio_data, return_tensors="pt").input_values  # Preprocess audio for the model
            logits = model(input_values).logits  # Get model predictions
            
            # Detach the tensor to avoid gradient tracking and convert to NumPy
            transcription_logits = logits.detach().numpy()
            transcription = processor.decode(transcription_logits[0], skip_special_tokens=True)  # Decode the logits to text
            
            # Analyze sentiment
            sentiment = analyze_sentiment(preprocessed_text, sentiment_pipeline)

            # Generate image if sentiment is not negative
            if "negative" not in sentiment.lower():
                generated_image = sd_pipeline(preprocessed_text).images[0]
            else:
                generated_image = None
            
            # Store results
            results.append({
                "File_name": audio_file_name,
                "Transcription": transcription,
                "Preprocessed_Text": preprocessed_text,
                "Sentiment": sentiment,
                "Generated_Image": generated_image
            })
        else:
            print(f"Audio file not found: {audio_file_path}")

    return results

In [6]:
dataset_path = "C:/Users/HP/OneDrive/Desktop/Infosys Springboard/Dataset/Recordings/audio__details.csv"
whisper_model_path = "C:/Users/HP/OneDrive/Desktop/Infosys Springboard/whisper-finetuned"
sd_model_path = "stabilityai/stable-diffusion-2-1"
audio_files_directory = "C:/Users/HP/OneDrive/Desktop/Infosys Springboard/Dataset/Recordings/Train"

In [7]:
if __name__ == "__main__":
    results = process_audio_to_image_with_dataset(dataset_path, whisper_model_path, sd_model_path, audio_files_directory)

    # Display results
    for result in results:
        print(f"File: {result['File_name']}")
        print(f"Transcription: {result['Transcription']}")
        print(f"Preprocessed Text: {result['Preprocessed_Text']}")
        print(f"Sentiment: {result['Sentiment']}")
        if result['Generated_Image']:
            result['Generated_Image'].show()  # Display the generated image
        else:
            print("Image generation skipped due to negative sentiment.")

You are using a model of type whisper to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at C:/Users/HP/OneDrive/Desktop/Infosys Springboard/whisper-finetuned and are newly initialized: ['encoder.layer_norm.bias', 'encoder.layer_norm.weight', 'encoder.layers.0.attention.k_proj.bias', 'encoder.layers.0.attention.k_proj.weight', 'encoder.layers.0.attention.out_proj.bias', 'encoder.layers.0.attention.out_proj.weight', 'encoder.layers.0.attention.q_proj.bias', 'encoder.layers.0.attention.q_proj.weight', 'encoder.layers.0.attention.v_proj.bias', 'encoder.layers.0.attention.v_proj.weight', 'encoder.layers.0.feed_forward.intermediate_dense.bias', 'encoder.layers.0.feed_forward.intermediate_dense.weight', 'encoder.layers.0.feed_forward.output_dense.bias', 'encoder.layers.0.feed_forward.output_dense.weight', 'encoder.layers.0.final_layer_norm.bias', 'e




Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


TypeError: argument 'ids': 'list' object cannot be interpreted as an integer