In [3]:
!pip install transformers pydub speechrecognition torch

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting speechrecognition
  Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydub, speechrecognition
Successfully installed pydub-0.25.1 speechrecognition-3.11.0


In [None]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import speech_recognition as sr
from pydub import AudioSegment
from pydub.playback import play
import time
import os

# Load the pre-trained sentiment model and wrap it in a transformers pipeline.
# The pipeline result for one text is a list like
# [{'label': '5 stars', 'score': 0.48}]; generate_actionable_feedback() below
# dispatches on the 'label' strings ('1 star', '2 stars', '4 stars', '5 stars').
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# Tokenizer and classification head are both resolved from the same model id.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Module-level callable used by analyze_sentiment_in_real_time().
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Speech-to-Text function
def transcribe_audio_chunk(audio_chunk, temp_file="temp_audio_chunk.wav"):
    """Transcribe one audio chunk to text using Google's speech recognition.

    The chunk is exported to ``temp_file`` as WAV, read back through
    ``speech_recognition`` and passed to ``recognize_google``. The scratch
    file is always removed before the function exits, including when the
    export itself fails part-way.

    Args:
        audio_chunk: Object exposing ``export(path, format=...)``; in this
            notebook the caller passes a slice of a pydub ``AudioSegment``.
        temp_file: Path of the scratch WAV file. The default is a fixed name
            in the working directory, so concurrent calls sharing the default
            would overwrite each other's file.

    Returns:
        The recognised text, or ``None`` when the speech could not be
        understood (``sr.UnknownValueError``) or the recognition request
        failed (``sr.RequestError``; the error is printed).

    Raises:
        Any exception raised by ``audio_chunk.export`` or by opening the
        exported file propagates to the caller (after cleanup).
    """
    recognizer = sr.Recognizer()

    try:
        # Export inside the try block so the finally clause also cleans up a
        # partially written file when export() raises.
        audio_chunk.export(temp_file, format="wav")

        with sr.AudioFile(temp_file) as source:
            audio_data = recognizer.record(source)

        # The audio is fully buffered in audio_data, so the file can be closed
        # before the (potentially slow) recognition request is made.
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        # Speech was unintelligible; the caller treats None as "skip".
        return None
    except sr.RequestError as e:
        print(f"Error with Google API: {e}")
        return None
    finally:
        if os.path.exists(temp_file):
            os.remove(temp_file)

# Real-time sentiment analysis
def analyze_sentiment_in_real_time(audio_file, chunk_size=10):
    """Walk through a WAV file chunk by chunk, scoring the sentiment of each.

    Every chunk is transcribed with ``transcribe_audio_chunk``, scored with
    the module-level ``sentiment_pipeline`` and turned into an optional
    coaching hint by ``generate_actionable_feedback``. After each chunk the
    function sleeps for the chunk's duration and then plays the chunk.

    Chunk boundaries are computed in integer milliseconds, so:
      * audio shorter than ``chunk_size`` is processed as a single chunk;
      * the final partial chunk (e.g. the last 0.832 s of a 120.832 s file)
        is processed rather than dropped;
      * empty audio yields two empty lists.

    Args:
        audio_file: Path to a WAV file (loaded with ``AudioSegment.from_wav``).
        chunk_size: Chunk length in seconds; must be positive. Fractional
            values are accepted.

    Returns:
        ``(sentiment_history, actionable_insights)``: the raw pipeline result
        for every chunk that produced a transcript, and the list of non-empty
        feedback strings.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of seconds")

    audio = AudioSegment.from_wav(audio_file)
    total_duration = audio.duration_seconds
    # pydub segments report their length, and are sliced, in milliseconds.
    total_ms = len(audio)
    chunk_ms = max(1, int(round(chunk_size * 1000)))

    sentiment_history = []
    actionable_insights = []

    print(f"Total audio duration: {total_duration} seconds")

    for start_ms in range(0, total_ms, chunk_ms):
        # Clamp so the last chunk never claims to extend past the audio.
        end_ms = min(start_ms + chunk_ms, total_ms)
        print(f"Processing from {start_ms / 1000:g} to {end_ms / 1000:g} seconds...")

        # Extract and transcribe
        audio_chunk = audio[start_ms:end_ms]
        text_chunk = transcribe_audio_chunk(audio_chunk)

        if text_chunk:
            # Only the first 512 characters of the transcript are scored.
            sentiment = sentiment_pipeline(text_chunk[:512])
            sentiment_history.append(sentiment)
            print(f"Sentiment: {sentiment}")

            actionable_cue = generate_actionable_feedback(sentiment)
            if actionable_cue:
                actionable_insights.append(actionable_cue)
                print(f"Actionable Insight: {actionable_cue}")
        else:
            print("Audio Unintelligible. Skipping this segment.")

        # Pace the loop by the real length of this chunk, then play it.
        # NOTE(review): if play() blocks until playback finishes, each chunk
        # costs roughly twice its duration in wall time -- confirm whether
        # both the sleep and the playback are wanted.
        time.sleep((end_ms - start_ms) / 1000)
        play(audio_chunk)

    return sentiment_history, actionable_insights

# Generate feedback
def generate_actionable_feedback(sentiment):
    """Map a sentiment pipeline result to a coaching hint for the agent.

    Args:
        sentiment: Pipeline output for one text, i.e. a list whose first item
            is a dict with a ``'label'`` key such as ``'1 star'`` or
            ``'5 stars'``.

    Returns:
        A de-escalation hint for ``'1 star'``/``'2 stars'``, an encouragement
        for ``'4 stars'``/``'5 stars'``, and ``None`` for anything else
        (including ``'3 stars'`` and an empty result).
    """
    if not sentiment:
        return None

    label = sentiment[0]['label']
    if label in ('1 star', '2 stars'):
        return "Agent should calm the customer and acknowledge concerns."
    # A membership test is required here: `label == '4 stars' or '5 stars'`
    # is always truthy because the bare string '5 stars' is truthy.
    if label in ('4 stars', '5 stars'):
        return "Agent is doing well, continue the positive engagement."
    return None

# Input audio file. It is loaded with AudioSegment.from_wav(), so it must be a
# WAV file.
# NOTE(review): hard-coded absolute path (looks like a Colab /content upload);
# confirm the file is present before running, or make the path configurable.
audio_file = "/content/InboundSampleRecording-VEED.wav"
# Runs the full chunked analysis; returns the per-chunk pipeline results and
# the list of feedback strings that were generated.
sentiment_history, actionable_insights = analyze_sentiment_in_real_time(audio_file)




Total audio duration: 120.832 seconds
Processing from 0 to 10 seconds...
Sentiment: [{'label': '5 stars', 'score': 0.48014259338378906}]
Actionable Insight: Agent is doing well, continue the positive engagement.
Processing from 10 to 20 seconds...
Sentiment: [{'label': '5 stars', 'score': 0.5405098795890808}]
Actionable Insight: Agent is doing well, continue the positive engagement.
Processing from 20 to 30 seconds...
Sentiment: [{'label': '4 stars', 'score': 0.39122387766838074}]
Actionable Insight: Agent is doing well, continue the positive engagement.
Processing from 30 to 40 seconds...
Sentiment: [{'label': '1 star', 'score': 0.5363258123397827}]
Actionable Insight: Agent should calm the customer and acknowledge concerns.
Processing from 40 to 50 seconds...
Sentiment: [{'label': '1 star', 'score': 0.4673415422439575}]
Actionable Insight: Agent should calm the customer and acknowledge concerns.
Processing from 50 to 60 seconds...
Audio Unintelligible. Skipping this segment.
Processi

In [4]:
# Snapshot the currently installed package versions into requirements.txt
# (the file is overwritten by the shell redirect).
!pip freeze > requirements.txt