<a href="https://colab.research.google.com/github/akshgit10/hostpitalmngt-ui-react/blob/main/SpeechSynthesisBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

def preprocess_text(text):
    """Normalise a raw text value before tokenisation.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, collapses each run of whitespace into a single
    space and trims both ends.

    Args:
        text: Value to clean. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for an empty cell) is treated as empty.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Strip punctuation *before* collapsing whitespace: removing a
    # free-standing symbol ("a - b") would otherwise leave a double space.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def read_data(file):
    """Load the emotion CSV and return its label and cleaned-text columns.

    Args:
        file: Anything ``pd.read_csv`` accepts. The CSV is indexed by the
            ``Emotion`` and ``Clean_Text`` columns, so both must be present.

    Returns:
        A new DataFrame with the columns ``Emotion`` and ``Clean_Text``, where
        every ``Clean_Text`` value has been passed through ``preprocess_text``.
    """
    df = pd.read_csv(file)
    df['Clean_Text'] = df['Clean_Text'].apply(preprocess_text)
    # Hand back an explicit copy: the caller assigns into the result, and a
    # bare column selection would make pandas emit SettingWithCopyWarning.
    return df[['Emotion', 'Clean_Text']].copy()

# Location of the emotion CSV inside the Colab runtime
file = '/content/emotion_dataset_2.csv'
data = read_data(file)

# Emotion name -> integer class id; the id is the position in the list below
emotion_map = {
    name: class_id
    for class_id, name in enumerate(
        ['joy', 'sadness', 'neutral', 'surprise', 'anger', 'fear', 'shame', 'disgust']
    )
}
data['Emotion'] = data['Emotion'].map(emotion_map)

# Plain Python lists of inputs and targets, in matching row order
texts, labels = (data[column].tolist() for column in ('Clean_Text', 'Emotion'))

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a PyTorch dataset class
class EmotionDataset(Dataset):
    """Map-style dataset that tokenises one text per lookup.

    Stores parallel sequences of texts and labels together with the tokenizer
    used to encode them. Each item is encoded on demand and padded or
    truncated to ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        """Keep references to the samples, the tokenizer and the length cap."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the sample at ``idx``.

        Returns:
            A dict with the flattened ``input_ids`` and ``attention_mask``
            tensors from the tokenizer and ``label`` as a ``torch.long``
            tensor.
        """
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **encode_options)
        # The tokenizer returns a leading batch axis; flatten it away.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Training split: dataset plus a loader that reshuffles every epoch
train_dataset = EmotionDataset(X_train, y_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Validation split: same batch size, default (sequential) order
val_dataset = EmotionDataset(X_val, y_val, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=16)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
!pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->

In [4]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm
import pandas as pd

# Initialize the tokenizer
# (rebinds the module-level `tokenizer` that the first cell already created
# from the same 'bert-base-uncased' checkpoint)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def preprocess_text(text):
    """Return ``text`` unchanged when it is a string, ``''`` otherwise.

    This definition replaces the ``preprocess_text`` from the first cell once
    this cell runs. It keeps that version's guard for non-string input so a
    missing CSV cell (``NaN``) never reaches the tokenizer.

    Args:
        text: Raw value from the ``Text`` column.

    Returns:
        ``text`` itself for ``str`` input, otherwise the empty string.
    """
    # Preprocess text as needed (e.g., lowercasing, removing stop words)
    if not isinstance(text, str):
        return ''
    return text

def create_feature(text):
    """Encode ``text`` with the module-level BERT ``tokenizer``.

    The text is padded or truncated to 128 tokens.

    Args:
        text: The string to tokenise.

    Returns:
        A tuple ``(input_ids, attention_mask)`` holding the two tensors
        produced by ``encode_plus`` after ``squeeze()``.
    """
    encode_options = {
        'add_special_tokens': True,
        'max_length': 128,
        'truncation': True,
        'padding': 'max_length',
        'return_attention_mask': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(text, **encode_options)
    # Drop the size-1 batch axis the tokenizer adds for return_tensors='pt'.
    return tuple(encoded[key].squeeze() for key in ('input_ids', 'attention_mask'))

def convert_label(emotion):
    """Translate an emotion name into its integer class id.

    Args:
        emotion: One of ``joy``, ``sadness``, ``neutral``, ``surprise``,
            ``anger``, ``fear``, ``shame`` or ``disgust``.

    Returns:
        The class id (0-7) assigned to that emotion.

    Raises:
        KeyError: If ``emotion`` is not one of the names above.
    """
    # The position in this tuple is the class id.
    names = ('joy', 'sadness', 'neutral', 'surprise', 'anger', 'fear', 'shame', 'disgust')
    emotion_to_id = dict(zip(names, range(len(names))))
    return emotion_to_id[emotion]

def load_data(file_path):
    """Read the CSV at ``file_path`` and append a ``text`` column.

    Args:
        file_path: Path handed to ``pd.read_csv``; the file needs a ``Text``
            column.

    Returns:
        The loaded DataFrame with an extra ``text`` column holding
        ``preprocess_text`` applied to every ``Text`` value.
    """
    raw = pd.read_csv(file_path)
    return raw.assign(text=raw['Text'].apply(preprocess_text))

def create_dataset(df):
    """Turn a loaded frame into model inputs and integer targets.

    Args:
        df: DataFrame with a ``text`` column (inputs) and an ``Emotion``
            column (label names).

    Returns:
        ``(X, y)`` where ``X`` is a list of ``create_feature`` results, one
        per ``text`` value, and ``y`` is a list of ``convert_label`` results,
        one per ``Emotion`` value.
    """
    features = list(map(create_feature, df['text']))
    targets = list(map(convert_label, df['Emotion']))
    return features, targets
def create_dataloader(X, y, batch_size, shuffle=True):
    """Bundle pre-tokenised samples into a batched ``DataLoader``.

    Args:
        X: Sequence of ``(input_ids, attention_mask)`` tensor pairs. The
            tensors are stacked, so all ``input_ids`` must share one shape
            and likewise all masks.
        y: Sequence of integer labels, parallel to ``X``.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on every pass.
            Defaults to ``True`` (the previous hard-coded behaviour); pass
            ``False`` for evaluation loaders that need a stable order.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)``
        batches.
    """
    input_ids = torch.stack([pair[0] for pair in X])
    attention_masks = torch.stack([pair[1] for pair in X])
    labels = torch.tensor(y)

    # Create PyTorch dataset and dataloader
    dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=3):
    """Fine-tune ``model`` and print train/validation metrics after each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` (when labels are given) and
            ``.logits``.
        train_loader: Loader yielding ``(input_ids, attention_mask, labels)``
            batches for training.
        val_loader: Loader with the same batch layout, used for evaluation.
        optimizer: Optimizer over the model's parameters; stepped once per
            training batch.
        scheduler: Learning-rate scheduler; stepped once per training batch,
            right after the optimizer.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. The mean training loss, training accuracy, validation accuracy
        and weighted validation F1 are printed for every epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss, total_correct, samples_seen = 0, 0, 0
        train_bar = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}")

        for batch in train_bar:
            optimizer.zero_grad()
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            preds = outputs.logits.argmax(dim=1)

            total_correct += (preds == labels).sum().item()
            total_loss += loss.item()
            samples_seen += labels.size(0)

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Running accuracy over the samples processed so far. Dividing by
            # the full dataset size would understate it until the last batch.
            train_bar.set_postfix({'loss': loss.item(), 'accuracy': total_correct / samples_seen})

        # Validation phase
        model.eval()
        val_correct, val_labels, val_preds = 0, [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)
                # No labels are passed here, so only the logits are needed.
                outputs = model(input_ids, attention_mask=attention_mask)
                preds = outputs.logits.argmax(dim=1)
                val_correct += (preds == labels).sum().item()
                val_labels.extend(labels.cpu().numpy())
                val_preds.extend(preds.cpu().numpy())

        # Print results after each epoch
        train_accuracy = total_correct / len(train_loader.dataset)
        val_accuracy = val_correct / len(val_loader.dataset)
        val_f1 = f1_score(val_labels, val_preds, average="weighted")

        print(f'Epoch {epoch + 1}/{epochs}')
        # Mean loss per batch: len(train_loader) is the number of batches.
        print(f'Train Loss: {total_loss / len(train_loader):.4f}')
        print(f'Train Accuracy: {train_accuracy:.2f}')
        print(f'Validation Accuracy: {val_accuracy:.2f}')
        print(f'Validation F1 Score: {val_f1:.2f}')

# Load data
file_path = 'emotion_dataset_2.csv'
df = load_data(file_path)

# Create features and labels
X, y = create_dataset(df)

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Create dataloaders
train_loader = create_dataloader(X_train, y_train, batch_size=32)
val_loader = create_dataloader(X_val, y_val, batch_size=32)

# Number of passes over the training set. Defined once because the scheduler's
# total step count and the training loop have to agree on it.
EPOCHS = 3

# Create model, optimizer, and scheduler
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=8)
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    # train_model steps the scheduler once per batch, so the schedule spans
    # (batches per epoch) * (epochs) steps.
    num_training_steps=len(train_loader) * EPOCHS,
)

# Train the model
train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=EPOCHS)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Epoch 1/3:   0%|          | 0/870 [00:00<?, ?it/s]

Epoch 1/3
Train Loss: 1.0873
Train Accuracy: 0.61
Validation Accuracy: 0.71
Validation F1 Score: 0.71


Training Epoch 2/3:   0%|          | 0/870 [00:00<?, ?it/s]

Epoch 2/3
Train Loss: 0.6636
Train Accuracy: 0.78
Validation Accuracy: 0.73
Validation F1 Score: 0.73


Training Epoch 3/3:   0%|          | 0/870 [00:00<?, ?it/s]

Epoch 3/3
Train Loss: 0.4731
Train Accuracy: 0.85
Validation Accuracy: 0.73
Validation F1 Score: 0.73


In [5]:
import os

# Folder that receives every artefact written by this cell
save_directory = "./emotion_detection_model"
optimizer_path = os.path.join(save_directory, "optimizer.pt")
scheduler_path = os.path.join(save_directory, "scheduler.pt")

# Fine-tuned model first, then the tokenizer files, into the same folder
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Optional extras: optimizer and scheduler state, only needed to resume training
torch.save(optimizer.state_dict(), optimizer_path)
torch.save(scheduler.state_dict(), scheduler_path)


In [6]:
from transformers import BertForSequenceClassification, BertTokenizer

# Pick the device first so the saved states can be mapped onto it while loading
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model = BertForSequenceClassification.from_pretrained(save_directory)
tokenizer = BertTokenizer.from_pretrained(save_directory)

# Move the model to the appropriate device
model = model.to(device)

# If you saved the optimizer and scheduler states, you can load them too.
# map_location lets state written on a GPU runtime be read on a CPU-only one.
# NOTE(review): `optimizer` was built from the parameters of the previous
# `model` object, not the one loaded above. Rebuild the optimizer and scheduler
# on `model.parameters()` before resuming training (confirm intent).
optimizer.load_state_dict(
    torch.load(os.path.join(save_directory, "optimizer.pt"), map_location=device)
)
scheduler.load_state_dict(
    torch.load(os.path.join(save_directory, "scheduler.pt"), map_location=device)
)


In [7]:
def predict(text):
    """Classify ``text`` and return the name of the predicted emotion.

    Relies on the module-level ``model``, ``tokenizer`` and ``device``. The
    model is switched to evaluation mode and the text is padded or truncated
    to 128 tokens before the forward pass.

    Args:
        text: The sentence to classify.

    Returns:
        One of ``joy``, ``sadness``, ``neutral``, ``surprise``, ``anger``,
        ``fear``, ``shame`` or ``disgust``.
    """
    model.eval()

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        scores = model(ids, attention_mask=mask).logits
        class_id = torch.argmax(scores, dim=1).item()

    # Class id -> emotion name; the id is the position in this list.
    id_to_emotion = dict(enumerate(
        ['joy', 'sadness', 'neutral', 'surprise', 'anger', 'fear', 'shame', 'disgust']
    ))
    return id_to_emotion[class_id]

# Example usage
# `text` and `predicted_emotion` stay bound at module level; the later
# text-to-speech cell passes both to text_to_speech().
text = "I am delighted to hear of your success. Well done!"
predicted_emotion = predict(text)
print(f"Predicted emotion: {predicted_emotion}")


Predicted emotion: joy


In [10]:
pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [8]:
pip install gtts librosa matplotlib

Collecting gtts
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.3-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.3


In [None]:
# Disabled earlier draft of text_to_speech, kept as a bare string literal so
# none of it executes. The live definition is in the following cell; this draft
# differs in using `predicted_emotion` as the default for `emotion`.
'''from pydub import AudioSegment
from gtts import gTTS

def text_to_speech(text, lang='en', emotion=predicted_emotion):
    # Generate basic speech with gTTS
    tts = gTTS(text=text, lang=lang)
    audio_file = 'output.mp3'
    tts.save(audio_file)

    # Load the audio file
    sound = AudioSegment.from_mp3(audio_file)

    # Adjust the audio properties based on the emotion
    if emotion == 'joy':
        sound = sound.speedup(playback_speed=1.2)  # Faster and maybe higher pitch for joy
    elif emotion == 'sadness':
        sound = sound.speedup(playback_speed=0.8)  # Slower for sadness
        sound = sound.low_pass_filter(300)  # Lower pitch
    elif emotion == 'surprise':
        sound = sound.speedup(playback_speed=1.3)  # Very fast for surprise
        sound = sound.high_pass_filter(3000)  # Higher pitch
    elif emotion == 'anger':
        sound = sound.speedup(playback_speed=1.1)  # Slightly faster for anger
        sound = sound.high_pass_filter(2000)  # Increase pitch slightly
    elif emotion == 'fear':
        sound = sound.speedup(playback_speed=0.9)  # Slightly slower for fear
        sound = sound.low_pass_filter(500)  # Decrease pitch
    elif emotion == 'shame':
        sound = sound.speedup(playback_speed=0.7)  # Slow and low pitch for shame
    elif emotion == 'disgust':
        sound = sound.speedup(playback_speed=0.85)  # Slow with a hint of low pitch

    # Save the modified audio file
    output_file = f'output_{emotion}.mp3'
    sound.export(output_file, format="mp3")

    return output_file

# Example usage

emotion = predicted_emotion
audio_file = text_to_speech(text, emotion=emotion)'''


In [11]:
# First, predict the emotion of the input text
#text = "I am delighted to hear of your success. Well done!"
#predicted_emotion = predict(text)
#print(f"Predicted emotion: {predicted_emotion}")

from pydub import AudioSegment
from gtts import gTTS

def text_to_speech(text, lang='en', emotion='neutral'):
    """Synthesise ``text`` with gTTS and shape the audio for ``emotion``.

    The plain gTTS rendering is written to ``output.mp3`` and reloaded with
    pydub. It is then re-timed and, for some emotions, filtered, and exported
    to ``output_<emotion>.mp3``. Emotions without a profile (including the
    default ``'neutral'``) are exported unmodified.

    Args:
        text: The sentence to speak.
        lang: Language code passed to gTTS.
        emotion: Emotion name that selects the speed/filter profile.

    Returns:
        The path of the exported MP3 file.
    """
    # emotion -> (speed factor, AudioSegment filter method or None, cutoff in Hz)
    profiles = {
        'joy': (1.2, None, None),
        'sadness': (0.8, 'low_pass_filter', 300),
        'surprise': (1.3, 'high_pass_filter', 3000),
        'anger': (1.1, 'high_pass_filter', 2000),
        'fear': (0.9, 'low_pass_filter', 500),
        'shame': (0.7, None, None),
        'disgust': (0.85, None, None),
    }

    # Generate basic speech with gTTS
    tts = gTTS(text=text, lang=lang)
    audio_file = 'output.mp3'
    tts.save(audio_file)

    # Load the audio file
    sound = AudioSegment.from_mp3(audio_file)

    # Adjust the audio properties based on the emotion
    if emotion in profiles:
        speed, filter_name, cutoff = profiles[emotion]
        if speed >= 1.0:
            sound = sound.speedup(playback_speed=speed)
        else:
            # speedup() works by discarding slices of audio, so it cannot
            # stretch a clip. To slow down, reinterpret the samples at a lower
            # frame rate and resample back to the original rate. This also
            # lowers the pitch.
            original_rate = sound.frame_rate
            slowed = sound._spawn(
                sound.raw_data,
                overrides={'frame_rate': int(original_rate * speed)},
            )
            sound = slowed.set_frame_rate(original_rate)
        if filter_name is not None:
            # Filters attenuate frequencies above/below the cutoff; they do
            # not shift pitch.
            sound = getattr(sound, filter_name)(cutoff)

    # Save the modified audio file
    output_file = f'output_{emotion}.mp3'
    sound.export(output_file, format="mp3")

    return output_file

# Now, convert the text to speech based on the predicted emotion
# (`text` and `predicted_emotion` come from the earlier predict() example cell,
# which must have been run first)
audio_file = text_to_speech(text, emotion=predicted_emotion)
print(f"Generated speech audio file: {audio_file}")


Generated speech audio file: output_joy.mp3
