In [7]:
# Library
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

# Load dataset
data = pd.read_csv('eng_dataset.csv')

# The 'content' column supplies the input texts, 'sentiment' the target labels
texts, labels = data['content'].values, data['sentiment'].values

# Preprocessing texts: fit a tokenizer (num_words=10000, out-of-vocabulary
# words map to "<OOV>"), turn every text into a list of word ids, then
# pad/truncate each list to a fixed length of 100.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=100)

# Label encoding: each distinct label gets the integer of its position in
# sorted order, and every row's label is replaced by that integer.
unique_labels = sorted(set(labels))
label_dict = {name: position for position, name in enumerate(unique_labels)}
encoded_labels = np.array(list(map(label_dict.__getitem__, labels)))
print(label_dict)

# Model definition
# Seed the Python, NumPy and TensorFlow RNGs first so that weight
# initialisation and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Embedding -> two stacked LSTMs -> dense head with one softmax unit per label.
# `input_dim` matches the tokenizer's `num_words` (10000).
# `input_length` is deliberately not passed: Keras 3 deprecates the argument
# and takes the sequence length from the padded input instead.
model = Sequential([
    Embedding(input_dim=10000, output_dim=128),
    LSTM(128, return_sequences=True),  # per-timestep outputs feed the next LSTM
    Dropout(0.2),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(len(label_dict), activation='softmax')
])

# Labels are integer class ids (see `encoded_labels`), hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): no validation data is held out, so the accuracy printed during
# training is training accuracy only.
model.fit(padded_sequences, encoded_labels, epochs=5, batch_size=10)

# Save the model
model.save('emotion_detection_model.keras')

# Function to predict emotion
def predict_emotion(text):
    """Return the emotion label the model assigns to ``text``.

    Relies on the module-level ``tokenizer``, ``pad_sequences``, ``model`` and
    ``label_dict`` objects being defined in the kernel.

    Args:
        text: A single input string to classify.

    Returns:
        The key of ``label_dict`` whose encoded index equals the argmax of the
        model's prediction.

    Raises:
        KeyError: If the predicted index is not a value of ``label_dict``.
    """
    # Same preprocessing as at training time: word ids, then fixed length 100.
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=100)
    prediction = model.predict(padded)
    class_index = int(np.argmax(prediction))

    # Look the label up by its encoded index, not by its position in the dict,
    # so the result does not depend on label_dict's insertion order.
    index_to_label = {index: label for label, index in label_dict.items()}
    return index_to_label[class_index]

# Example usage
# Smoke test: classify one sentence with the model trained above in this cell
# and print the predicted label.
example_text = "I'm so happy to see this project working perfectly!"
predicted_emotion = predict_emotion(example_text)
print(f"Predicted Emotion: {predicted_emotion}")

{'anger': 0, 'fear': 1, 'joy': 2, 'sadness': 3}
Epoch 1/5




[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 73ms/step - accuracy: 0.4326 - loss: 1.1982
Epoch 2/5
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 71ms/step - accuracy: 0.9147 - loss: 0.2722
Epoch 3/5
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 68ms/step - accuracy: 0.9569 - loss: 0.1396
Epoch 4/5
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 65ms/step - accuracy: 0.9643 - loss: 0.1003
Epoch 5/5
[1m711/711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 59ms/step - accuracy: 0.9640 - loss: 0.0769
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step
Predicted Emotion: joy


In [None]:
# Only run this cell to export the tokenizer (uncomment the lines below first)
# import json
# tokenizer_json = tokenizer.to_json()  # Convert tokenizer to JSON
# with open("tokenizer.json", "w") as file:
#     json.dump(tokenizer_json, file)

In [9]:
from tensorflow.keras.models import load_model
from TTS.api import TTS

import os

# Load the emotion classifier from the path the training cell saved it to.
# A relative path (instead of an absolute local one) keeps the notebook
# runnable on machines other than the author's.
MODEL_PATH = 'emotion_detection_model.keras'
model = load_model(MODEL_PATH)

# Load the TTS model (LJSpeech Tacotron2-DDC, created with gpu=False)
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

# NOTE: `tokenizer`, `label_dict`, `pad_sequences` and `np` are not defined in
# this cell. They must still be in the kernel from the training cell, or be
# loaded/defined here before `predict_emotion` is called.

# Ensure the output folder exists (no error if it is already there)
os.makedirs('static/audio', exist_ok=True)

# Emotion prediction function
def predict_emotion(text):
    """Classify ``text`` and return the matching emotion label.

    Uses the module-level ``tokenizer``, ``pad_sequences``, ``model`` and
    ``label_dict``. The label returned is the first key of ``label_dict``
    whose value equals the argmax of the model's prediction.

    Raises:
        IndexError: If no entry of ``label_dict`` has the predicted index.
    """
    # Convert the text to word ids and bring it to the fixed length of 100
    padded_input = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=100)

    # Index of the highest score in the model's output
    best_index = np.argmax(model.predict(padded_input))

    # Walk the label -> index mapping and return the label for that index
    for label, encoded in label_dict.items():
        if encoded == best_index:
            return label
    raise IndexError("list index out of range")

# Generate speech based on detected emotion
def generate_speech(text, emotion, speaker_wav=None):
    """Synthesize ``text`` to a WAV file named after ``emotion``.

    The emotion only determines the output file name and the log message;
    the text handed to the module-level ``tts`` object is ``text`` itself.
    The path depends on ``emotion`` alone, so a later call with the same
    emotion writes to the same path.

    Args:
        text: Text to speak.
        emotion: Label used in the file name ``static/audio/output_<emotion>.wav``.
        speaker_wav: Optional value forwarded to ``tts.tts_to_file`` as
            ``speaker_wav``; omitted from the call when falsy.

    Returns:
        The output file path on success, or ``None`` if any exception was
        raised (the error is printed, not re-raised).
    """
    try:
        spoken_text = f"{text}"
        destination = f"static/audio/output_{emotion}.wav"

        # Only pass speaker_wav through when the caller supplied one
        extra_args = {"speaker_wav": speaker_wav} if speaker_wav else {}
        tts.tts_to_file(text=spoken_text, file_path=destination, **extra_args)

        print(f"Generated speech with emotion '{emotion}' saved as '{destination}'.")
        return destination
    except Exception as err:
        print(f"Error generating speech: {err}")
        return None

# Combine emotion detection and TTS
def text_to_emotional_speech(text, speaker_wav=None):
    """Detect the emotion of ``text``, then synthesize it to an audio file.

    Args:
        text: Text to classify and speak.
        speaker_wav: Optional value passed through to ``generate_speech``.

    Returns:
        Whatever ``generate_speech`` returns for the text and detected emotion.
    """
    # Classify first so the label can be reported and handed to the synthesizer
    emotion = predict_emotion(text)
    print(f"Detected Emotion: {emotion}")

    # Hand text, label and optional speaker sample to the speech generator
    return generate_speech(text, emotion, speaker_wav)

 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Generator Model: hifigan_generator
 > Discriminator Model: hifigan_discriminator
Removing weight norm...


In [10]:
# Run the full pipeline (emotion detection, then speech synthesis) on a sample
# sentence; the cell's value is the returned audio file path.
text_to_emotional_speech("are you kidding me right now?!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step
Detected Emotion: anger
 > Text splitted to sentences.
['are you kidding me right now?!']
 > Processing time: 1.5115046501159668
 > Real-time factor: 0.699711906597604
Generated speech with emotion 'anger' saved as 'static/audio/output_anger.wav'.


'static/audio/output_anger.wav'

In [11]:
# Same pipeline on a second sample sentence; the cell's value is the returned
# audio file path.
text_to_emotional_speech("I am so happy today!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Detected Emotion: joy
 > Text splitted to sentences.
['I am so happy today!']
 > Processing time: 1.4029889106750488
 > Real-time factor: 0.6711190879986295
Generated speech with emotion 'joy' saved as 'static/audio/output_joy.wav'.


'static/audio/output_joy.wav'

In [12]:
# Same pipeline on a third sample sentence; the cell's value is the returned
# audio file path.
text_to_emotional_speech("I am super sad")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Detected Emotion: sadness
 > Text splitted to sentences.
['I am super sad']
 > Processing time: 1.2874126434326172
 > Real-time factor: 0.675891637802124
Generated speech with emotion 'sadness' saved as 'static/audio/output_sadness.wav'.


'static/audio/output_sadness.wav'