### Logistic Regression 

In [None]:
#ACCURACY 0.59
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
import joblib

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')

# Handle missing values
df.dropna(inplace=True)

# Use a sample of 2000 entries
df = df.sample(n=2000, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['text'])
y = df['emotion']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Define parameter grid for GridSearchCV
# LogisticRegression only supports specific solver/penalty pairs, so the grid
# is split per penalty instead of taking the full cross product (which makes
# most candidates fail to fit and score NaN):
#   - 'l2'         : every solver
#   - 'l1'         : 'liblinear' and 'saga' only
#   - 'elasticnet' : 'saga' only, and it additionally requires l1_ratio
lr_c_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [
    {
        'estimator__C': lr_c_values,
        'estimator__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'estimator__penalty': ['l2'],
    },
    {
        'estimator__C': lr_c_values,
        'estimator__solver': ['liblinear', 'saga'],
        'estimator__penalty': ['l1'],
    },
    {
        'estimator__C': lr_c_values,
        'estimator__solver': ['saga'],
        'estimator__penalty': ['elasticnet'],
        'estimator__l1_ratio': [0.5],
    },
]

# Perform Grid Search with OneVsRestClassifier
grid_search = GridSearchCV(estimator=OneVsRestClassifier(lr_model), param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

# Evaluate on validation set
best_lr_model = grid_search.best_estimator_
y_pred = best_lr_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation set accuracy: {:.2f}".format(accuracy))

# Save the best model with joblib (this writes a pickle, not HDF5, despite the .h5 extension)
joblib.dump(best_lr_model, 'best_logreg_model_v3.h5')
print("Model saved as best_logreg_model_v3.h5")

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')

# Handle missing values
df.dropna(inplace=True)

# Use a sample of 1000 entries
df = df.sample(n=1000, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['text'])
y = df['emotion']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Define parameter grid for GridSearchCV
# LogisticRegression only supports specific solver/penalty pairs, so the grid
# is split per penalty instead of taking the full cross product (which makes
# most candidates fail to fit and score NaN):
#   - 'l2'         : every solver
#   - 'l1'         : 'liblinear' and 'saga' only
#   - 'elasticnet' : 'saga' only, and it additionally requires l1_ratio
lr_c_values = [0.1, 1, 10, 100, 200, 300, 600, 1000]
param_grid = [
    {
        'C': lr_c_values,
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'penalty': ['l2'],
    },
    {
        'C': lr_c_values,
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1'],
    },
    {
        'C': lr_c_values,
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.5],
    },
]

# Perform Grid Search
grid_search = GridSearchCV(estimator=lr_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

# Evaluate on validation set
best_lr_model = grid_search.best_estimator_
y_pred = best_lr_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation set accuracy: {:.2f}".format(accuracy))

# Persist the best model with joblib. NOTE: joblib writes a pickle, not HDF5;
# the ".h5" extension is kept only so existing loaders keep finding the file.
import joblib
model_file = 'best_logreg_model_v0.h5'
joblib.dump(best_lr_model, model_file)
# Report the path that was actually written (the old message named a different file).
print(f"Model saved as {model_file}")



### SVMs

In [None]:
#ACCURACY 0.67
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')

# Handle missing values
df.dropna(inplace=True)

# Use a sample of 6000 entries
df = df.sample(n=6000, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['text'])
y = df['emotion']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

# Initialize SVM model
svm = SVC()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so reuse that fitted estimator instead of training an
# identical SVC a second time.
best_svm_model = grid_search.best_estimator_

# Save SVM model using joblib
joblib.dump(best_svm_model, 'svm_model.pkl')

# Optionally, you can also evaluate the model on the validation set
y_pred = best_svm_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

# You can also print other evaluation metrics like classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))

#ACCURACY 0.66
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib
from gensim.models import Word2Vec
import numpy as np

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')
df = df.sample(n=6000, random_state=42)  # Sample size for demonstration, adjust as needed
df.dropna(inplace=True)  # Handle missing values

# Load pre-trained Word2Vec model
word2vec_model = Word2Vec.load('word2vec_model_v1.bin')  # Replace with your actual file name

# Example function to calculate cosine similarity between emotions
def cosine_similarity(emotion1, emotion2):
    """Return the cosine similarity between two emotion labels.

    Each label is split on whitespace and represented by the mean of the
    Word2Vec vectors of its in-vocabulary words (looked up in the module-level
    ``word2vec_model``).

    Returns:
        A plain Python ``float``. ``0.0`` is returned when either label has no
        in-vocabulary word or when either mean vector has zero norm.
    """
    # A full Word2Vec model exposes its vectors as `.wv`; fall back to the
    # object itself so a bare KeyedVectors instance also works.
    vectors = getattr(word2vec_model, 'wv', word2vec_model)

    def _mean_vector(label):
        found = [vectors[word] for word in label.split() if word in vectors]
        # np.mean([]) yields NaN (never None), so handle "no known words" here.
        return np.mean(found, axis=0) if found else None

    vec1 = _mean_vector(emotion1)
    vec2 = _mean_vector(emotion2)
    if vec1 is None or vec2 is None:
        return 0.0  # Default similarity score when one or both labels are not in the vocabulary

    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0

    # Convert to a built-in float: np.float32 is not an instance of `float`,
    # which made callers' isinstance(similarity, (int, float)) checks fail.
    return float(np.dot(vec1, vec2) / denominator)

# Automatically merge rare classes into a similar, more frequent class
similarity_threshold = 0.7

# Identify emotions appearing fewer than 3 times (count once and reuse)
emotion_counts = df['emotion'].value_counts()
rare_emotions = emotion_counts[emotion_counts < 3].index.tolist()
rare_emotion_set = set(rare_emotions)  # O(1) membership tests in the loop below
frequent_emotions = [label for label in df['emotion'].unique() if label not in rare_emotion_set]

# Map each rare emotion to its most similar frequent emotion (if similar enough)
emotion_mapping = {}
for emotion in rare_emotions:
    max_similarity = -1.0
    most_similar_emotion = None

    for other_emotion in frequent_emotions:
        similarity = cosine_similarity(emotion, other_emotion)
        # Accept any finite real scalar. The previous isinstance(..., (int, float))
        # check rejected NumPy float32 results, so no class was ever merged.
        if np.ndim(similarity) != 0 or not np.isfinite(similarity):
            continue
        if similarity > similarity_threshold and similarity > max_similarity:
            max_similarity = similarity
            most_similar_emotion = other_emotion

    if most_similar_emotion is not None:
        emotion_mapping[emotion] = most_similar_emotion

# Merge rare classes into more similar and frequent classes; unmapped labels are kept as-is
df['emotion'] = df['emotion'].map(emotion_mapping).fillna(df['emotion'])

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['text'])
y = df['emotion']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

# Initialize SVM model
svm = SVC()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so reuse that fitted estimator instead of training an
# identical SVC a second time.
best_svm_model = grid_search.best_estimator_

# Save SVM model using joblib
joblib.dump(best_svm_model, 'svm_model_v1.pkl')

# Optionally, evaluate the model on the validation set
y_pred = best_svm_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

# Print classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))


### LSTMs

In [None]:
#ACCURACY 0.50
# import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')

# Handle missing values
df.dropna(inplace=True)

# Use a sample of 10,000 entries
df = df.sample(n=10000, random_state=42)

# Data preprocessing
# Tokenization and padding
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(df['text'])

X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=100)

# Encode emotions
label_encoder = LabelEncoder()
df['emotion_encoded'] = label_encoder.fit_transform(df['emotion'])
y = df['emotion_encoded']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define LSTM model
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=embedding_dim,
                    input_length=100))
model.add(LSTM(128))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Adjust for multi-class classification

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train LSTM model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))

# Evaluate LSTM model
y_pred_probs = model.predict(X_val)
y_pred = np.argmax(y_pred_probs, axis=1)  # Convert probabilities to class labels
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Save LSTM model
# NOTE(review): the filename mentions "sample6000" / "100k" while this cell
# samples 10,000 rows with num_words=200000 — confirm the intended name.
model_file = 'lstm_model_multiclass_sample6000_nwords_100k.h5'
model.save(model_file)
# Report the path that was actually written (the old message named a different file).
print(f"Model saved as {model_file}")

#ACCURACY 0.75 
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Function to extend LabelEncoder dynamically
def extend_label_encoder(label_encoder, new_labels):
    """Append labels not yet known to ``label_encoder`` to its ``classes_``.

    Labels already present (or repeated within ``new_labels``) are skipped.
    The encoder is modified in place and also returned.
    """
    seen = set(label_encoder.classes_)
    for candidate in new_labels:
        if candidate in seen:
            continue
        seen.add(candidate)
        label_encoder.classes_ = np.append(label_encoder.classes_, candidate)
    return label_encoder

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')
# Use a sample of 100,000 entries
df = df.sample(n=100000, random_state=42)
# Handle missing values
df.dropna(inplace=True)

# Data preprocessing
# Tokenization and padding
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(df['text'])

X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=100)

# Encode emotions
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define LSTM model
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=embedding_dim,
                    input_length=100))
model.add(LSTM(128))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Adjust for multi-class classification

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train LSTM model with early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=30, batch_size=60, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate LSTM model
y_pred_probs = model.predict(X_val)
y_pred = np.argmax(y_pred_probs, axis=1)  # Convert probabilities to class labels

# Print evaluation metrics
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))
print("Classification Report:")
# Report on every class that occurs in the ground truth or the predictions,
# and derive the display names from exactly those labels so that `labels`
# and `target_names` always line up. (Both names were previously used here
# before being defined, which raised NameError.)
unique_labels = np.unique(np.concatenate([y_val, y_pred]))
class_names = label_encoder.inverse_transform(unique_labels)
print(classification_report(y_val, y_pred, target_names=class_names, labels=unique_labels))

# Save LSTM model to a specified file
# Forward slash works on every OS; the backslash form was an invalid escape
# sequence ("\s") and only resolved to a sub-directory on Windows.
model_file = 'LSTM_models/samp100k_wor200k.h5'
model.save(model_file)
print(f"Model saved as {model_file}")

# Example of extending LabelEncoder dynamically
# Suppose during inference, new labels are encountered
new_labels = ['new_label1', 'new_label2']
label_encoder = extend_label_encoder(label_encoder, new_labels)

# Now label_encoder can handle new labels during prediction

from sklearn.metrics import classification_report, confusion_matrix

# Evaluate LSTM model
y_pred = np.argmax(model.predict(X_val), axis=1)
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Print classification report
# classification_report builds its rows from the labels seen in y_val OR
# y_pred; naming only the classes of y_val fails whenever the model predicts
# a class that is absent from y_val. Pass the label set explicitly and derive
# the names from it so both always have the same length and order.
report_labels = np.unique(np.concatenate([y_val, y_pred]))
class_names = label_encoder.inverse_transform(report_labels)
print("Classification Report:")
print(classification_report(y_val, y_pred, labels=report_labels, target_names=class_names))

#ACCURACY 0.79 
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from gensim.models import KeyedVectors
from collections import Counter

# Load pre-trained Word2Vec model
word2vec_path = 'word2vec_model_v1.bin'  # Update with your path
word2vec = KeyedVectors.load(word2vec_path)

# Function to extend LabelEncoder dynamically
def extend_label_encoder(label_encoder, new_labels):
    """Append labels not yet known to ``label_encoder`` to its ``classes_``.

    Labels already present (or repeated within ``new_labels``) are skipped.
    The encoder is modified in place and also returned.
    """
    seen = set(label_encoder.classes_)
    for candidate in new_labels:
        if candidate in seen:
            continue
        seen.add(candidate)
        label_encoder.classes_ = np.append(label_encoder.classes_, candidate)
    return label_encoder

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')
# Use a sample of 100,000 entries
df = df.sample(n=100000, random_state=42)
# Handle missing values
df.dropna(inplace=True)


def cosine_similarity(emotion1, emotion2):
    """Return the cosine similarity between two emotion labels.

    Each label is split on whitespace and represented by the mean of the
    Word2Vec vectors of its in-vocabulary words (looked up in the module-level
    ``word2vec``).

    Returns:
        A plain Python ``float``. ``0.0`` is returned when either label has no
        in-vocabulary word or when either mean vector has zero norm.
    """
    # A full Word2Vec model exposes its vectors as `.wv`; fall back to the
    # object itself so a bare KeyedVectors instance also works.
    vectors = getattr(word2vec, 'wv', word2vec)

    def _mean_vector(label):
        found = [vectors[word] for word in label.split() if word in vectors]
        # np.mean([]) yields NaN (never None), so handle "no known words" here.
        return np.mean(found, axis=0) if found else None

    vec1 = _mean_vector(emotion1)
    vec2 = _mean_vector(emotion2)
    if vec1 is None or vec2 is None:
        return 0.0  # Default similarity score when one or both labels are not in the vocabulary

    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0

    # Convert to a built-in float: np.float32 is not an instance of `float`,
    # which made callers' isinstance(similarity, (int, float)) checks fail.
    return float(np.dot(vec1, vec2) / denominator)


# Merge rare emotions based on cosine similarity
similarity_threshold = 0.7
# Count once and reuse; keep a set next to the list for O(1) membership tests.
emotion_counts = df['emotion'].value_counts()
rare_emotions = emotion_counts[emotion_counts < 3].index.tolist()
rare_emotion_set = set(rare_emotions)
frequent_emotions = [label for label in df['emotion'].unique() if label not in rare_emotion_set]
emotion_mapping = {}

for emotion in rare_emotions:
    max_similarity = -1.0
    most_similar_emotion = None

    for other_emotion in frequent_emotions:
        similarity = cosine_similarity(emotion, other_emotion)
        # Accept any finite real scalar. The previous isinstance(..., (int, float))
        # check rejected NumPy float32 results, so no class was ever merged.
        if np.ndim(similarity) != 0 or not np.isfinite(similarity):
            continue
        if similarity > similarity_threshold and similarity > max_similarity:
            max_similarity = similarity
            most_similar_emotion = other_emotion

    if most_similar_emotion is not None:
        emotion_mapping[emotion] = most_similar_emotion

# Unmapped labels are kept as-is
df['emotion'] = df['emotion'].map(emotion_mapping).fillna(df['emotion'])

# Data preprocessing
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(df['text'])

X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=100)

# Encode emotions
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Check class distribution
class_distribution = Counter(y)
print("Class distribution before resampling:", class_distribution)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare embedding matrix
embedding_dim = 100
word_index = tokenizer.word_index
num_words = min(len(word_index) + 1, 200000)  # Limit to top 200,000 words

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in word2vec.wv.key_to_index:
        embedding_matrix[i] = word2vec.wv[word]
    # Note: if the word is not found in the Word2Vec model, embedding_matrix[i] will remain as zeros

# Define LSTM model
model = Sequential()
# input_dim must equal the number of rows in embedding_matrix (num_words).
# len(word_index) + 1 exceeds that once the vocabulary is larger than the
# 200,000-word cap, which makes the pretrained weights fail to load.
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=100,
                    trainable=False))  # keep the pretrained Word2Vec vectors frozen
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# One softmax output per encoded emotion class
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train LSTM model with early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=30, batch_size=60, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate LSTM model
y_pred_probs = model.predict(X_val)
y_pred = np.argmax(y_pred_probs, axis=1)  # Convert probabilities to class labels

# Save LSTM model to a specified file
model_file = 'LSTM_models/samp100k_wor200k_w2v_v2.h5'
model.save(model_file)
print(f"Model saved as {model_file}")

# Example of extending LabelEncoder dynamically
# Suppose during inference, new labels are encountered
new_labels = ['new_label1', 'new_label2']
label_encoder = extend_label_encoder(label_encoder, new_labels)

# Get list of emotion classes
emotion_classes = label_encoder.classes_
# Define the file path to save the classes
# Forward slash works on every OS; the backslash form was an invalid escape
# sequence ("\e") and only resolved to a sub-directory on Windows.
output_file = 'WebApp/emotion_classes_s100k.txt'
# Write emotion classes to a text file
with open(output_file, 'w') as file:
    for emotion_class in emotion_classes:
        file.write(emotion_class + '\n')

import pickle
# Save Tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)


import matplotlib.pyplot as plt

# Plot training history
plt.plot(history.history['accuracy'], label='train_accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.plot(history.history['loss'], label='train_loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Ensemble methods can be explored by combining multiple trained models, such as through averaging predictions or using a VotingClassifier approach in scikit-learn.
# For simplicity, here's an example of averaging predictions from two different models:

# Example of ensemble averaging predictions
# Load two different saved models
model_file_1 = 'LSTM_models/samp100k_wor200k_w2v_best.h5'
# Forward slash, consistent with model_file_1, so the path resolves on every OS.
model_file_2 = 'LSTM_models/samp100k_wor200k_w2v.h5'

model_1 = tf.keras.models.load_model(model_file_1)
model_2 = tf.keras.models.load_model(model_file_2)

# Get predictions from each model
y_pred_probs_1 = model_1.predict(X_val)
y_pred_probs_2 = model_2.predict(X_val)

# Average predictions
y_pred_avg = (y_pred_probs_1 + y_pred_probs_2) / 2
y_pred_avg_classes = np.argmax(y_pred_avg, axis=1)

# Evaluate ensemble model
accuracy_avg = accuracy_score(y_val, y_pred_avg_classes)
print(f"Ensemble Model Accuracy: {accuracy_avg}")
print("Ensemble Model Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_avg_classes))
print("Ensemble Model Classification Report:")
# label_encoder.classes_ has been extended with placeholder labels by this
# point and may also contain classes absent from the validation split, so its
# length need not match the labels in the report. Name exactly the labels
# that occur in the ground truth or the predictions instead.
ensemble_labels = np.unique(np.concatenate([y_val, y_pred_avg_classes]))
ensemble_class_names = label_encoder.inverse_transform(ensemble_labels)
print(classification_report(y_val, y_pred_avg_classes, labels=ensemble_labels, target_names=ensemble_class_names))
#ACCURACY VAL 0.74
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from sklearn.metrics import accuracy_score
from collections import Counter
from gensim.models import KeyedVectors

# Load pre-trained Word2Vec model
word2vec_path = 'word2vec_model_v1.bin'  # Update with your path
word2vec = KeyedVectors.load(word2vec_path)

# Function to calculate cosine similarity between emotions
def cosine_similarity(emotion1, emotion2):
    """Return the cosine similarity between two emotion labels.

    Each label is split on whitespace and represented by the mean of the
    Word2Vec vectors of its in-vocabulary words (looked up in the module-level
    ``word2vec``).

    Returns:
        A plain Python ``float``. ``0.0`` is returned when either label has no
        in-vocabulary word or when either mean vector has zero norm.
    """
    # A full Word2Vec model exposes its vectors as `.wv`; fall back to the
    # object itself so a bare KeyedVectors instance also works.
    vectors = getattr(word2vec, 'wv', word2vec)

    def _mean_vector(label):
        found = [vectors[word] for word in label.split() if word in vectors]
        # np.mean([]) yields NaN (never None), so handle "no known words" here.
        return np.mean(found, axis=0) if found else None

    vec1 = _mean_vector(emotion1)
    vec2 = _mean_vector(emotion2)
    if vec1 is None or vec2 is None:
        return 0.0  # Default similarity score when one or both labels are not in the vocabulary

    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0

    # Convert to a built-in float: np.float32 is not an instance of `float`,
    # which made callers' isinstance(similarity, (int, float)) checks fail.
    return float(np.dot(vec1, vec2) / denominator)

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')

# Sample 10,000 entries
df = df.sample(n=10000, random_state=42)

# Handle missing values
df.dropna(inplace=True)

# Merge rare emotions based on cosine similarity
similarity_threshold = 0.7
# Count once and reuse; keep a set next to the list for O(1) membership tests.
emotion_counts = df['emotion'].value_counts()
rare_emotions = emotion_counts[emotion_counts < 3].index.tolist()
rare_emotion_set = set(rare_emotions)
frequent_emotions = [label for label in df['emotion'].unique() if label not in rare_emotion_set]
emotion_mapping = {}

for emotion in rare_emotions:
    max_similarity = -1.0
    most_similar_emotion = None

    for other_emotion in frequent_emotions:
        similarity = cosine_similarity(emotion, other_emotion)
        # Accept any finite real scalar. The previous isinstance(..., (int, float))
        # check rejected NumPy float32 results, so no class was ever merged.
        if np.ndim(similarity) != 0 or not np.isfinite(similarity):
            continue
        if similarity > similarity_threshold and similarity > max_similarity:
            max_similarity = similarity
            most_similar_emotion = other_emotion

    if most_similar_emotion is not None:
        emotion_mapping[emotion] = most_similar_emotion

# Unmapped labels are kept as-is
df['emotion'] = df['emotion'].map(emotion_mapping).fillna(df['emotion'])

# Data preprocessing
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(df['text'])

X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=100)

# Encode emotions
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Check class distribution
class_distribution = Counter(y)
print("Class distribution before resampling:", class_distribution)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare embedding matrix
embedding_dim = 100
word_index = tokenizer.word_index
num_words = min(len(word_index) + 1, 200000)  # Limit to top 200,000 words

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in word2vec.wv.key_to_index:
        embedding_matrix[i] = word2vec.wv[word]
    # Note: if the word is not found in the Word2Vec model, embedding_matrix[i] will remain as zeros

# Define the Keras LSTM model function
def create_lstm_model(embedding_dim=100, num_classes=len(label_encoder.classes_), max_sequence_length=100, word_index=None, embedding_matrix=None):
    """Build and compile an LSTM classifier on top of frozen pretrained embeddings.

    Args:
        embedding_dim: Width of each embedding vector.
        num_classes: Number of softmax outputs. The default is evaluated once,
            when this function is defined, from the module-level label_encoder.
        max_sequence_length: Length of the padded input sequences.
        word_index: Unused; kept so existing keyword calls keep working. The
            vocabulary size is taken from ``embedding_matrix`` instead.
        embedding_matrix: Pretrained weight matrix, one row per token index.
            Required.

    Returns:
        A compiled Keras ``Sequential`` model.

    Raises:
        ValueError: If ``embedding_matrix`` is not provided.
    """
    if embedding_matrix is None:
        raise ValueError("embedding_matrix is required to build the LSTM model")

    model = Sequential()
    # input_dim must equal the matrix row count; len(word_index) + 1 differs
    # from it whenever the matrix was capped to the tokenizer's num_words.
    model.add(Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_sequence_length,
                        trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation='softmax', name='output_layer'))

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Manual hyperparameter tuning
batch_sizes = [64]
epochs_values = [60,70]

for batch_size in batch_sizes:
    for epochs in epochs_values:
        print(f"Training with batch size: {batch_size}, epochs: {epochs}")
        
        # Create a new instance of the model for each iteration
        model = create_lstm_model(embedding_dim=embedding_dim, num_classes=len(label_encoder.classes_), max_sequence_length=100, word_index=word_index, embedding_matrix=embedding_matrix)
        
        # Train the model
        history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), verbose=1)
        
        # Evaluate the model
        loss, accuracy = model.evaluate(X_val, y_val, verbose=0)
        print(f"Validation accuracy: {accuracy}\n")


final_model = create_lstm_model(embedding_dim=embedding_dim, num_classes=len(label_encoder.classes_), max_sequence_length=100, word_index=word_index, embedding_matrix=embedding_matrix)
final_model.fit(X_train, y_train, epochs=60, batch_size=64, validation_data=(X_val, y_val), verbose=1)

# Save your final model
final_model.save('LSTM_model/lstm_model_s10k.h5')
print("Final model saved.")
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from gensim.models import KeyedVectors
from collections import Counter
from scikeras.wrappers import KerasClassifier

# Load pre-trained Word2Vec model
word2vec_path = 'word2vec_model_v1.bin'  # Update with your path
word2vec = KeyedVectors.load(word2vec_path)

# Function to extend LabelEncoder dynamically
def extend_label_encoder(label_encoder, new_labels):
    """Append labels not yet known to ``label_encoder`` to its ``classes_``.

    Labels already present (or repeated within ``new_labels``) are skipped.
    The encoder is modified in place and also returned.
    """
    seen = set(label_encoder.classes_)
    for candidate in new_labels:
        if candidate in seen:
            continue
        seen.add(candidate)
        label_encoder.classes_ = np.append(label_encoder.classes_, candidate)
    return label_encoder

# Load dataset
df = pd.read_csv('data/combined_emotions.csv')
# Use a sample of 100,000 entries
df = df.sample(n=100000, random_state=42)
# Handle missing values
df.dropna(inplace=True)

def cosine_similarity(emotion1, emotion2):
    """Return the cosine similarity between two emotion labels.

    Each label is split on whitespace and represented by the mean of the
    Word2Vec vectors of its in-vocabulary words (looked up in the module-level
    ``word2vec``).

    Returns:
        A plain Python ``float``. ``0.0`` is returned when either label has no
        in-vocabulary word or when either mean vector has zero norm.
    """
    # A full Word2Vec model exposes its vectors as `.wv`; fall back to the
    # object itself so a bare KeyedVectors instance also works.
    vectors = getattr(word2vec, 'wv', word2vec)

    def _mean_vector(label):
        found = [vectors[word] for word in label.split() if word in vectors]
        # np.mean([]) yields NaN (never None), so handle "no known words" here.
        return np.mean(found, axis=0) if found else None

    vec1 = _mean_vector(emotion1)
    vec2 = _mean_vector(emotion2)
    if vec1 is None or vec2 is None:
        return 0.0  # Default similarity score when one or both labels are not in the vocabulary

    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0

    # Convert to a built-in float: np.float32 is not an instance of `float`,
    # which made callers' isinstance(similarity, (int, float)) checks fail.
    return float(np.dot(vec1, vec2) / denominator)

# Merge rare emotions based on cosine similarity
similarity_threshold = 0.7
# Count once and reuse; keep a set next to the list for O(1) membership tests.
emotion_counts = df['emotion'].value_counts()
rare_emotions = emotion_counts[emotion_counts < 3].index.tolist()
rare_emotion_set = set(rare_emotions)
frequent_emotions = [label for label in df['emotion'].unique() if label not in rare_emotion_set]
emotion_mapping = {}

for emotion in rare_emotions:
    max_similarity = -1.0
    most_similar_emotion = None

    for other_emotion in frequent_emotions:
        similarity = cosine_similarity(emotion, other_emotion)
        # Accept any finite real scalar. The previous isinstance(..., (int, float))
        # check rejected NumPy float32 results, so no class was ever merged.
        if np.ndim(similarity) != 0 or not np.isfinite(similarity):
            continue
        if similarity > similarity_threshold and similarity > max_similarity:
            max_similarity = similarity
            most_similar_emotion = other_emotion

    if most_similar_emotion is not None:
        emotion_mapping[emotion] = most_similar_emotion

# Unmapped labels are kept as-is
df['emotion'] = df['emotion'].map(emotion_mapping).fillna(df['emotion'])

# Data preprocessing
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(df['text'])

X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=100)

# Encode emotions
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Check class distribution
class_distribution = Counter(y)
print("Class distribution before resampling:", class_distribution)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare embedding matrix
embedding_dim = 100
word_index = tokenizer.word_index
num_words = min(len(word_index) + 1, 200000)  # Limit to top 200,000 words

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        continue
    if word in word2vec.wv.key_to_index:
        embedding_matrix[i] = word2vec.wv[word]
    # Note: if the word is not found in the Word2Vec model, embedding_matrix[i] will remain as zeros

# Define LSTM model function for GridSearchCV
def create_lstm_model(optimizer='adam'):
    """Build and compile the LSTM classifier used by the grid search.

    Reads the module-level ``embedding_matrix``, ``embedding_dim`` and
    ``label_encoder``.

    Args:
        optimizer: Optimizer name or instance forwarded to ``model.compile``.

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # input_dim must equal the number of rows in embedding_matrix, which is
    # capped at the tokenizer's num_words and so can be smaller than
    # len(word_index) + 1.
    # The former input_shape=(1,) declared length-1 sequences and contradicted
    # input_length=100 (the padded sequence length), so it has been dropped.
    model.add(Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=100,
                        trainable=False))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(len(label_encoder.classes_), activation='softmax'))

    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Define parameter grid for GridSearchCV
param_grid = {
    'optimizer': ['adam', 'rmsprop'],
    'batch_size': [32, 64],
    'epochs': [20, 30]
}

# Setup GridSearchCV
# SciKeras deprecates `build_fn`; `model` is the supported name for the
# model-building callable.
keras_lstm_model = KerasClassifier(model=create_lstm_model, verbose=0)
grid_search = GridSearchCV(estimator=keras_lstm_model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           verbose=2,
                           n_jobs=-1)

# Perform GridSearchCV
grid_search_results = grid_search.fit(X_train, y_train)

# Evaluate best model on validation set
best_model = grid_search_results.best_estimator_
# KerasClassifier.predict already returns class labels (a 1-D array), not
# probabilities, so applying np.argmax(..., axis=1) to it fails.
y_pred = best_model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f"Best Model Accuracy on Validation Set: {accuracy}")

# Save best model to a specified file
#model_file = 'LSTM_models/best_model.h5'
#best_model.model.save(model_file)
#print(f"Best model saved as {model_file}")

# Example of extending LabelEncoder dynamically
# Suppose during inference, new labels are encountered
new_labels = ['new_label1', 'new_label2']
label_encoder = extend_label_encoder(label_encoder, new_labels)


import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from gensim.models import KeyedVectors
from collections import Counter
import pickle


# Load pre-trained Word2Vec model
# Forward slashes are used because they work on every platform. The previous
# backslash-separated literals only resolved on Windows and relied on the
# invalid escape sequences "\w" and "\e", which newer Python versions warn about.
word2vec_path = 'WebApp/word2vec_model_v1.bin'  # Update with your path
word2vec_model = KeyedVectors.load(word2vec_path)

# Load dataset (replace with your actual dataset path)
df = pd.read_csv('data/combined_emotions.csv')
# Use a sample of 100,000 entries
df = df.sample(n=100000, random_state=42)
# Handle missing values (applied after sampling, so fewer than 100,000 rows
# may remain)
df.dropna(inplace=True)

# Path of the new list of emotions
new_emotions_file = 'WebApp/emotion_classes_s100k.txt'

def cosine_similarity(emotion1, emotion2, word2vec_model):
    """Return the cosine similarity between two emotion labels.

    Each label is split on whitespace and represented by the mean of the
    vectors of those words that are present in ``word2vec_model.wv``.

    Args:
        emotion1: First emotion label (one or more whitespace-separated words).
        emotion2: Second emotion label.
        word2vec_model: Object whose ``wv`` attribute supports ``word in wv``
            and ``wv[word]`` lookups.

    Returns:
        The cosine similarity as a ``float``, or ``-1.0`` when either label
        has no in-vocabulary word or its mean vector has zero length.
    """
    vectors = word2vec_model.wv

    def mean_vector(label):
        known = [vectors[word] for word in label.split() if word in vectors]
        # np.mean([]) yields NaN (with a RuntimeWarning), never None, so the
        # "no usable word" case has to be detected before averaging.
        return np.mean(known, axis=0) if known else None

    vec1 = mean_vector(emotion1)
    vec2 = mean_vector(emotion2)
    if vec1 is None or vec2 is None:
        return -1.0  # No similarity can be computed

    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Avoid a division by zero for all-zero mean vectors
        return -1.0
    return float(np.dot(vec1, vec2) / norm_product)


# Merge rare emotions (fewer than 3 occurrences) into the most similar
# frequent emotion, provided the cosine similarity exceeds the threshold.
similarity_threshold = 0.8
emotion_counts = df['emotion'].value_counts()  # computed once, reused below
rare_emotions = emotion_counts[emotion_counts < 3].index.tolist()
rare_emotion_set = set(rare_emotions)  # O(1) membership tests
# Only non-rare emotions are valid merge targets; the list does not change
# inside the loop, so it is built once.
frequent_emotions = [e for e in df['emotion'].unique() if e not in rare_emotion_set]
emotion_mapping = {}

for emotion in rare_emotions:
    max_similarity = -1.0
    most_similar_emotion = None

    for other_emotion in frequent_emotions:
        similarity = cosine_similarity(emotion, other_emotion, word2vec_model)
        # Strict '>' keeps the first candidate when several tie
        if similarity != -1.0 and similarity > similarity_threshold and similarity > max_similarity:
            max_similarity = similarity
            most_similar_emotion = other_emotion

    if most_similar_emotion is not None:
        emotion_mapping[emotion] = most_similar_emotion

# Emotions without a mapping entry keep their original label
# (a stray ")" that followed this statement was a SyntaxError and is removed)
df['emotion'] = df['emotion'].map(emotion_mapping).fillna(df['emotion'])



# Data preprocessing: fit the tokenizer on the texts, then convert every
# text into an integer sequence padded/truncated to length 100
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(df['text'])
X = pad_sequences(tokenizer.texts_to_sequences(df['text']), maxlen=100)

# Encode emotions as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Check class distribution (number of samples per class id)
class_distribution = np.bincount(y)
print("Class distribution before resampling:", class_distribution)

# Hold out 20% of the data as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Prepare embedding matrix
embedding_dim = 100
word_index = tokenizer.word_index
num_words = min(len(word_index) + 1, 200000)  # Limit to top 200,000 words

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        continue
    # Look the word up in word2vec_model, the model this script loads; the
    # name `word2vec` used here before is never defined in this script.
    if word in word2vec_model.wv.key_to_index:
        embedding_matrix[i] = word2vec_model.wv[word]
    # Note: if the word is not found in the Word2Vec model, embedding_matrix[i] will remain as zeros

# Define LSTM model: frozen pre-trained embedding -> spatial dropout ->
# LSTM(128) -> softmax over the encoded emotion classes
model = Sequential()
# input_dim must equal the number of rows of embedding_matrix (num_words).
# len(word_index) + 1 is larger than that whenever the vocabulary exceeds
# the 200,000-word cap, which makes loading the weights fail.
model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=100,
                    trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile model (sparse loss: targets are integer class ids)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train LSTM model with early stopping: stop after 3 epochs without
# val_loss improvement and restore the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=30, batch_size=60, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Evaluate LSTM model: the arg-max over each softmax row is the predicted class id
y_pred_probs = model.predict(X_val)
y_pred = y_pred_probs.argmax(axis=1)

# Save LSTM model to a specified file
model_file = 'LSTM_models/lstm_retrained_list_v2.h5'
model.save(model_file)
print(f"Model saved as {model_file}")

# Example of extending LabelEncoder dynamically
new_labels = ['new_label1', 'new_label2']
label_encoder.classes_ = np.append(label_encoder.classes_, new_labels)

# Save Tokenizer and Label Encoder (the encoder is pickled after the
# example labels above were appended)
for pickle_path, artifact in (('tokenizer.pkl', tokenizer),
                              ('label_encoder.pkl', label_encoder)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)


In [None]:
#LSTM_models/bilstm_emotion_classifier.h5
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from gensim.models import KeyedVectors
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Bidirectional, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
import pickle

# Load dataset (replace with your actual dataset path), keep a fixed random
# sample of 10,000 rows and drop rows with missing values
df = pd.read_csv('data/combined_emotions.csv').sample(n=10000, random_state=42)
df.dropna(inplace=True)

# Data preprocessing: fit the tokenizer, then convert every text into an
# integer sequence padded/truncated to length 100
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(df['text'])
X = pad_sequences(tokenizer.texts_to_sequences(df['text']), maxlen=100)

# Encode emotions as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Check class distribution (number of samples per class id)
class_distribution = np.bincount(y)
print("Class distribution before resampling:", class_distribution)

# Hold out 20% of the data as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Load the pre-trained Word2Vec model that supplies the embedding vectors
word2vec_path = 'WebApp/word2vec_model_v1.bin'  # Update with your path
word2vec_model = KeyedVectors.load(word2vec_path)

# Build the embedding matrix: one row per tokenizer index, capped at
# 200,000 rows; words without a Word2Vec vector keep an all-zero row
embedding_dim = 100
word_index = tokenizer.word_index
num_words = min(200000, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, embedding_dim))
for token, row in word_index.items():
    if row < num_words and token in word2vec_model.wv.key_to_index:
        embedding_matrix[row] = word2vec_model.wv[token]

# Define BiLSTM model: frozen pre-trained embedding -> spatial dropout ->
# bidirectional LSTM(128) -> softmax over the encoded emotion classes
model_lstm = Sequential([
    Embedding(
        input_dim=num_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=100,
        trainable=False,
    ),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Compile model (sparse loss: targets are integer class ids)
model_lstm.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train with early stopping: stop after 3 epochs without val_loss
# improvement and restore the best weights seen
early_stopping_lstm = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history_lstm = model_lstm.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=60,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping_lstm],
)


# Persist the trained BiLSTM model
model_file_lstm = 'LSTM_models/bilstm_emotion_classifier.h5'
model_lstm.save(model_file_lstm)
print(f"BiLSTM Model saved as {model_file_lstm}")

# Persist the tokenizer and label encoder used with this model
for pickle_path, artifact in (('tokenizer_bilstm.pkl', tokenizer),
                              ('label_encoder_bilstm.pkl', label_encoder)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)

# Predict on the validation set; the arg-max over each softmax row is the
# predicted class id
y_pred_probs_lstm = model_lstm.predict(X_val)
y_pred_lstm = y_pred_probs_lstm.argmax(axis=1)

# Report accuracy, confusion matrix and per-class metrics
accuracy_lstm = accuracy_score(y_val, y_pred_lstm)
print(f"Accuracy: {accuracy_lstm}")
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred_lstm))
print("Classification Report:")
print(classification_report(y_val, y_pred_lstm, target_names=label_encoder.classes_))


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pickle

# Load dataset (replace with your actual dataset path), keep a fixed random
# sample of 5,000 rows and drop rows with missing values
df = pd.read_csv('data/combined_emotions.csv').sample(n=5000, random_state=42)
df.dropna(inplace=True)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts: every text is truncated/padded to exactly max_len tokens
max_len = 80  # Adjust according to your text length
X = df['text'].astype(str).tolist()
tokens = tokenizer(
    X,
    truncation=True,
    padding='max_length',
    max_length=max_len,
    return_tensors='tf',
)

# Encode emotions as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Convert the token-id and attention-mask tensors to numpy arrays
input_ids, attention_mask = (
    tokens[key].numpy() for key in ('input_ids', 'attention_mask')
)

# Split ids, masks and labels consistently into training and validation sets
(X_train_ids, X_val_ids,
 X_train_mask, X_val_mask,
 y_train, y_val) = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, random_state=42
)

# BERT model architecture: token ids + attention mask -> BERT ->
# dropout -> softmax over the encoded emotion classes.
# Note: input_ids / attention_mask are rebound here from data arrays to
# Keras Input layers.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
# Element [1] of the BERT outputs is used as the sentence representation
# (pooled output)
bert_output = bert_model(input_ids, attention_mask=attention_mask)[1]
dropout = Dropout(0.1)(bert_output)
num_classes = len(label_encoder.classes_)
output = Dense(num_classes, activation='softmax')(dropout)
model_bert = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile model (sparse loss: targets are integer class ids)
optimizer = Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
model_bert.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train with early stopping: stop after 3 epochs without val_loss
# improvement and restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
train_inputs = {'input_ids': X_train_ids, 'attention_mask': X_train_mask}
val_inputs = {'input_ids': X_val_ids, 'attention_mask': X_val_mask}
history_bert = model_bert.fit(
    train_inputs,
    y_train,
    epochs=25,
    batch_size=32,
    validation_data=(val_inputs, y_val),
    callbacks=[early_stopping],
)

import os

# Evaluate BERT model: the arg-max over each softmax row is the predicted class id
y_pred_probs_bert = model_bert.predict({'input_ids': X_val_ids, 'attention_mask': X_val_mask})
y_pred_bert = np.argmax(y_pred_probs_bert, axis=1)

# Output directory shared by the saved model and the report. It is created
# up front: previously it was only ensured after model_bert.save() had
# already tried to write into it.
save_path = 'BERT_models'
os.makedirs(save_path, exist_ok=True)

# Save BERT model
model_file_bert = 'BERT_models/bert_emotion_classifier_v1.keras'
model_bert.save(model_file_bert)
print(f"BERT Model saved as {model_file_bert}")

# Save Tokenizer and Label Encoder
with open('tokenizer_bert_v1.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('label_encoder_bert_v1.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_val, y_pred_bert)
labels = label_encoder.classes_
classification_rep = classification_report(y_val, y_pred_bert, target_names=labels)

# Save accuracy and classification report to a text file
report_path = os.path.join(save_path, 'bert_report.txt')
with open(report_path, 'w') as f:
    f.write(f"BERT Accuracy: {accuracy}\n\n")
    f.write("BERT Classification Report:\n")
    f.write(classification_rep)

print(f"BERT report saved to {report_path}")


### BERT

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Dense, Dropout, Bidirectional, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# Load dataset (replace with your actual dataset path)
df = pd.read_csv('data/combined_emotions.csv')
df = df.sample(n=10000, random_state=42)
df.dropna(inplace=True)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts
max_len = 100  # Adjust according to your text length
X = df['text'].astype(str).tolist()
# padding='max_length' pads every sequence to exactly max_len tokens.
# padding=True only pads to the longest text in the batch, so the array could
# be narrower than the fixed (max_len,) input shape the model declares.
X = tokenizer(X, truncation=True, padding='max_length', max_length=max_len, return_tensors='tf')

# Encode emotions as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Split data into training and validation sets.
# Only the token ids are kept; the attention mask is not used in this variant.
X_train, X_val, y_train, y_val = train_test_split(X.input_ids.numpy(), y, test_size=0.2, random_state=42)

# BERT model architecture: token ids -> BERT -> dropout -> softmax over
# the encoded emotion classes (no attention mask in this variant)
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
# Element [1] of the BERT outputs is used as the sentence representation
# (pooled output)
bert_output = bert_model(input_ids)[1]
dropout = Dropout(0.1)(bert_output)
num_classes = len(label_encoder.classes_)
output = Dense(num_classes, activation='softmax')(dropout)
model_bert = Model(inputs=input_ids, outputs=output)

# Compile model (sparse loss: targets are integer class ids)
optimizer = Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
model_bert.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train with early stopping: stop after 3 epochs without val_loss
# improvement and restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history_bert = model_bert.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Evaluate BERT model: the arg-max over each softmax row is the predicted class id
y_pred_probs_bert = model_bert.predict(X_val)
y_pred_bert = y_pred_probs_bert.argmax(axis=1)

# Persist the trained BERT model
model_file_bert = 'BERT_models/bert_emotion_classifier.h5'
model_bert.save(model_file_bert)
print(f"BERT Model saved as {model_file_bert}")

# Persist the tokenizer and label encoder used with this model
for pickle_path, artifact in (('tokenizer_bert.pkl', tokenizer),
                              ('label_encoder_bert.pkl', label_encoder)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)



import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pickle

# Load dataset (replace with your actual dataset path), keep a fixed random
# sample of 5,000 rows and drop rows with missing values
df = pd.read_csv('data/combined_emotions.csv').sample(n=5000, random_state=42)
df.dropna(inplace=True)

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize texts: every text is truncated/padded to exactly max_len tokens
max_len = 80  # Adjust according to your text length
X = df['text'].astype(str).tolist()
tokens = tokenizer(
    X,
    truncation=True,
    padding='max_length',
    max_length=max_len,
    return_tensors='tf',
)

# Encode emotions as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotion'])

# Convert the token-id and attention-mask tensors to numpy arrays
input_ids, attention_mask = (
    tokens[key].numpy() for key in ('input_ids', 'attention_mask')
)

# Split ids, masks and labels consistently into training and validation sets
(X_train_ids, X_val_ids,
 X_train_mask, X_val_mask,
 y_train, y_val) = train_test_split(
    input_ids, attention_mask, y, test_size=0.2, random_state=42
)

# BERT model architecture: token ids + attention mask -> BERT ->
# dropout -> softmax over the encoded emotion classes.
# Note: input_ids / attention_mask are rebound here from data arrays to
# Keras Input layers.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
# Element [1] of the BERT outputs is used as the sentence representation
# (pooled output)
bert_output = bert_model(input_ids, attention_mask=attention_mask)[1]
dropout = Dropout(0.1)(bert_output)
num_classes = len(label_encoder.classes_)
output = Dense(num_classes, activation='softmax')(dropout)
model_bert = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile model (sparse loss: targets are integer class ids)
optimizer = Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
model_bert.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train with early stopping: stop after 3 epochs without val_loss
# improvement and restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
train_inputs = {'input_ids': X_train_ids, 'attention_mask': X_train_mask}
val_inputs = {'input_ids': X_val_ids, 'attention_mask': X_val_mask}
history_bert = model_bert.fit(
    train_inputs,
    y_train,
    epochs=25,
    batch_size=32,
    validation_data=(val_inputs, y_val),
    callbacks=[early_stopping],
)

import os

# Evaluate BERT model: the arg-max over each softmax row is the predicted class id
y_pred_probs_bert = model_bert.predict({'input_ids': X_val_ids, 'attention_mask': X_val_mask})
y_pred_bert = np.argmax(y_pred_probs_bert, axis=1)

# Save BERT model
model_file_bert = 'BERT_models/bert_emotion_classifier_v1.keras'
# Make sure the target directory exists before writing the model into it;
# nothing earlier in this script creates it.
os.makedirs(os.path.dirname(model_file_bert), exist_ok=True)
model_bert.save(model_file_bert)
print(f"BERT Model saved as {model_file_bert}")

# Save Tokenizer and Label Encoder
with open('tokenizer_bert_v1.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

with open('label_encoder_bert_v1.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

import os
# Calculate accuracy and classification report on the validation split.
# y_pred_bert holds the arg-max class ids predicted for the validation inputs.
accuracy = accuracy_score(y_val, y_pred_bert)
# Original emotion names, used as row labels in the report
labels = label_encoder.classes_
classification_rep = classification_report(y_val, y_pred_bert, target_names=labels)
