# Model

# With Pre-Trained Model Embeddings

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LambdaCallback
import torch

# Load the dataset
df = pd.read_csv("<Dataset-path>") # Mention the Dataset path

# Preprocess the data
# NOTE: in this script X is only handed to StratifiedKFold.split to produce row indices;
# the model is trained on the precomputed embeddings, not on this column.
X = df['<Column-Header>'].values # Replace them with Column Headers
y = df['<Column-Header>'].values

# Convert labels to numerical format
le = LabelEncoder()
y = le.fit_transform(y)

# Load precomputed BERT embeddings
# SECURITY: torch.load unpickles the file, so only load embeddings from a trusted source.
# map_location="cpu" lets a file that was saved from a GPU run load on a CPU-only machine.
bert_embeddings = torch.load("BERT_Embeddings.pt", map_location="cpu") # Mention the path for BERT Embeddings

# Hand Keras a NumPy array: some Keras versions only accept NumPy/TF arrays for
# features such as `validation_split`, and NumPy indexing with the fold indices is
# unambiguous.
if isinstance(bert_embeddings, torch.Tensor):
    bert_embeddings = bert_embeddings.detach().cpu().numpy()

# Embedding row i must belong to label i; fail early instead of training on
# misaligned data or crashing later during evaluation.
if len(bert_embeddings) != len(y):
    raise ValueError(
        f"Embeddings hold {len(bert_embeddings)} rows but the dataset has {len(y)} labels."
    )

# Initialize StratifiedKFold for cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store results (one inner list of per-epoch values per fold)
accuracy_list = []
f1_list = []

# Create figure for subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=('Accuracy', 'F1-Score'), shared_yaxes=True)

def _make_epoch_recorder(model, X_test, y_test, epochs, accuracies, f1_scores):
    """Return an ``on_epoch_end`` hook that scores `model` on the held-out fold.

    Args:
        model: The Keras model being trained.
        X_test: Held-out inputs for the current fold.
        y_test: Held-out integer labels for the current fold.
        epochs: List that receives the 1-based epoch number.
        accuracies: List that receives the held-out accuracy after each epoch.
        f1_scores: List that receives the held-out macro F1 after each epoch.

    Returns:
        A callable ``(epoch, logs)`` suitable for ``LambdaCallback(on_epoch_end=...)``.
    """
    def _on_epoch_end(epoch, logs=None):
        y_pred_prob = model.predict(X_test, verbose=0)
        # Flatten the single sigmoid output column so it lines up with y_test.
        y_pred = (y_pred_prob > 0.5).astype(int).ravel()

        # Accuracy and F1 are derived from the same thresholded predictions.
        epochs.append(epoch + 1)  # Keras passes 0-based epochs; the plot is 1-based
        accuracies.append(float((y_pred == y_test).mean()))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    return _on_epoch_end


# The LSTM input shape comes straight from the embeddings (samples, timesteps, features),
# so it cannot drift from the data and needs no separately defined `max_length`.
timesteps, n_features = (int(dim) for dim in bert_embeddings.shape[1:3])

# Loop through cross-validation splits
for fold, (train_index, test_index) in enumerate(skf.split(X, y), 1):
    X_train, X_test = bert_embeddings[train_index], bert_embeddings[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build a fresh model for every fold so no weights leak between folds
    model = Sequential()
    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(timesteps, n_features)))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Bidirectional(LSTM(32)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model with binary_crossentropy loss
    optimizer = Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Initialize lists to store metrics at each epoch
    epoch_list = []
    accuracy_epoch_list = []
    f1_epoch_list = []

    # Train the model. The held-out fold is scored from a callback at the end of every
    # epoch; scoring after fit() returns would only ever see the final weights and
    # produce the same value for every "epoch".
    record_epoch = _make_epoch_recorder(
        model, X_test, y_test, epoch_list, accuracy_epoch_list, f1_epoch_list
    )
    history = model.fit(
        X_train,
        y_train,
        epochs=15,
        batch_size=32,
        validation_split=0.1,
        verbose=0,
        callbacks=[LambdaCallback(on_epoch_end=record_epoch)],
    )

    # Report progress: this fold has finished training
    print(f"Fold - {fold}")

    # Append fold results to lists
    accuracy_list.append(accuracy_epoch_list)
    f1_list.append(f1_epoch_list)

    # Add traces for each fold, routing each metric to the subplot that carries its title
    fig.add_trace(
        go.Scatter(x=epoch_list, y=accuracy_epoch_list, mode='lines+markers', name=f'Fold {fold} - Accuracy'),
        row=1, col=1,
    )
    fig.add_trace(
        go.Scatter(x=epoch_list, y=f1_epoch_list, mode='lines+markers', name=f'Fold {fold} - F1-Score'),
        row=1, col=2,
    )

# Update layout for better readability
fig.update_layout(title_text='Cross-Validation Metrics', template='plotly_dark')

# `xaxis_title` in update_layout only titles the first subplot's x-axis, so title the
# axes through update_xaxes/update_yaxes instead: every panel gets an 'Epoch' x-axis,
# and the shared y-axis is labelled once on the left-hand panel.
fig.update_xaxes(title_text='Epoch')
fig.update_yaxes(title_text='Metric Value', row=1, col=1)

# Show interactive graph
fig.show()

# Print final metrics and classification report
# NOTE: `model` is the network trained in the last cross-validation fold. The full
# dataset scored here includes the rows that model was trained on, so the loss,
# accuracy, classification report and macro F1 below are optimistic.
# Evaluate once and reuse both values instead of running a full pass per printed number.
final_loss, final_accuracy = model.evaluate(bert_embeddings, y, verbose=0)[:2]

print("\nFinal Metrics:")
print(f"Loss: {final_loss}")
print(f"Accuracy: {final_accuracy}")
# Best held-out accuracy seen in any fold at any epoch (an upper bound, not an average).
print(f"Validation Accuracy: {max(max(fold_accuracies) for fold_accuracies in accuracy_list)}")

# Cross-validated estimate: average of each fold's last recorded held-out score.
mean_cv_accuracy = sum(fold_accuracies[-1] for fold_accuracies in accuracy_list) / len(accuracy_list)
mean_cv_f1 = sum(fold_f1_scores[-1] for fold_f1_scores in f1_list) / len(f1_list)
print(f"Mean CV Accuracy (last epoch): {mean_cv_accuracy}")
print(f"Mean CV Macro F1 (last epoch): {mean_cv_f1}")

# Evaluate the model on the entire dataset
y_pred_prob = model.predict(bert_embeddings)
# Flatten the single-column sigmoid output so scikit-learn receives 1-D label arrays
# (a column vector triggers a conversion warning in inverse_transform).
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Convert back to original labels for classification report
y_original = le.inverse_transform(y)
y_pred_original = le.inverse_transform(y_pred)

# Print classification report
# Class names are passed as strings so non-string labels (e.g. 0/1) format correctly.
print("\nClassification Report:")
print(classification_report(y_original, y_pred_original, target_names=[str(label) for label in le.classes_]))

# Calculate and print the macro F1 score
macro_f1 = f1_score(y, y_pred, average='macro')
print("\nMacro F1 Score:", macro_f1)


# Saving the Model

# Write the trained network to disk. `model` is whichever network was built last in
# the cross-validation loop, i.e. the one from the final fold.
# Replace model-name with the file name you want to save under.
output_path = "model-name.h5"
model.save(output_path)