In [None]:
# Import necessary libraries
import pandas as pd
import joblib
from tensorflow.keras.models import load_model
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Generate the submission file: load the test data and the trained models,
# predict a severity class for every bug, and write bug_id/severity pairs to CSV.

# Load the preprocessed test data (feature matrix fed to the scikit-learn models)
X_test = pd.read_csv('../data/processed/test_preprocessed.csv')

# Load the original test data; it supplies the bug IDs and the text for the LSTM model
# NOTE(review): this reads 'summary_clean' from the raw file -- confirm the raw
# CSV really contains that column (it may only exist in the processed data).
test_data = pd.read_csv('../data/raw/bugs-test.csv')

# Load the trained models (done before padding so the LSTM's expected input
# length is available)
log_reg = joblib.load('../models/log_reg_model.pkl')
ada = joblib.load('../models/ada_model.pkl')
model = load_model('../models/lstm_model.h5')

# Convert the summaries to integer sequences for the LSTM model
# NOTE(review): the tokenizer is fitted on the test text here, so its word
# indices presumably differ from the ones the LSTM was trained with. Persist
# the training tokenizer and load it here instead -- TODO confirm and fix.
tokenizer = Tokenizer(num_words=5000, lower=True, split=' ')
tokenizer.fit_on_texts(test_data['summary_clean'].values)
X_test_seq = tokenizer.texts_to_sequences(test_data['summary_clean'].values)

# texts_to_sequences returns a plain list of lists (it has no .shape), so the
# padded length is taken from the model's input instead. If the model accepts
# variable-length input this is None and pad_sequences pads to the longest
# sequence. Assumes a single-input model -- TODO confirm.
X_test_pad = pad_sequences(X_test_seq, maxlen=model.input_shape[1])

# Predict severity classes for the test data using each model
log_reg_pred = log_reg.predict(X_test)
ada_pred = ada.predict(X_test)
lstm_pred = model.predict(X_test_pad)
# The LSTM outputs one score per class; pick the highest-scoring class index
lstm_pred_labels = np.argmax(lstm_pred, axis=1)

# For this example, let's choose the best model based on prior evaluation (assume it's LSTM for the final prediction)
final_pred = lstm_pred_labels

# Create the submission dataframe
submission_df = pd.DataFrame({
    'bug_id': test_data['bug_id'],
    'severity': final_pred
})

# Save the submission file
submission_df.to_csv('../data/submission.csv', index=False)

print("Submission file generated successfully!")
