In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import joblib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# Load features and labels.
# Features come from the preprocessed CSV; the target is the 'severity'
# column of the raw training CSV.
# NOTE(review): assumes both files hold the same rows in the same order — confirm.
X_train = pd.read_csv('../data/processed/train_preprocessed.csv')
y_train = pd.read_csv('../data/raw/bugs-train.csv')['severity']

# Hold out 20% of the rows for validation; the fixed seed makes the
# partition reproducible between runs.
(
    X_train_split,
    X_val_split,
    y_train_split,
    y_val_split,
) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Approach 1: Logistic Regression
# Fit on the training split only (up to 1000 solver iterations).
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_split, y_train_split)

# Evaluate on the held-out split. The predictions are made for X_val_split,
# so they must be scored against y_val_split: scoring them against the
# training labels compares unrelated rows of a different length.
log_reg_pred = log_reg.predict(X_val_split)
log_reg_report = classification_report(y_val_split, log_reg_pred)
print("Logistic Regression Report:\n", log_reg_report)

# Save the logistic regression model
joblib.dump(log_reg, '../models/log_reg_model.pkl')

# Approach 2: AdaBoost
# Boosted ensemble of 100 estimators, fitted on the training split only.
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train_split, y_train_split)

# Evaluate on the held-out split. The predictions are made for X_val_split,
# so they must be scored against y_val_split: scoring them against the
# training labels compares unrelated rows of a different length.
ada_pred = ada.predict(X_val_split)
ada_report = classification_report(y_val_split, ada_pred)
print("AdaBoost Report:\n", ada_report)

# Save the AdaBoost model
joblib.dump(ada, '../models/ada_model.pkl')

# Approach 3: LSTM
# The LSTM consumes token sequences, so the text is read again and tokenized
# here rather than reusing the preprocessed feature table loaded earlier.
train_data = pd.read_csv('../data/raw/bugs-train.csv')

# Build a vocabulary capped at 5000 words (lower-cased, split on spaces),
# turn every summary into a list of word indices, then pad the lists to a
# common length so they form a single 2-D array.
# NOTE(review): 'summary_clean' is read from the raw CSV — confirm that file
# really contains this column.
tokenizer = Tokenizer(num_words=5000, lower=True, split=' ')
tokenizer.fit_on_texts(train_data['summary_clean'].values)
X = pad_sequences(
    tokenizer.texts_to_sequences(train_data['summary_clean'].values)
)

# One indicator column per distinct severity value.
y = pd.get_dummies(train_data['severity']).values

# 80/20 train/validation partition with a fixed seed.
# Note: this rebinds X_train and y_train, replacing the tabular data loaded
# at the top of the script.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define the LSTM model
# Layer stack: embedding -> spatial dropout -> LSTM -> softmax classifier.
# The embedding's input_dim matches the tokenizer's num_words (5000), the
# input length is the padded sequence width, and the output layer has one
# unit per one-hot label column.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=X.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the LSTM model
# Five epochs with batches of 64 samples; the held-out split is passed as
# validation data so it is evaluated during training.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=5,
    verbose=2,
    validation_data=(X_val, y_val),
)

# Save the LSTM model
model.save('../models/lstm_model.h5')

# Evaluate the LSTM model
# The softmax output and the one-hot targets are both reduced to class
# indices (position of the largest entry in each row) before scoring, so the
# report is keyed by column index rather than by severity name.
lstm_pred = model.predict(X_val)
lstm_pred_labels = lstm_pred.argmax(axis=1)
y_val_labels = y_val.argmax(axis=1)
lstm_report = classification_report(y_val_labels, lstm_pred_labels)
print("LSTM Report:\n", lstm_report)
