In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import joblib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# Load the preprocessed training data
X_train = pd.read_csv('../data/processed/train_preprocessed.csv')
y_train = pd.read_csv('../data/processed/train_cleaned.csv')['severity']

# Split the data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Approach 1: Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_split, y_train_split)
log_reg_pred = log_reg.predict(X_val_split)
log_reg_report = classification_report(y_val_split, log_reg_pred)
print("Logistic Regression Report:\n", log_reg_report)

# Save the logistic regression model
joblib.dump(log_reg, '../models/log_reg_model.pkl')

# Approach 2: AdaBoost
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train_split, y_train_split)
ada_pred = ada.predict(X_val_split)
ada_report = classification_report(y_val_split, ada_pred)
print("AdaBoost Report:\n", ada_report)

# Save the AdaBoost model
joblib.dump(ada, '../models/ada_model.pkl')

# Approach 3: LSTM
# Load the original training data to reprocess it for the LSTM model
train_data = pd.read_csv('../data/processed/train_cleaned.csv')

# Fill NaN values with empty strings
train_data['summary_clean'] = train_data['summary_clean'].fillna('')

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000, lower=True, split=' ')
tokenizer.fit_on_texts(train_data['summary_clean'].values)
X = tokenizer.texts_to_sequences(train_data['summary_clean'].values)
X = pad_sequences(X)

# Convert labels to one-hot encoding
y = pd.get_dummies(train_data['severity']).values

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val), verbose=2)

# Save the LSTM model
model.save('../models/lstm_model.h5')

# Evaluate the LSTM model
lstm_pred = model.predict(X_val)
lstm_pred_labels = np.argmax(lstm_pred, axis=1)
y_val_labels = np.argmax(y_val, axis=1)
lstm_report = classification_report(y_val_labels, lstm_pred_labels)
print("LSTM Report:\n", lstm_report)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression Report:
               precision    recall  f1-score   support

     blocker       0.00      0.00      0.00       143
    critical       0.80      0.67      0.73      3663
 enhancement       1.00      0.01      0.02       852
       major       0.53      0.02      0.04      1201
       minor       0.00      0.00      0.00       593
      normal       0.86      0.98      0.91     25320
     trivial       0.00      0.00      0.00       228

    accuracy                           0.85     32000
   macro avg       0.46      0.24      0.24     32000
weighted avg       0.82      0.85      0.81     32000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


AdaBoost Report:
               precision    recall  f1-score   support

     blocker       0.10      0.01      0.01       143
    critical       0.74      0.76      0.75      3663
 enhancement       0.06      0.00      0.00       852
       major       0.39      0.02      0.03      1201
       minor       0.00      0.00      0.00       593
      normal       0.87      0.97      0.92     25320
     trivial       0.00      0.00      0.00       228

    accuracy                           0.85     32000
   macro avg       0.31      0.25      0.25     32000
weighted avg       0.79      0.85      0.81     32000

Epoch 1/5




2000/2000 - 25s - 13ms/step - accuracy: 0.8475 - loss: 0.5898 - val_accuracy: 0.8601 - val_loss: 0.5259
Epoch 2/5
2000/2000 - 23s - 11ms/step - accuracy: 0.8586 - loss: 0.5104 - val_accuracy: 0.8626 - val_loss: 0.5107
Epoch 3/5
2000/2000 - 23s - 12ms/step - accuracy: 0.8621 - loss: 0.4784 - val_accuracy: 0.8622 - val_loss: 0.5132
Epoch 4/5
2000/2000 - 23s - 12ms/step - accuracy: 0.8652 - loss: 0.4551 - val_accuracy: 0.8621 - val_loss: 0.5242
Epoch 5/5
2000/2000 - 23s - 12ms/step - accuracy: 0.8679 - loss: 0.4364 - val_accuracy: 0.8613 - val_loss: 0.5308




[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
LSTM Report:
               precision    recall  f1-score   support

           0       0.55      0.04      0.08       143
           1       0.79      0.79      0.79      3663
           2       0.39      0.07      0.12       852
           3       0.48      0.08      0.14      1201
           4       0.44      0.02      0.04       593
           5       0.88      0.97      0.92     25320
           6       0.32      0.03      0.05       228

    accuracy                           0.86     32000
   macro avg       0.55      0.29      0.30     32000
weighted avg       0.83      0.86      0.83     32000

