In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, GRU, LSTM, Dense, Dropout, GlobalMaxPooling1D,
    Bidirectional, BatchNormalization, LayerNormalization, SpatialDropout1D,
    MultiHeadAttention, Concatenate, Add, GlobalAveragePooling1D
)
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import pickle

# Download NLTK resources.
# 'stopwords' is read later through nltk.corpus.stopwords and 'wordnet' backs the
# WordNetLemmatizer used by preprocess_text. When a package is already present the
# call only reports that it is up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Preprocessing

In [None]:


# Read the training and test CSV files from the working directory
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Shared resources consumed by preprocess_text: a lemmatizer instance and the
# English stop-word list held as a set for fast membership tests
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Preprocessing function for text
def preprocess_text(text):
    """Normalise one discussion string before tokenisation.

    Applied in order: lowercase, remove URLs, remove ``<...>`` tags, remove every
    character that is neither a word character nor whitespace, collapse whitespace
    runs to single spaces and trim, then drop stop words and lemmatize what is left.

    Uses the notebook-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', text.lower(), flags=re.MULTILINE)
    # Remaining clean-up passes as (pattern, replacement) pairs, applied in sequence
    for pattern, replacement in ((r'<.*?>', ''), (r'[^\w\s]', ''), (r'\s+', ' ')):
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in cleaned.strip().split()
        if token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Remove rows without discussion text, then exact (text, label) duplicates from training
df_train = df_train.dropna(subset=['Discussion'])
df_test = df_test.dropna(subset=['Discussion'])
df_train = df_train.drop_duplicates(subset=['Discussion', 'Category'])

# Define category labels
category_to_label = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}

# Series.map yields NaN for every value that is not a key of the mapping (a missing
# or unexpected category name). Left in place, those NaNs turn the label column into
# floats and end up in y_train, so such rows are reported and dropped here.
encoded_category = df_train['Category'].map(category_to_label)
unmapped_rows = encoded_category.isna()
if unmapped_rows.any():
    print(f"Dropping {int(unmapped_rows.sum())} training rows with a missing or unknown Category")
# .copy() gives an independent frame so the column assignments below do not write
# into a filtered selection of the original data
df_train = df_train.loc[~unmapped_rows].copy()
df_train['Category'] = encoded_category.loc[~unmapped_rows].astype('int64')

# Apply preprocessing to the text data
df_train['Discussion'] = df_train['Discussion'].apply(preprocess_text)
df_test['Discussion'] = df_test['Discussion'].apply(preprocess_text)

# Remove training rows whose cleaned text is very short; the threshold is compared
# against the character count of the cleaned string
min_length = 5
df_train = df_train[df_train['Discussion'].str.len() > min_length]

In [None]:

# Padded sequence length and vocabulary cap handed to the Keras tokenizer
max_len, num_words = 100, 20000

# The vocabulary is fitted on the training text only; the test text is encoded
# with that same fitted tokenizer further down
tokenizer = Tokenizer(num_words=num_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df_train['Discussion'])
word_index = tokenizer.word_index

# Persist the fitted tokenizer so it can be reloaded for inference
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Both splits are padded and truncated at the end of the sequence to max_len tokens
pad_options = dict(maxlen=max_len, padding='post', truncating='post')

x_train_sequences = tokenizer.texts_to_sequences(df_train['Discussion'])
x_train_padded = pad_sequences(x_train_sequences, **pad_options)

x_test_sequences = tokenizer.texts_to_sequences(df_test['Discussion'])
x_test_padded = pad_sequences(x_test_sequences, **pad_options)

# Integer class labels aligned row-for-row with x_train_padded
y_train = df_train['Category'].values

# Stratified 80/20 train/validation split with a fixed random_state
x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(
    x_train_padded,
    y_train,
    test_size=0.2,
    random_state=45,
    stratify=y_train,
)

In [None]:
# GloVe configuration: vector size and the text file holding the vectors
embedding_dim = 200
glove_file = 'glove.6B.200d.txt'

# Map each word in the file to its float32 vector. Lines whose numbers cannot be
# parsed, or whose vector does not have embedding_dim entries, are reported and skipped.
embeddings_index = {}
with open(glove_file, encoding='utf-8') as glove_handle:
    for raw_line in tqdm(glove_handle, desc="Loading GloVe embeddings"):
        fields = raw_line.split()
        word = fields[0]
        try:
            coefs = np.asarray(fields[1:], dtype='float32')
        except ValueError:
            print(f"Skipping word '{word}' due to value error in embedding")
            continue
        if coefs.shape != (embedding_dim,):
            print(f"Skipping word '{word}' due to incorrect embedding dimension: {coefs.shape}")
            continue
        embeddings_index[word] = coefs

# Row i holds the vector of the word with tokenizer index i. Words without a loaded
# vector, and indices at or above num_words, leave their row as zeros / are ignored.
embedding_matrix = np.zeros((num_words, embedding_dim))
for vocab_word, row in word_index.items():
    if row >= num_words:
        continue
    pretrained_vector = embeddings_index.get(vocab_word)
    if pretrained_vector is not None:
        embedding_matrix[row] = pretrained_vector


Loading GloVe embeddings: 34726it [00:01, 19975.61it/s]


Skipping word 'beltre' due to incorrect embedding dimension: (96,)


In [None]:
def save_predictions_to_csv(predictions, output_directory=r'Predd', sample_ids=None):
    """Write predictions to ``<output_directory>/predictions.csv``.

    The file has two columns, ``SampleID`` and ``Category``, and no index column.
    The directory is created when it does not exist.

    Args:
        predictions: Sequence of predicted labels, one per sample.
        output_directory: Directory that receives ``predictions.csv``.
        sample_ids: Optional identifiers, one per prediction. When omitted the rows
            are numbered 1..len(predictions), which only matches the source data if
            no rows were removed before predicting; pass the real identifiers
            otherwise.

    Raises:
        ValueError: If ``sample_ids`` is given and its length differs from
            ``predictions``.
    """
    if sample_ids is None:
        sample_ids = range(1, len(predictions) + 1)
    else:
        # Materialise as a plain list so pandas does not try to align on an index
        sample_ids = list(sample_ids)
        if len(sample_ids) != len(predictions):
            raise ValueError(
                f"sample_ids has {len(sample_ids)} entries but there are "
                f"{len(predictions)} predictions"
            )

    os.makedirs(output_directory, exist_ok=True)
    results_df = pd.DataFrame({'SampleID': sample_ids, 'Category': predictions})
    output_file_path = os.path.join(output_directory, 'predictions.csv')
    results_df.to_csv(output_file_path, index=False)
    print(f"Predictions saved to: {output_file_path}")

In [None]:
# def train_save_model_and_predictions(model, model_name, output_directory='Output'):
#     os.makedirs(output_directory, exist_ok=True)

#     callbacks = [
#         EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
#         ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
#     ]

#     # Train the model
#     model.fit(
#         x_train_split, y_train_split,
#         validation_data=(x_val_split, y_val_split),
#         epochs=30,
#         batch_size=32,
#         callbacks=callbacks,
#         verbose=1
#     )

#     # Save the trained model
#     model_path = os.path.join(output_directory, f"{model_name}_model.h5")
#     model.save(model_path)
#     print(f"Model saved to: {model_path}")

#     # Generate predictions
#     predictions = model.predict(x_test_padded, verbose=0)
#     predictions_labels = np.argmax(predictions, axis=1)

#     # Define label_to_category mapping (inverse of category_to_label)
#     label_to_category = {v: k for k, v in category_to_label.items()}

#     # Validate predictions and map to categories
#     df_test['Category'] = pd.Series(predictions_labels).map(label_to_category)
#     if df_test['Category'].isnull().any():
#         print("Warning: Some predictions were not mapped to valid categories.")
#         df_test['Category'].fillna('Unknown', inplace=True)

#     # Use save_predictions_to_csv function
#     save_predictions_to_csv(predictions_labels, output_directory)


# def train_save_model_and_predictions(model, model_name, output_directory='Output'):
#     # Create a unique subdirectory for the model
#     # Create a unique subdirectory for the model
#     model_directory = os.path.join(output_directory, model_name)
#     os.makedirs(model_directory, exist_ok=True)

#     # Callbacks for training
#     callbacks = [
#         EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
#         ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6),
#         ModelCheckpoint(
#             # Change the filepath to end with .keras
#             filepath=os.path.join(model_directory, f"{model_name}_best_model.keras"),
#             monitor='val_loss',
#             save_best_only=True
#         )
#     ]

#     # Train the model
#     model.fit(
#         x_train_split, y_train_split,
#         validation_data=(x_val_split, y_val_split),
#         epochs=30,
#         batch_size=32,
#         callbacks=callbacks,
#         verbose=1
#     )

#     # Save the final model
#     final_model_path = os.path.join(model_directory, f"{model_name}_final_model.h5")
#     model.save(final_model_path)
#     print(f"Final model saved to: {final_model_path}")

#     # Generate predictions
#     predictions = model.predict(x_test_padded, verbose=0)
#     predictions_labels = np.argmax(predictions, axis=1)

#     # Define label_to_category mapping (inverse of category_to_label)
#     label_to_category = {v: k for k, v in category_to_label.items()}

#     # Validate predictions and map to categories
#     df_test['Category'] = pd.Series(predictions_labels).map(label_to_category)
#     if df_test['Category'].isnull().any():
#         print("Warning: Some predictions were not mapped to valid categories.")
#         df_test['Category'].fillna('Unknown', inplace=True)
# # Ensure 'Id' column exists, if not, create it
#     if 'Id' not in df_test.columns:
#         df_test['Id'] = df_test.index + 1  # Assuming Id starts from 1
#     # Save predictions to a CSV file
#     predictions_path = os.path.join(model_directory, f"{model_name}_predictions.csv")
#     df_test[['Id', 'Category']].to_csv(predictions_path, index=False)
#     print(f"Predictions saved to: {predictions_path}")


# Category name -> integer class id. The model builders in this notebook size their
# output layer with len(category_to_label). This repeats, key for key, the mapping
# already defined in the preprocessing cell.
category_to_label = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4
}

def train_save_model_and_predictions(model, model_name, output_directory='Output'):
    """Train ``model``, save it, and write its predictions for the test set.

    All artefacts go to ``<output_directory>/<model_name>/``:
    the best checkpoint by validation loss, the final model, and a CSV of the
    predicted class ids.

    The function reads the notebook-level ``x_train_split``, ``y_train_split``,
    ``x_val_split``, ``y_val_split`` and ``x_test_padded`` arrays, and it modifies
    the notebook-level ``df_test`` in place: the 'Category' column is overwritten
    with the predicted ids and an 'Id' column (row index + 1) is added if absent.

    Args:
        model: A compiled Keras model.
        model_name: Used for the sub-directory name and as file-name prefix.
        output_directory: Parent directory for the per-model folder.
    """
    model_directory = os.path.join(output_directory, model_name)
    os.makedirs(model_directory, exist_ok=True)

    # All three callbacks watch the validation loss
    stop_early = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    shrink_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
    keep_best = ModelCheckpoint(
        filepath=os.path.join(model_directory, f"{model_name}_best_model.keras"),
        monitor='val_loss',
        save_best_only=True,
    )

    model.fit(
        x_train_split,
        y_train_split,
        validation_data=(x_val_split, y_val_split),
        epochs=30,
        batch_size=32,
        callbacks=[stop_early, shrink_lr, keep_best],
        verbose=1,
    )

    # Persist the model as it stands after training has finished
    final_model_path = os.path.join(model_directory, f"{model_name}_final_model.keras")
    model.save(final_model_path)
    print(f"Final model saved to: {final_model_path}")

    # Class id = position of the largest predicted probability in each row
    class_probabilities = model.predict(x_test_padded, verbose=0)
    df_test['Category'] = np.argmax(class_probabilities, axis=1)

    # Fall back to a 1-based id derived from the frame's index when none exists
    if 'Id' not in df_test.columns:
        df_test['Id'] = df_test.index + 1

    predictions_path = os.path.join(model_directory, f"{model_name}_encoded_predictions.csv")
    df_test[['Id', 'Category']].to_csv(predictions_path, index=False)
    print(f"Encoded predictions saved to: {predictions_path}")


# Transformer

In [None]:
def build_transformer_model(vocab_size, embedding_dim, max_len, embedding_matrix):
    """Build and compile a single-block self-attention classifier.

    Layout: embedding (initialised from ``embedding_matrix``, trainable) ->
    8-head self-attention with residual add and layer norm -> two-layer feed-forward
    (widening to ``2 * embedding_dim``) with residual add and layer norm ->
    average pooling over the sequence -> dropout -> softmax over
    ``len(category_to_label)`` classes.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_len: Length of the integer input sequences.
        embedding_matrix: Initial weights for the embedding layer.

    Returns:
        The model, compiled with Adam (learning rate 1e-4) and sparse categorical
        cross-entropy, tracking accuracy.
    """
    head_count = 8
    class_count = len(category_to_label)

    token_ids = Input(shape=(max_len,))
    embedded = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    )(token_ids)

    # Self-attention sub-layer: the sequence is both query and value
    attended = MultiHeadAttention(
        num_heads=head_count, key_dim=embedding_dim // head_count
    )(embedded, embedded)
    attention_residual = Add()([attended, embedded])
    attention_block = LayerNormalization()(attention_residual)

    # Position-wise feed-forward sub-layer
    widened = Dense(embedding_dim * 2, activation='relu')(attention_block)
    narrowed = Dense(embedding_dim)(widened)
    feed_forward_residual = Add()([narrowed, attention_block])
    encoder_output = LayerNormalization()(feed_forward_residual)

    # Classification head
    sequence_summary = GlobalAveragePooling1D()(encoder_output)
    regularised = Dropout(0.3)(sequence_summary)
    class_probabilities = Dense(class_count, activation='softmax')(regularised)

    model = Model(inputs=token_ids, outputs=class_probabilities)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model


In [None]:
# Build the self-attention classifier, then train it and write its artefacts
transformer_model = build_transformer_model(
    vocab_size=num_words,
    embedding_dim=embedding_dim,
    max_len=max_len,
    embedding_matrix=embedding_matrix,
)
train_save_model_and_predictions(model=transformer_model, model_name="transformer_model")

Epoch 1/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 18ms/step - accuracy: 0.5049 - loss: 1.2722 - val_accuracy: 0.6688 - val_loss: 0.8776 - learning_rate: 1.0000e-04
Epoch 2/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.6616 - loss: 0.8777 - val_accuracy: 0.6850 - val_loss: 0.8239 - learning_rate: 1.0000e-04
Epoch 3/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6980 - loss: 0.8022 - val_accuracy: 0.6871 - val_loss: 0.8170 - learning_rate: 1.0000e-04
Epoch 4/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.7148 - loss: 0.7623 - val_accuracy: 0.6831 - val_loss: 0.8259 - learning_rate: 1.0000e-04
Epoch 5/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.7253 - loss: 0.7366 - val_accuracy: 0.6787 - val_loss: 0.8196 - learning_rate: 1.0000e-04
Epoch 6/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━

# model 2

In [None]:
# Model definitions
def build_enhanced_model(vocab_size, embedding_dim, max_len, embedding_matrix):
    """Build and compile a sequential Conv1D + bidirectional LSTM classifier.

    Layout: embedding (initialised from ``embedding_matrix``, trainable) -> spatial
    dropout -> Conv1D -> batch norm -> bidirectional LSTM returning the full
    sequence -> dropout -> max pooling over the sequence -> softmax over
    ``len(category_to_label)`` classes.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_len: Accepted for signature parity with the other builders; the body
            does not reference it.
        embedding_matrix: Initial weights for the embedding layer.

    Returns:
        The model, compiled with Adam (learning rate 1e-3) and sparse categorical
        cross-entropy, tracking accuracy.
    """
    layer_stack = [
        Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            trainable=True,
        ),
        SpatialDropout1D(0.2),
        Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
        BatchNormalization(),
        # return_sequences=True keeps one hidden state per time step for the pooling below
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.3),
        # Reduce the per-step states to a single vector per sample
        GlobalMaxPooling1D(),
        Dense(len(category_to_label), activation='softmax'),
    ]
    model = tf.keras.Sequential(layer_stack)
    model.compile(
        optimizer=Adam(learning_rate=1e-3),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build the Conv1D + BiLSTM classifier; training happens in the next cell
enhanced_model = build_enhanced_model(
    vocab_size=num_words,
    embedding_dim=embedding_dim,
    max_len=max_len,
    embedding_matrix=embedding_matrix,
)


In [None]:
# Train the Conv1D + BiLSTM classifier and write its artefacts
train_save_model_and_predictions(model=enhanced_model, model_name="enhanced_model")

Epoch 1/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - accuracy: 0.6023 - loss: 1.0187 - val_accuracy: 0.6882 - val_loss: 0.8283 - learning_rate: 0.0010
Epoch 2/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 19ms/step - accuracy: 0.7219 - loss: 0.7412 - val_accuracy: 0.7075 - val_loss: 0.7853 - learning_rate: 0.0010
Epoch 3/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 18ms/step - accuracy: 0.7800 - loss: 0.5965 - val_accuracy: 0.7075 - val_loss: 0.7693 - learning_rate: 0.0010
Epoch 4/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 18ms/step - accuracy: 0.8246 - loss: 0.4768 - val_accuracy: 0.7063 - val_loss: 0.7917 - learning_rate: 0.0010
Epoch 5/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - accuracy: 0.8632 - loss: 0.3750 - val_accuracy: 0.6920 - val_loss: 0.8426 - learning_rate: 0.0010
Epoch 6/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

# Residual CNN model

In [None]:
def build_residual_cnn_model(vocab_size, embedding_dim, max_len, embedding_matrix):
    """Build and compile a 1-D CNN classifier with two residual blocks.

    Each residual block is Conv1D -> batch norm -> ReLU -> Conv1D -> batch norm,
    added to a shortcut and passed through ReLU. The first shortcut is a
    kernel-size-1 convolution of the embeddings (128 filters, matching the
    branch); the second shortcut is the first block's output. The head is max
    pooling over the sequence -> dropout -> Dense(256, relu) -> softmax over
    ``len(category_to_label)`` classes.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_len: Length of the integer input sequences.
        embedding_matrix: Initial weights for the embedding layer.

    Returns:
        The model, compiled with Adam (learning rate 1e-4) and sparse categorical
        cross-entropy, tracking accuracy.
    """
    relu = tf.keras.activations.relu

    def conv_bn(tensor):
        # One 128-filter, width-3 convolution followed by batch normalisation
        convolved = Conv1D(128, 3, padding='same')(tensor)
        return BatchNormalization()(convolved)

    token_ids = Input(shape=(max_len,))
    embedded = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    )(token_ids)

    # Shortcut for the first block: pointwise convolution to 128 channels
    shortcut = Conv1D(128, 1, padding='same')(embedded)

    # Residual block 1
    branch = conv_bn(relu(conv_bn(embedded)))
    block_one = relu(Add()([shortcut, branch]))

    # Residual block 2 (identity shortcut)
    branch = conv_bn(relu(conv_bn(block_one)))
    block_two = relu(Add()([block_one, branch]))

    # Classification head
    sequence_summary = GlobalMaxPooling1D()(block_two)
    regularised = Dropout(0.3)(sequence_summary)
    hidden = Dense(256, activation='relu')(regularised)
    class_probabilities = Dense(len(category_to_label), activation='softmax')(hidden)

    model = Model(inputs=token_ids, outputs=class_probabilities)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model


In [None]:
# Build the residual CNN classifier, then train it and write its artefacts
residual_cnn_model = build_residual_cnn_model(
    vocab_size=num_words,
    embedding_dim=embedding_dim,
    max_len=max_len,
    embedding_matrix=embedding_matrix,
)
train_save_model_and_predictions(model=residual_cnn_model, model_name="residual_cnn_model")

Epoch 1/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 17ms/step - accuracy: 0.3380 - loss: 2.1167 - val_accuracy: 0.6165 - val_loss: 0.9749 - learning_rate: 1.0000e-04
Epoch 2/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.5751 - loss: 1.1126 - val_accuracy: 0.6533 - val_loss: 0.9005 - learning_rate: 1.0000e-04
Epoch 3/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6378 - loss: 0.9632 - val_accuracy: 0.6665 - val_loss: 0.8741 - learning_rate: 1.0000e-04
Epoch 4/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.6714 - loss: 0.8732 - val_accuracy: 0.6663 - val_loss: 0.8569 - learning_rate: 1.0000e-04
Epoch 5/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6921 - loss: 0.8105 - val_accuracy: 0.6817 - val_loss: 0.8283 - learning_rate: 1.0000e-04
Epoch 6/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━

# Bidirectional LSTM

In [None]:
def build_bilstm_attention_model(vocab_size, embedding_dim, max_len, embedding_matrix):
    """Build and compile a bidirectional-LSTM classifier with self-attention.

    Layout: embedding (initialised from ``embedding_matrix``, trainable) ->
    bidirectional LSTM with ``embedding_dim // 2`` units per direction, returning
    the full sequence -> 8-head self-attention over the LSTM states with residual
    add and layer norm -> average pooling over the sequence -> dropout ->
    Dense(256, relu) -> softmax over ``len(category_to_label)`` classes.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        max_len: Length of the integer input sequences.
        embedding_matrix: Initial weights for the embedding layer.

    Returns:
        The model, compiled with Adam (learning rate 1e-4) and sparse categorical
        cross-entropy, tracking accuracy.
    """
    head_count = 8

    token_ids = Input(shape=(max_len,))
    embedded = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=True,
    )(token_ids)

    # Recurrent encoder: one state per time step
    recurrent_states = Bidirectional(
        LSTM(embedding_dim // 2, return_sequences=True)
    )(embedded)

    # Self-attention over the recurrent states, with residual connection and norm
    attended = MultiHeadAttention(
        num_heads=head_count, key_dim=embedding_dim // head_count
    )(recurrent_states, recurrent_states)
    attention_residual = Add()([attended, recurrent_states])
    encoder_output = LayerNormalization()(attention_residual)

    # Classification head
    sequence_summary = GlobalAveragePooling1D()(encoder_output)
    regularised = Dropout(0.3)(sequence_summary)
    hidden = Dense(256, activation='relu')(regularised)
    class_probabilities = Dense(len(category_to_label), activation='softmax')(hidden)

    model = Model(inputs=token_ids, outputs=class_probabilities)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
# Build the BiLSTM + attention classifier, then train it and write its artefacts
bilstm_attention_model = build_bilstm_attention_model(
    vocab_size=num_words,
    embedding_dim=embedding_dim,
    max_len=max_len,
    embedding_matrix=embedding_matrix,
)
train_save_model_and_predictions(
    model=bilstm_attention_model,
    model_name="bilstm_attention_model",
)

Epoch 1/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 23ms/step - accuracy: 0.5248 - loss: 1.1701 - val_accuracy: 0.6575 - val_loss: 0.8908 - learning_rate: 1.0000e-04
Epoch 2/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.6827 - loss: 0.8355 - val_accuracy: 0.6789 - val_loss: 0.8476 - learning_rate: 1.0000e-04
Epoch 3/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.7085 - loss: 0.7785 - val_accuracy: 0.6979 - val_loss: 0.7940 - learning_rate: 1.0000e-04
Epoch 4/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.7231 - loss: 0.7329 - val_accuracy: 0.7056 - val_loss: 0.7881 - learning_rate: 1.0000e-04
Epoch 5/30
[1m595/595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 22ms/step - accuracy: 0.7420 - loss: 0.6992 - val_accuracy: 0.6941 - val_loss: 0.8140 - learning_rate: 1.0000e-04
Epoch 6/30
[1m595/595[0m [32m━━━━━━━━

# Ensemble model

In [None]:
# # Ensemble prediction function
# def ensemble_predict(models, x_test):
#     predictions = np.zeros((len(x_test), len(category_to_label)))
#     for model in models:
#         pred = model.predict(x_test, verbose=0)
#         predictions += pred

#     predictions /= len(models)
#     return np.argmax(predictions, axis=1)

In [None]:
# Ensemble predict function: takes predictions from all models and averages them
def ensemble_predict(models, x_test):
    """Average the class probabilities of several models and pick the best class.

    Every model's ``predict(x_test, verbose=0)`` output is summed in float64, the
    sum is divided by the number of models, and the index of the largest averaged
    value in each row is returned. The number of classes is taken from the model
    outputs themselves.

    Args:
        models: Iterable of objects exposing ``predict(x, verbose=0)`` that return
            a 2-D array of per-class scores.
        x_test: Input batch forwarded unchanged to each model.

    Returns:
        1-D integer array with one predicted class index per input row.

    Raises:
        ValueError: If ``models`` is empty (the average would be undefined and every
            row would silently come out as class 0), or if two models return
            outputs of different shapes.
    """
    models = list(models)
    if not models:
        raise ValueError("ensemble_predict needs at least one model")

    summed = None
    for model in models:
        # float64 accumulation regardless of the dtype the model returns
        model_predictions = np.asarray(model.predict(x_test, verbose=0), dtype=np.float64)
        if summed is None:
            summed = model_predictions.copy()
        elif model_predictions.shape != summed.shape:
            raise ValueError(
                f"Model output shape {model_predictions.shape} does not match "
                f"{summed.shape} returned by the first model"
            )
        else:
            summed += model_predictions

    averaged = summed / len(models)
    return np.argmax(averaged, axis=1)

In [None]:
# Ensemble models and save predictions function
def ensemble_models_and_save_predictions(models, x_test, output_directory='Ensemble_Output'):
    """Run the averaging ensemble on ``x_test`` and store the result as a CSV.

    Delegates the prediction to ``ensemble_predict`` and the file writing to
    ``save_predictions_to_csv``, then prints where the predictions were saved.

    Args:
        models: Models handed to ``ensemble_predict``.
        x_test: Input batch handed to ``ensemble_predict``.
        output_directory: Directory that receives the predictions file; created
            if it does not exist.
    """
    os.makedirs(output_directory, exist_ok=True)

    predicted_classes = ensemble_predict(models, x_test)
    save_predictions_to_csv(predicted_classes, output_directory)

    print(f"Ensemble predictions saved to: {output_directory}/predictions.csv")

In [None]:
# Trained models that take part in the ensemble
models = [
    transformer_model,
    enhanced_model,
    residual_cnn_model,
    bilstm_attention_model,
]

# Average the models' predictions on the padded test set and write the CSV
ensemble_models_and_save_predictions(
    models=models,
    x_test=x_test_padded,
    output_directory='Ensemble_Output',
)

Predictions saved to: Ensemble_Output/predictions.csv
Ensemble predictions saved to: Ensemble_Output/predictions.csv


# Test script

In [None]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import os
from tqdm import tqdm

# Shared resources consumed by preprocess_text: a lemmatizer instance and the
# English stop-word list held as a set for fast membership tests
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Read the test CSV from the working directory
df_test = pd.read_csv("test.csv")

# Preprocessing function for text
def preprocess_text(text):
    """Normalise one discussion string before tokenisation.

    Applied in order: lowercase, remove URLs, remove ``<...>`` tags, remove every
    character that is neither a word character nor whitespace, collapse whitespace
    runs to single spaces and trim, then drop stop words and lemmatize what is left.

    Uses the notebook-level ``stop_words`` set and ``lemmatizer`` instance.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    cleaned = re.sub(r'http\S+|www\S+|https\S+', '', text.lower(), flags=re.MULTILINE)
    # Remaining clean-up passes as (pattern, replacement) pairs, applied in sequence
    for pattern, replacement in ((r'<.*?>', ''), (r'[^\w\s]', ''), (r'\s+', ' ')):
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in cleaned.strip().split()
        if token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Drop test rows that have no discussion text, then clean the remaining text
df_test = df_test.dropna(subset=['Discussion'])
df_test['Discussion'] = df_test['Discussion'].apply(preprocess_text)

# Restore the tokenizer that the training cells wrote to tokenizer.pkl.
# NOTE(review): unpickling can execute arbitrary code - only load a file you produced.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Encode the test text; padding/truncation settings mirror the training cells
max_len = 100
x_test_sequences = tokenizer.texts_to_sequences(df_test['Discussion'])
x_test_padded = pad_sequences(
    x_test_sequences,
    maxlen=max_len,
    padding='post',
    truncating='post',
)

# Final models written by train_save_model_and_predictions (native .keras files)
model_paths = [
    'Output/transformer_model/transformer_model_final_model.keras',
    'Output/enhanced_model/enhanced_model_final_model.keras',
    'Output/residual_cnn_model/residual_cnn_model_final_model.keras',
    'Output/bilstm_attention_model/bilstm_attention_model_final_model.keras'
]

# Load every saved model, keeping the order of model_paths
models = [tf.keras.models.load_model(saved_path) for saved_path in model_paths]

# Function to predict using all models
def predict_on_test_data(models, x_test_padded):
    """Predict class indices by averaging the outputs of all ``models``.

    Args:
        models: Objects exposing ``predict(x, verbose=0)`` that return a 2-D array
            of per-class scores.
        x_test_padded: Input batch forwarded unchanged to each model.

    Returns:
        1-D array holding, for each row, the index of the largest averaged score.
    """
    per_model_scores = [
        member.predict(x_test_padded, verbose=0) for member in models
    ]
    # Element-wise mean across the models (axis 0 is the model axis)
    mean_scores = np.mean(per_model_scores, axis=0)
    return np.argmax(mean_scores, axis=1)

# Ensemble prediction for every remaining test row
predictions = predict_on_test_data(models, x_test_padded)

# Attach the predicted class ids to the test frame
df_test['Category'] = predictions

# Fall back to a 1-based id derived from the frame's index when none exists
if 'Id' not in df_test.columns:
    df_test['Id'] = df_test.index + 1

# Write the two result columns to tested_Output/predictions.csv
output_directory = 'tested_Output'
os.makedirs(output_directory, exist_ok=True)
predictions_path = os.path.join(output_directory, 'predictions.csv')
submission = df_test[['Id', 'Category']]
submission.to_csv(predictions_path, index=False)

print(f"Predictions saved to: {predictions_path}")


Predictions saved to: tested_Output/predictions.csv
