In [1]:
import pandas as pd
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.callbacks import EarlyStopping
from transformers import AutoTokenizer
import nltk

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('omw-1.4')



# Load dataset
data = pd.read_csv('/kaggle/input/dataset/train.csv')

# Map the category column
category_mapping = {'Politics': 0, 'Sports': 1, 'Media': 2, 'Market & Economy': 3, 'STEM': 4}
data['Category'] = data['Category'].replace(category_mapping)

# Drop nulls
data.dropna(subset=['Discussion'], inplace=True)


data.drop(columns=['SampleID'], inplace=True)

# Convert text to lowercase
data['Discussion'] = data['Discussion'].str.lower()

# Remove duplicates
data = data.drop_duplicates(subset=['Discussion', 'Category'])

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


In [2]:
# Load the stop words
stop_words = set(stopwords.words('english'))
def contains_stopwords(text):
    words = text.split()
    return any(word.lower() in stop_words for word in words)

# Filter the dataset to retain only rows with stop words
data = data[data['Discussion'].apply(contains_stopwords)].reset_index(drop=True)

In [3]:
# Remove URLs and special characters
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
data['Discussion'] = data['Discussion'].replace({r'\\n': ' '}, regex=True)
data['Discussion'] = data['Discussion'].apply(lambda x: re.sub(url_pattern, '', x))
data['Discussion'] = data['Discussion'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
data['Discussion'] = data['Discussion'].apply(lambda x: x.strip())  # Remove leading/trailing spaces
data['Discussion'] = data['Discussion'].apply(lambda x: re.sub(r'\s+', ' ', x))  # Replace multiple spaces with a single space

# if any became empyt after cleaning we drop it 
data.dropna(subset=['Discussion'], inplace=True)

In [325]:

# # Remove stop words
# stop_words = set(stopwords.words("english"))
# data['Discussion'] = data['Discussion'].apply(lambda x: ' '.join(
#     [word for word in word_tokenize(x) if word not in stop_words]
# ))


In [4]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data['Discussion'], data['Category'],
    test_size=0.2, random_state=42, stratify=data['Category'], shuffle=True
)

In [5]:
# Tokenization using Hugging Face Transformers (BERT Tokenizer)
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

max_length = 128  #

# Tokenize the text
def tokenize_texts(texts, tokenizer, max_length=max_length):
    return tokenizer(
        list(texts),
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )

# Tokenize training and testing data
X_train_tokenized = tokenize_texts(X_train, tokenizer, max_length=max_length)
X_test_tokenized = tokenize_texts(X_test, tokenizer, max_length=max_length)

# Extract token IDs for embedding input
X_train_ids = X_train_tokenized['input_ids']
X_test_ids = X_test_tokenized['input_ids']

# Print shape of the input data for validation
print(f"X_train_ids shape: {X_train_ids.shape}, y_train shape: {y_train.shape}")
print(f"X_test_ids shape: {X_test_ids.shape}, y_test shape: {y_test.shape}")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



X_train_ids shape: (18292, 128), y_train shape: (18292,)
X_test_ids shape: (4573, 128), y_test shape: (4573,)


In [6]:
# take y_train and y_test and retrun class-specific y_train and y_test (new)
def make_it_class_specific(y_train, y_test, choosen):
    y_train = y_train.apply(lambda x: 1 if x == choosen else 0)
    y_test = y_test.apply(lambda x: 1 if x == choosen else 0)
    return y_train, y_test
sy_test = [y_test.copy()] * 5
sy_train = [y_train.copy()] * 5
y_output_test = [y_test.copy()] * 6
y_output_train = [y_train.copy()] * 6
test_acc = [0] * 5

In [9]:
import numpy as np
# Paths and configurations
embedding_dim = 200  # 100D GloVe embeddings
embedding_index = {}
glove_file_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.200d.txt'  # Path to your GloVe file

# Load GloVe embeddings into memory
with open(glove_file_path, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

tokenizer_vocab = tokenizer.get_vocab()

max_vocab_size=1000000
# Prepare the embedding matrix
embedding_matrix = np.zeros((min(max_vocab_size, len(tokenizer_vocab)), embedding_dim))
for word, i in tokenizer_vocab.items():
    if i < max_vocab_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
# imports

from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Bidirectional
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, Bidirectional, GRU, Attention, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.initializers import Constant

In [331]:
class_num = 0
sy_train[class_num], sy_test[class_num] = make_it_class_specific(y_train, y_test, class_num)

In [332]:


# Input layer
input_layer = Input(shape=(max_length,))

# Embedding layer with pre-trained GloVe embeddings
embedding_layer = Embedding(input_dim=min(max_vocab_size, len(tokenizer_vocab)), 
                            output_dim=embedding_dim, 
                            input_length=max_length,
                            weights=[embedding_matrix],  # Pre-trained weights
                            trainable=True)(input_layer)  # Freeze the embeddings
embedding_layer = SpatialDropout1D(0.3)(embedding_layer)

# CNN Layers
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_layer)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)

# Bidirectional GRU Layers with more units
gru_layer = Bidirectional(GRU(128, return_sequences=True))(cnn_layer)
gru_layer = Bidirectional(GRU(64, return_sequences=True))(gru_layer)
gru_layer = GRU(32, return_sequences=False)(gru_layer)

# Fully connected layers with larger sizes
dense_layer = Dense(128, activation='relu')(gru_layer)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=6e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

# Learning rate schedule callback
def lr_schedule(epoch, lr):
    if epoch < 5:
        return lr * 1.1  # Gradual increase for first few epochs
    return lr * 0.9  # Gradual decrease after epoch 5
lr_callback = LearningRateScheduler(lr_schedule)

# Train the model
history = model.fit(
    X_train_ids, sy_train[class_num],
    validation_data=(X_test_ids, sy_test[class_num]),
    epochs=70,
    batch_size=64,
    callbacks=[lr_callback, early_stopping],
    verbose=1
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_ids, sy_test[class_num], verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Generate predictions for the test set and training set
y_output_test[class_num] = model.predict(X_test_ids, verbose=1)
y_output_train[class_num] = model.predict(X_train_ids, verbose=1)
print(y_output_train[class_num])
print(y_output_test[class_num])
print("#####")
print(len(y_test))
print(len(y_output_test[class_num]))




Epoch 1/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 25ms/step - accuracy: 0.8254 - loss: 0.4870 - val_accuracy: 0.8876 - val_loss: 0.2807 - learning_rate: 6.6000e-04
Epoch 2/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.8942 - loss: 0.2679 - val_accuracy: 0.9007 - val_loss: 0.2462 - learning_rate: 7.2600e-04
Epoch 3/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.9215 - loss: 0.2116 - val_accuracy: 0.9071 - val_loss: 0.2418 - learning_rate: 7.9860e-04
Epoch 4/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9399 - loss: 0.1541 - val_accuracy: 0.9031 - val_loss: 0.2634 - learning_rate: 8.7846e-04
Epoch 5/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.9572 - loss: 0.1188 - val_accuracy: 0.8924 - val_loss: 0.3092 - learning_rate: 9.6631e-04
Epoch 6/70
[1m286/286[0m [32m━━━━━━━━━━━━

In [333]:
model.save("my_model0.h5")

In [334]:
class_num = 1
sy_train[class_num], sy_test[class_num] = make_it_class_specific(y_train, y_test, class_num)

In [335]:
from tensorflow.keras.layers import Bidirectional

# Input layer
input_layer = Input(shape=(max_length,))

# Embedding layer with pre-trained GloVe embeddings
embedding_layer = Embedding(input_dim=min(max_vocab_size, len(tokenizer_vocab)), 
                            output_dim=embedding_dim, 
                            input_length=max_length,
                            weights=[embedding_matrix],  # Pre-trained weights
                            trainable=True)(input_layer)  # Freeze the embeddings
embedding_layer = SpatialDropout1D(0.3)(embedding_layer)

# CNN Layers
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_layer)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)

# Bidirectional GRU Layers with more units
gru_layer = Bidirectional(GRU(128, return_sequences=True))(cnn_layer)
gru_layer = Bidirectional(GRU(64, return_sequences=True))(gru_layer)
gru_layer = GRU(32, return_sequences=False)(gru_layer)

# Fully connected layers with larger sizes
dense_layer = Dense(128, activation='relu')(gru_layer)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=6e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

# Learning rate schedule callback
def lr_schedule(epoch, lr):
    if epoch < 5:
        return lr * 1.1  # Gradual increase for first few epochs
    return lr * 0.9  # Gradual decrease after epoch 5
lr_callback = LearningRateScheduler(lr_schedule)

# Train the model
history = model.fit(
    X_train_ids, sy_train[class_num],
    validation_data=(X_test_ids, sy_test[class_num]),
    epochs=70,
    batch_size=64,
    callbacks=[lr_callback, early_stopping],
    verbose=1
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_ids, sy_test[class_num], verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Generate predictions for the test set and training set
y_output_test[class_num] = model.predict(X_test_ids, verbose=1)
y_output_train[class_num] = model.predict(X_train_ids, verbose=1)
print(y_output_train[class_num])
print(y_output_test[class_num])
print("#####")
print(len(y_test))
print(len(y_output_test[class_num]))


Epoch 1/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 24ms/step - accuracy: 0.7815 - loss: 0.5415 - val_accuracy: 0.8909 - val_loss: 0.3185 - learning_rate: 6.6000e-04
Epoch 2/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.9038 - loss: 0.2771 - val_accuracy: 0.9173 - val_loss: 0.2324 - learning_rate: 7.2600e-04
Epoch 3/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9290 - loss: 0.1985 - val_accuracy: 0.9219 - val_loss: 0.2113 - learning_rate: 7.9860e-04
Epoch 4/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.9470 - loss: 0.1504 - val_accuracy: 0.9189 - val_loss: 0.2262 - learning_rate: 8.7846e-04
Epoch 5/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.9563 - loss: 0.1212 - val_accuracy: 0.9033 - val_loss: 0.2574 - learning_rate: 9.6631e-04
Epoch 6/70
[1m286/286[0m [32m━━━━━━━━━━━━

In [336]:
model.save("my_model1.h5")

In [337]:
class_num = 2
sy_train[class_num], sy_test[class_num] = make_it_class_specific(y_train, y_test, class_num)

In [338]:
from tensorflow.keras.layers import Bidirectional

# Input layer
input_layer = Input(shape=(max_length,))

# Embedding layer with pre-trained GloVe embeddings
embedding_layer = Embedding(input_dim=min(max_vocab_size, len(tokenizer_vocab)), 
                            output_dim=embedding_dim, 
                            input_length=max_length,
                            weights=[embedding_matrix],  # Pre-trained weights
                            trainable=True)(input_layer)  # Freeze the embeddings
embedding_layer = SpatialDropout1D(0.3)(embedding_layer)

# CNN Layers
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_layer)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)

# Bidirectional GRU Layers with more units
gru_layer = Bidirectional(GRU(128, return_sequences=True))(cnn_layer)
gru_layer = Bidirectional(GRU(64, return_sequences=True))(gru_layer)
gru_layer = GRU(32, return_sequences=False)(gru_layer)

# Fully connected layers with larger sizes
dense_layer = Dense(128, activation='relu')(gru_layer)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=6e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

# Learning rate schedule callback
def lr_schedule(epoch, lr):
    if epoch < 5:
        return lr * 1.1  # Gradual increase for first few epochs
    return lr * 0.9  # Gradual decrease after epoch 5
lr_callback = LearningRateScheduler(lr_schedule)

# Train the model
history = model.fit(
    X_train_ids, sy_train[class_num],
    validation_data=(X_test_ids, sy_test[class_num]),
    epochs=70,
    batch_size=64,
    callbacks=[lr_callback, early_stopping],
    verbose=1
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_ids, sy_test[class_num], verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Generate predictions for the test set and training set
y_output_test[class_num] = model.predict(X_test_ids, verbose=1)
y_output_train[class_num] = model.predict(X_train_ids, verbose=1)
print(y_output_train[class_num])
print(y_output_test[class_num])
print("#####")
print(len(y_test))
print(len(y_output_test[class_num]))


Epoch 1/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.8235 - loss: 0.4862 - val_accuracy: 0.8590 - val_loss: 0.3152 - learning_rate: 6.6000e-04
Epoch 2/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.8677 - loss: 0.3101 - val_accuracy: 0.8898 - val_loss: 0.2625 - learning_rate: 7.2600e-04
Epoch 3/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.9024 - loss: 0.2366 - val_accuracy: 0.8891 - val_loss: 0.2567 - learning_rate: 7.9860e-04
Epoch 4/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.9183 - loss: 0.2014 - val_accuracy: 0.8865 - val_loss: 0.2677 - learning_rate: 8.7846e-04
Epoch 5/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9355 - loss: 0.1573 - val_accuracy: 0.8810 - val_loss: 0.3021 - learning_rate: 9.6631e-04
Epoch 6/70
[1m286/286[0m [32m━━━━━━━━━━━━

In [339]:
model.save("my_model2.h5")

In [340]:
class_num = 3
sy_train[class_num], sy_test[class_num] = make_it_class_specific(y_train, y_test, class_num)

In [341]:
from tensorflow.keras.layers import Bidirectional

# Input layer
input_layer = Input(shape=(max_length,))

# Embedding layer with pre-trained GloVe embeddings
embedding_layer = Embedding(input_dim=min(max_vocab_size, len(tokenizer_vocab)), 
                            output_dim=embedding_dim, 
                            input_length=max_length,
                            weights=[embedding_matrix],  # Pre-trained weights
                            trainable=True)(input_layer)  # Freeze the embeddings
embedding_layer = SpatialDropout1D(0.3)(embedding_layer)

# CNN Layers
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_layer)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)

# Bidirectional GRU Layers with more units
gru_layer = Bidirectional(GRU(128, return_sequences=True))(cnn_layer)
gru_layer = Bidirectional(GRU(64, return_sequences=True))(gru_layer)
gru_layer = GRU(32, return_sequences=False)(gru_layer)

# Fully connected layers with larger sizes
dense_layer = Dense(128, activation='relu')(gru_layer)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=6e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

# Learning rate schedule callback
def lr_schedule(epoch, lr):
    if epoch < 5:
        return lr * 1.1  # Gradual increase for first few epochs
    return lr * 0.9  # Gradual decrease after epoch 5
lr_callback = LearningRateScheduler(lr_schedule)

# Train the model
history = model.fit(
    X_train_ids, sy_train[class_num],
    validation_data=(X_test_ids, sy_test[class_num]),
    epochs=70,
    batch_size=64,
    callbacks=[lr_callback, early_stopping],
    verbose=1
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_ids, sy_test[class_num], verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Generate predictions for the test set and training set
y_output_test[class_num] = model.predict(X_test_ids, verbose=1)
y_output_train[class_num] = model.predict(X_train_ids, verbose=1)
print(y_output_train[class_num])
print(y_output_test[class_num])
print("#####")
print(len(y_test))
print(len(y_output_test[class_num]))


Epoch 1/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.7775 - loss: 0.5569 - val_accuracy: 0.7794 - val_loss: 0.5257 - learning_rate: 6.6000e-04
Epoch 2/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.7836 - loss: 0.5185 - val_accuracy: 0.8183 - val_loss: 0.4365 - learning_rate: 7.2600e-04
Epoch 3/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.8290 - loss: 0.4119 - val_accuracy: 0.8463 - val_loss: 0.3718 - learning_rate: 7.9860e-04
Epoch 4/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.8634 - loss: 0.3325 - val_accuracy: 0.8504 - val_loss: 0.3793 - learning_rate: 8.7846e-04
Epoch 5/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.8897 - loss: 0.2717 - val_accuracy: 0.8450 - val_loss: 0.3755 - learning_rate: 9.6631e-04
Epoch 6/70
[1m286/286[0m [32m━━━━━━━━━━━━

In [342]:
model.save("my_model3.h5")

In [343]:
class_num = 4
sy_train[class_num], sy_test[class_num] = make_it_class_specific(y_train, y_test, class_num)

In [344]:
from tensorflow.keras.layers import Bidirectional
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, Bidirectional, GRU, Attention, GlobalAveragePooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.initializers import Constant
# Input layer
input_layer = Input(shape=(max_length,))

# Embedding layer with pre-trained GloVe embeddings
embedding_layer = Embedding(input_dim=min(max_vocab_size, len(tokenizer_vocab)), 
                            output_dim=embedding_dim, 
                            input_length=max_length,
                            weights=[embedding_matrix],  # Pre-trained weights
                            trainable=True)(input_layer)  # Freeze the embeddings
embedding_layer = SpatialDropout1D(0.3)(embedding_layer)

# CNN Layers
cnn_layer = Conv1D(filters=64, kernel_size=3, activation='relu')(embedding_layer)
cnn_layer = MaxPooling1D(pool_size=2)(cnn_layer)

# Bidirectional GRU Layers with more units
gru_layer = Bidirectional(GRU(128, return_sequences=True))(cnn_layer)
gru_layer = Bidirectional(GRU(64, return_sequences=True))(gru_layer)
gru_layer = GRU(32, return_sequences=False)(gru_layer)

# Fully connected layers with larger sizes
dense_layer = Dense(128, activation='relu')(gru_layer)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

# Compile the model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=6e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Define EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

# Learning rate schedule callback
def lr_schedule(epoch, lr):
    if epoch < 5:
        return lr * 1.1  # Gradual increase for first few epochs
    return lr * 0.9  # Gradual decrease after epoch 5
lr_callback = LearningRateScheduler(lr_schedule)

# Train the model
history = model.fit(
    X_train_ids, sy_train[class_num],
    validation_data=(X_test_ids, sy_test[class_num]),
    epochs=70,
    batch_size=64,
    callbacks=[lr_callback, early_stopping],
    verbose=1
)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_ids, sy_test[class_num], verbose=1)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

# Generate predictions for the test set and training set
y_output_test[class_num] = model.predict(X_test_ids, verbose=1)
y_output_train[class_num] = model.predict(X_train_ids, verbose=1)
print(y_output_train[class_num])
print(y_output_test[class_num])
print("#####")
print(len(y_test))
print(len(y_output_test[class_num]))


Epoch 1/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.7791 - loss: 0.5472 - val_accuracy: 0.8859 - val_loss: 0.3108 - learning_rate: 6.6000e-04
Epoch 2/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 23ms/step - accuracy: 0.8778 - loss: 0.3210 - val_accuracy: 0.9025 - val_loss: 0.2576 - learning_rate: 7.2600e-04
Epoch 3/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9099 - loss: 0.2470 - val_accuracy: 0.9167 - val_loss: 0.2496 - learning_rate: 7.9860e-04
Epoch 4/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9285 - loss: 0.1997 - val_accuracy: 0.9167 - val_loss: 0.2236 - learning_rate: 8.7846e-04
Epoch 5/70
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.9407 - loss: 0.1669 - val_accuracy: 0.8977 - val_loss: 0.2679 - learning_rate: 9.6631e-04
Epoch 6/70
[1m286/286[0m [32m━━━━━━━━━━━━

In [345]:
model.save("my_model4.h5")

In [346]:
model.save("my_model4.h5")
yyt = y_test.copy()
yy = y_train.copy()

In [347]:
print("#####")
print(len(y_test))
print(len(y_output_test[0]))
print(len(y_output_test[1]))
print(len(y_output_test[2]))
print(len(y_output_test[3]))
print(len(y_output_test[4]))
print(len(y_output_test[5]))
# print(len(y_output_test[6]))

#####
4573
4573
4573
4573
4573
4573
4573


In [348]:
ttt = [] * len(data['Discussion'])
print(len(ttt))
tttt = [0] * len(y_test)


0


In [349]:
print("#####")
print(len(y_test))
print(len(y_output_test[-1]))
for i in range(len(y_output_test[0])):
  mx, mxi = -1, 0
  for c in range(5):
    if mx < y_output_test[c][i]:
      mx = y_output_test[c][i]
      mxi = c
  y_output_test[-1][i] = mxi
  tttt[i] = mxi 

for i in range(len(y_output_train[0])):
  mx, mxi = -1, 0
  for c in range(5):
    if mx < y_output_train[c][i]:
      mx = y_output_train[c][i]
      mxi = c
  y_output_train[-1][i] = mxi
print("#####")
print(len(y_test))
print(len(y_output_test[-1]))
print(len(tttt))

#categorical_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

# # Calculate accuracy for the test set
# categorical_accuracy.update_state(y_test, y_output_test[-1])
# test_accuracy = categorical_accuracy.result().numpy()
# print(f"Test Accuracy: {test_accuracy}")

# # Reset states for a new calculation
# # categorical_accuracy.reset_states()

# # Calculate accuracy for the training set
# categorical_accuracy.update_state(y_train, y_output_train[-1])
# train_accuracy = categorical_accuracy.result().numpy()
# print(f"Train Accuracy: {train_accuracy}")

# y_test.head(10)
# y_output_test[-1].head(10)

print(y_test)
print(y_output_test[-1])

acc_test = (y_test == tttt).mean()
print(acc_test)

print("#####")
print(f'len(y_test) = {len(y_test)}')
print(f'len(tttt) = {len(tttt)}')


# acc_train = (y_train == tttt).mean()
# print(acc_train)


##%%
correct_predictions_test = 0

# Loop through each element in y_test and y_output_test[-1]
for true, predicted in zip(y_test, tttt):
    if true == predicted:
        correct_predictions_test += 1

# Calculate accuracy by dividing the correct predictions by the total length
acc_test = correct_predictions_test / len(tttt)
print(f'acc_test =                                  {acc_test}')

# Print separator
print("#####")
#$%%%
print("#####")
print(len(y_test))


# print(y_output_test[-1])
# print(y_test)

# print("#####")
# print(len(y_test))


#####
4573
4573
#####
4573
8261
4573
16103    4
5079     4
10026    4
10202    1
3287     0
        ..
4435     3
525      4
4282     1
10130    2
11168    1
Name: Category, Length: 4573, dtype: int64
16103    4
5079     4
10026    4
10202    1
3287     3
        ..
4568     4
4569     4
4570     3
4571     2
4572     1
Name: Category, Length: 8261, dtype: int64
0.7299365842991472
#####
len(y_test) = 4573
len(tttt) = 4573
acc_test =                                  0.7299365842991472
#####
#####
4573


In [350]:
# from tensorflow.keras.models import load_model
# model = load_model('my_model.h5')


# history_continued = model.fit(
#     X_train_ids, y_train,
#     validation_data=(X_test_ids, y_test),
#     epochs=5,  # Number of additional epochs
#     batch_size=128,  # Keep the batch size the same
#     callbacks=[early_stopping],  # Reuse the callbacks
#     class_weight=class_weights,  # Reuse class weights
#     verbose=1
# )


In [351]:
from tensorflow.keras.models import load_model

test_data = pd.read_csv('/kaggle/input/dataset/test.csv')

# Convert text to lowercase
test_data['Discussion'] = test_data['Discussion'].str.lower()
# Remove URLs and special characters
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
test_data['Discussion'] = test_data['Discussion'].replace({r'\\n': ' '}, regex=True)
test_data['Discussion'] = test_data['Discussion'].apply(lambda x: re.sub(url_pattern, '', x))
test_data['Discussion'] = test_data['Discussion'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
test_data['Discussion'] = test_data['Discussion'].apply(lambda x: x.strip())  # Remove leading/trailing spaces
test_data['Discussion'] = test_data['Discussion'].apply(lambda x: re.sub(r'\s+', ' ', x))  # Replace multiple spaces with a single space

# # Remove stop words
# test_data['Discussion'] = test_data['Discussion'].apply(lambda x: ' '.join(
#     [word for word in word_tokenize(x) if word not in stop_words]
# ))

# Lemmatize text
# lemmatizer = WordNetLemmatizer()
# test_data['Discussion'] = test_data['Discussion'].apply(lambda x: ' '.join(
#     [lemmatizer.lemmatize(word) for word in word_tokenize(x)]
# ))


In [352]:
y_output_test = [test_data.copy()] * 6
# Load the trained model
for i in range(5):
  model = load_model(f'my_model{i}.h5')
  # Tokenize training and testing data
  Xt = tokenize_texts(test_data['Discussion'], tokenizer, max_length=max_length)
  # Extract token IDs for embedding input
  Xti = Xt['input_ids']
  y_output_test[i] = model.predict(Xti, verbose=1)
ttttt = [0] * len( y_output_test[0])
print(f'len(ttttt)= {len(ttttt)}')

for i in range(len(y_output_test[0])):
  mx, mxi = -1, 0
  for c in range(5):
    if mx < y_output_test[c][i]:
      mx = y_output_test[c][i]
      mxi = c
  ttttt[i] = mxi



[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
len(ttttt)= 10557


In [353]:
print(f'len(ttttt)= {len(ttttt)}')

len(ttttt)= 10557


In [354]:

test_data['Category'] = ttttt

test_data['Category'].head(20)

# Save results to a new CSV file
output_file = 'predicted_categories.csv'
test_data[['SampleID', 'Category']].to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")

Predictions saved to predicted_categories.csv


In [355]:
    import zipfile
    import os
    
    # Define the directory and files to compress
    directory = ''
    files_to_compress = [
        'my_model0.h5',
        'my_model1.h5',
        'my_model2.h5',
        'my_model3.h5',
        'my_model4.h5',
        'predicted_categories.csv'
    ]
    output_zip = os.path.join(directory, 'compressed_models_and_results.zip')
    
    # Create a ZIP file and add files to it
    with zipfile.ZipFile(output_zip, 'w') as zipf:
        for file in files_to_compress:
            file_path = os.path.join(directory, file)
            if os.path.exists(file_path):
                zipf.write(file_path, arcname=file)  # Use arcname to keep just the filename in the ZIP
            else:
                print(f"File not found: {file_path}")
    
    print(f"All files have been compressed into {output_zip}")


All files have been compressed into compressed_models_and_results.zip


In [356]:
# x.print(output_zip)
print(output_zip)


compressed_models_and_results.zip
