In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from tensorflow.keras.models import load_model
import pickle
import keras_tuner as kt
import time
import psutil

In [None]:
# Read the train and test splits from CSV. Passing header=0 together with
# names= makes pandas discard the file's own header row and use these
# column names instead.
csv_columns = ['class', 'title', 'description']
train_df, test_df = (
    pd.read_csv(csv_path, header=0, names=csv_columns)
    for csv_path in ('../../Data/train.csv', '../../Data/test.csv')
)

In [None]:
# Combine title and description into one 'text' column for better context.
# A missing title or description is replaced by an empty string first:
# without that, str + NaN yields NaN for the whole row, and the tokenizer
# further down would be handed a float instead of a string.
train_df['text'] = train_df['title'].fillna('') + " " + train_df['description'].fillna('')
test_df['text'] = test_df['title'].fillna('') + " " + test_df['description'].fillna('')


In [None]:
# Pull the raw strings out of each split as plain Python lists
train_texts, test_texts = (
    split['text'].tolist() for split in (train_df, test_df)
)

# Shift the class ids down by one so they are 0-based, which is what the
# sparse categorical cross-entropy loss used at compile time expects.
train_labels, test_labels = (
    split['class'].to_numpy() - 1 for split in (train_df, test_df)
)


In [None]:
# Hyperparameters shared by the cells below
num_classes = 4  # AG News has 4 classes
max_length = 200  # Every input sequence is padded/truncated to this many token ids
vocab_size = 70338  # Tokenizer num_words cap; also the row count of each embedding matrix

# Widths of the four GloVe embedding variants that are compared
embedding_dim_50, embedding_dim_100, embedding_dim_200, embedding_dim_300 = 50, 100, 200, 300

In [None]:
# Fit the tokenizer on the training texts only; the test texts are encoded
# with that same vocabulary. Per the Keras Tokenizer documentation, num_words
# limits the encoded output to the most frequent words (ids below num_words).
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_texts)

# Turn every text into its list of integer word ids
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train_texts, test_texts)
)

In [None]:
# Bring every sequence to exactly max_length ids so the model receives a
# rectangular array (pad_sequences is used with its default pad/truncate modes).
X_train, X_test = (
    pad_sequences(id_lists, maxlen=max_length)
    for id_lists in (train_sequences, test_sequences)
)

In [None]:
# Materialise the 0-based label vectors as numpy arrays for training/evaluation
y_train, y_test = np.array(train_labels), np.array(test_labels)

In [None]:
# Load GloVe embeddings
def _load_glove_vectors(path):
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is taken as
    the word and the remaining fields as the vector components.

    Args:
        path: Location of the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        dict mapping each word to a 1-D ``float32`` numpy array.
    """
    vectors = {}
    with open(path, 'r', encoding='utf-8') as glove_file:
        for line in glove_file:
            fields = line.split()
            if not fields:
                # A blank line (e.g. at the end of the file) carries no word;
                # indexing fields[0] would raise IndexError.
                continue
            vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return vectors


# One lookup table per embedding width
embeddings_index_50 = _load_glove_vectors('../../Data/glove.6B/glove.6B.50d.txt')
embeddings_index_100 = _load_glove_vectors('../../Data/glove.6B/glove.6B.100d.txt')
embeddings_index_200 = _load_glove_vectors('../../Data/glove.6B/glove.6B.200d.txt')
embeddings_index_300 = _load_glove_vectors('../../Data/glove.6B/glove.6B.300d.txt')

In [None]:
# Create the embedding matrices
def _fill_embedding_matrix(glove_index, dim):
    """Build a (vocab_size, dim) weight matrix from a GloVe lookup table.

    Row ``i`` holds the GloVe vector of the word whose tokenizer id is ``i``.
    Words without a GloVe vector keep an all-zero row, and word ids at or
    above ``vocab_size`` are skipped entirely.

    Returns:
        (matrix, hits, misses, skipped): the matrix plus the number of words
        found in GloVe, not found in GloVe, and skipped for having an id
        outside the matrix.
    """
    matrix = np.zeros((vocab_size, dim))
    hits = misses = skipped = 0
    for token, row in tokenizer.word_index.items():
        if row >= vocab_size:
            skipped += 1
            continue
        vector = glove_index.get(token)
        if vector is None:
            misses += 1  # row is already zero-initialised
        else:
            matrix[row] = vector
            hits += 1
    return matrix, hits, misses, skipped


# The coverage counters are taken from the 50-D pass only
embedding_matrix_50, matched, notmatched, not_checked = _fill_embedding_matrix(
    embeddings_index_50, embedding_dim_50
)
embedding_matrix_100 = _fill_embedding_matrix(embeddings_index_100, embedding_dim_100)[0]
embedding_matrix_200 = _fill_embedding_matrix(embeddings_index_200, embedding_dim_200)[0]
embedding_matrix_300 = _fill_embedding_matrix(embeddings_index_300, embedding_dim_300)[0]

print("matched : ", matched)
print("notmatched : ", notmatched)
print("not checked : ", not_checked)

In [None]:
# Build the CNN models with GloVe embeddings
def _make_glove_cnn(embedding_dim, embedding_matrix):
    """Assemble one text-CNN classifier on top of a frozen GloVe embedding.

    Args:
        embedding_dim: Width of the embedding vectors.
        embedding_matrix: (vocab_size, embedding_dim) array used as the
            initial embedding weights; the layer is created non-trainable.

    Returns:
        An uncompiled ``Sequential`` model ending in a ``num_classes``-way
        softmax.
    """
    frozen_glove = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length,
                             weights=[embedding_matrix], trainable=False)
    return Sequential([
        frozen_glove,  # pre-trained GloVe vectors, not updated during training
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),  # one probability per class
    ])


# Same architecture for every variant; only the embedding width differs
model_50 = _make_glove_cnn(embedding_dim_50, embedding_matrix_50)
model_100 = _make_glove_cnn(embedding_dim_100, embedding_matrix_100)
model_200 = _make_glove_cnn(embedding_dim_200, embedding_matrix_200)
model_300 = _make_glove_cnn(embedding_dim_300, embedding_matrix_300)

In [None]:
# Build each model explicitly with a (batch, max_length) input so that its
# weights exist before summary() is called.
for glove_model in (model_50, model_100, model_200, model_300):
    glove_model.build(input_shape=(None, max_length))

In [None]:
# Print the layer-by-layer summary of every model, smallest embedding first
for glove_model in (model_50, model_100, model_200, model_300):
    glove_model.summary()

In [None]:
# Compile every model identically. The sparse variant of categorical
# cross-entropy is used because the labels are integer class ids, not
# one-hot vectors.
for glove_model in (model_50, model_100, model_200, model_300):
    glove_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the models
def _fit_and_profile(model, label):
    """Train ``model`` and report wall-clock time and memory growth.

    The model is fitted for 5 epochs with batch size 128. X_test/y_test are
    passed as validation data, so the recorded ``val_*`` metrics are
    test-set metrics.

    Args:
        model: Compiled Keras model to train on (X_train, y_train).
        label: Text shown in the printed banner, e.g. "50-D".

    Returns:
        (history, train_time, memory_used): the Keras History object, the
        elapsed seconds, and the change in this process's resident set size
        in MB measured across the fit call.
    """
    memory_before = process.memory_info().rss / (1024 * 1024)  # Convert to MB
    start_time = time.time()
    history = model.fit(X_train, y_train, epochs=5, batch_size=128, validation_data=(X_test, y_test))
    end_time = time.time()
    memory_after = process.memory_info().rss / (1024 * 1024)  # Convert to MB
    memory_used = memory_after - memory_before
    train_time = end_time - start_time

    print(f"\n#######GloVe {label}#######")
    # Label spelled "train time" for every model (it used to read "tran time"
    # for the 200-D and 300-D runs).
    print("train time : ", train_time, "Seconds")
    print("memory_used : ", memory_used, " MB")
    print("\n")
    return history, train_time, memory_used


# Handle on the current process, used to read its memory usage
process = psutil.Process()

history_50, train_time_50, memory_used_50 = _fit_and_profile(model_50, "50-D")
history_100, train_time_100, memory_used_100 = _fit_and_profile(model_100, "100-D")
history_200, train_time_200, memory_used_200 = _fit_and_profile(model_200, "200-D")
history_300, train_time_300, memory_used_300 = _fit_and_profile(model_300, "300-D")


In [None]:
# Plot training vs. validation accuracy per epoch, one figure per embedding width
accuracy_curves = (
    ("GloVe 50-D", history_50),
    ("GloVe 100-D", history_100),
    ("GloVe 200-D", history_200),
    ("GloVe 300-D", history_300),
)
curve_specs = (
    ("accuracy", "Training Accuracy"),
    ("val_accuracy", "Validation Accuracy"),
)

for figure_title, fit_history in accuracy_curves:
    plt.title(figure_title)
    for metric_key, legend_label in curve_specs:
        plt.plot(fit_history.history[metric_key], label=legend_label)
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

In [None]:
# Evaluate the models
def _evaluate_and_report(model):
    """Evaluate ``model`` on the test set, print its accuracy, return (loss, accuracy)."""
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    return test_loss, test_accuracy


# Results are printed in the order 50-D, 100-D, 200-D, 300-D
loss_50, accuracy_50 = _evaluate_and_report(model_50)

loss_100, accuracy_100 = _evaluate_and_report(model_100)

loss_200, accuracy_200 = _evaluate_and_report(model_200)

loss_300, accuracy_300 = _evaluate_and_report(model_300)


In [None]:
# Classification reports
report_class_names = ["World", "Sports", "Business", "Sci/Tech"]


def _predict_and_report(model, heading):
    """Predict test-set classes with ``model`` and print a classification report.

    Args:
        model: Trained Keras model whose output is one probability per class.
        heading: Line printed above the report.

    Returns:
        1-D array with the highest-probability class id for each test sample.
    """
    predicted_classes = np.argmax(model.predict(X_test), axis=1)
    print(heading)
    print(classification_report(y_test, predicted_classes, target_names=report_class_names))
    print("\n")
    return predicted_classes


y_pred_50 = _predict_and_report(model_50, "\nClassification Report ( GloVe -50D) :")

y_pred_100 = _predict_and_report(model_100, "\nClassification Report ( GloVe -100D) :")

y_pred_200 = _predict_and_report(model_200, "\nClassification Report ( GloVe -200D) :")

y_pred_300 = _predict_and_report(model_300, "\nClassification Report ( GloVe -300D) :")

In [None]:
# Per-class accuracy from the confusion matrix
def _per_class_accuracy(confusion_mat):
    """Return, for each class, the fraction of its true samples predicted correctly.

    scikit-learn's confusion matrix has true labels on the rows, so the
    diagonal holds the correct predictions and each row sum is the number of
    true samples of that class.
    """
    return confusion_mat.diagonal() / confusion_mat.sum(axis=1)


confusion_mat_50 = metrics.confusion_matrix(y_test, y_pred_50)
individual_class_acc_50 = _per_class_accuracy(confusion_mat_50)
print("\nIndividual class Accuracy 50-D : \n", individual_class_acc_50)

confusion_mat_100 = metrics.confusion_matrix(y_test, y_pred_100)
individual_class_acc_100 = _per_class_accuracy(confusion_mat_100)
print("\nIndividual class Accuracy 100-D : \n", individual_class_acc_100)

confusion_mat_200 = metrics.confusion_matrix(y_test, y_pred_200)
individual_class_acc_200 = _per_class_accuracy(confusion_mat_200)
print("\nIndividual class Accuracy 200-D : \n", individual_class_acc_200)

confusion_mat_300 = metrics.confusion_matrix(y_test, y_pred_300)
# Uses the 300-D matrix for both numerator and denominator (the numerator
# used to be taken from the 50-D matrix by mistake).
individual_class_acc_300 = _per_class_accuracy(confusion_mat_300)
print("\nIndividual class Accuracy 300-D : \n", individual_class_acc_300)
        


In [None]:
# Bar chart of per-class accuracy, one figure per embedding width
news_categories = ['World', 'Sports', 'Business', 'Sci/Tech']
class_accuracy_charts = (
    ('Class Wise Accuracy 50-D', individual_class_acc_50),
    ('Class Wise Accuracy 100-D', individual_class_acc_100),
    ('Class Wise Accuracy 200-D', individual_class_acc_200),
    ('Class Wise Accuracy 300-D', individual_class_acc_300),
)

for chart_title, class_accuracy in class_accuracy_charts:
    plt.figure(figsize=(15, 8))
    plt.title(chart_title)
    plt.bar(news_categories, class_accuracy)
    plt.xlabel('News')
    plt.ylabel('Accuracy')
    plt.show()


In [None]:
# Render each confusion matrix as a labelled heat map
confusion_plots = (
    ('Confusion Matrix 50-D', confusion_mat_50),
    ('Confusion Matrix 100-D', confusion_mat_100),
    ('Confusion Matrix 200-D', confusion_mat_200),
    ('Confusion Matrix 300-D', confusion_mat_300),
)

for plot_title, matrix in confusion_plots:
    disp = metrics.ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=['World', 'Sports', 'Business', 'Sci/Tech'],
    )
    disp.plot()
    plt.title(plot_title)
    plt.show()

In [None]:
# Save every trained model to its own HDF5 file in the working directory
model_artifacts = (
    (model_50, 'ag_news_cnn_glove_model_50D.h5', "Model 50-D saved!"),
    (model_100, 'ag_news_cnn_glove_model_100D.h5', "Model 100-D saved!"),
    (model_200, 'ag_news_cnn_glove_model_200D.h5', "Model 200-D saved!"),
    (model_300, 'ag_news_cnn_glove_model_300D.h5', "Model 300-D saved!"),
)
for trained_model, h5_path, done_message in model_artifacts:
    trained_model.save(h5_path)
    print(done_message)

# Save the fitted tokenizer too, so inference can encode text with the same
# word-to-id mapping the models were trained on.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
print("Tokenizer saved!")
    

    