In [1]:
import pandas as pd
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
from langdetect import detect

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset Preparation
data = pd.read_csv('train.csv')

# Map the string category labels to integer class ids
category_mapping = {'Politics': 0, 'Sports': 1, 'Media': 2, 'Market & Economy': 3, 'STEM': 4}
data['Category'] = data['Category'].replace(category_mapping)

# Fixed sequence length (in tokens) that is later handed to the tokenizer as
# `max_length`. It is a configured constant, not a statistic measured on the
# 'Discussion' column, so the printed label says exactly that.
max_length = 128

print("Maximum sequence length (tokens) used for 'Discussion' texts:", max_length)

Maximum number of strings in 'Discussion' column: 128


In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from langdetect import detect, DetectorFactory
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import tensorflow as tf

# Pin langdetect's seed so repeated runs classify the same text identically
DetectorFactory.seed = 0

# Integer class id for every category name
category_mapping = {
    'Politics': 0,
    'Sports': 1,
    'Media': 2,
    'Market & Economy': 3,
    'STEM': 4,
}

# Load the CSV, encode the labels, and keep only rows whose 'Discussion'
# contains real text (no NaN, no empty / whitespace-only strings).
data = (
    pd.read_csv('train.csv')
    .assign(Category=lambda frame: frame['Category'].replace(category_mapping))
    .dropna(subset=['Discussion'])
    .assign(Discussion=lambda frame: frame['Discussion'].astype(str))
    .loc[lambda frame: frame['Discussion'].str.strip() != '']
)

# Define a safe language detection function
def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Args:
        text: The string handed to ``detect``.

    Returns:
        bool: ``True`` if ``detect(text)`` returns ``'en'``; ``False`` if it
        returns any other code or raises an ordinary exception.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort filter: text that cannot be classified counts as
        # non-English. Only Exception subclasses are caught, so
        # KeyboardInterrupt / SystemExit can still stop a long-running .apply().
        return False

# Keep only the rows whose 'Discussion' is detected as English
data = data.loc[data['Discussion'].apply(is_english)]

# Drop very short discussions (treated as noise): fewer than `min_word_count`
# whitespace-separated words
min_word_count = 3  # Minimum number of words
data = data.loc[data['Discussion'].str.split().str.len() >= min_word_count]

# Normalize text: lowercase, strip URLs, then strip every character that is
# not an ASCII letter or whitespace (digits and punctuation are removed too)
data['Discussion'] = (
    data['Discussion']
    .str.lower()
    .str.replace(r'http[s]?://\S+', '', regex=True)
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
)

# Resources shared by the tokenize / stop-word / lemmatize step
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Tokenize ``text``, drop stop words, lemmatize the rest and re-join.

    Args:
        text: The string to process.

    Returns:
        str: The lemmatized non-stop-word tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Run the tokenize / stop-word / lemmatize pipeline over every discussion
data['Discussion'] = data['Discussion'].map(preprocess_text)



In [4]:
# Split dataset
# Stratified 75/25 split. `random_state` pins the shuffle so that a fresh
# "Restart & Run All" reproduces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    data['Discussion'],
    data['Category'],
    test_size=0.25,
    stratify=data['Category'],
    shuffle=True,
    random_state=42,
)


# Sequence length (in tokens) used for padding/truncation during tokenization
max_length = 128


In [5]:
# Tokenization
# Load the pretrained tokenizer published under MODEL_NAME.
MODEL_NAME = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_texts(texts, tokenizer, max_length=max_length):
    """Encode ``texts`` with ``tokenizer`` and return only the token ids.

    Args:
        texts: Iterable of strings; it is materialised with ``list(texts)``.
        tokenizer: Callable tokenizer invoked with the options below.
        max_length: Length every sequence is truncated / padded to. The default
            is bound to the module-level ``max_length`` at definition time.

    Returns:
        The ``'input_ids'`` entry of the tokenizer output (a TensorFlow tensor,
        since ``return_tensors='tf'`` is requested). Other entries of the
        tokenizer output are discarded.
    """
    encoding_options = {
        'max_length': max_length,
        'truncation': True,
        'padding': 'max_length',
        'return_tensors': 'tf',
    }
    encoded = tokenizer(list(texts), **encoding_options)
    return encoded['input_ids']

# Token ids for both splits (train first, then test)
X_train_tokenized, X_test_tokenized = (
    tokenize_texts(split_texts, tokenizer, max_length=max_length)
    for split_texts in (X_train, X_test)
)

# One-hot encode labels
# NOTE: y_train / y_test are overwritten here, so re-running this cell without
# re-running the split would feed already-encoded labels back in.
num_classes = len(category_mapping)
y_train, y_test = (
    tf.keras.utils.to_categorical(split_labels, num_classes=num_classes)
    for split_labels in (y_train, y_test)
)



In [6]:
# Display the tokenized training ids as this cell's output
X_train_tokenized

<tf.Tensor: shape=(16840, 128), dtype=int32, numpy=
array([[  101,  2215, 10463, ...,     0,     0,     0],
       [  101,  3160,  2691, ...,     0,     0,     0],
       [  101,  2755,  2438, ...,  2008,  2072,   102],
       ...,
       [  101,  2377,  6708, ...,     0,     0,     0],
       [  101, 10047,  2469, ...,     0,     0,     0],
       [  101,  2947,  4824, ...,     0,     0,     0]])>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Sinusoidal positional encoding
def positional_encoding(seq_len, d_model):
    """Build a ``(seq_len, d_model)`` table of sinusoidal position encodings.

    Args:
        seq_len: Number of positions (rows).
        d_model: Encoding width (columns).

    Returns:
        The table cast to ``tf.float32``. Even columns hold the sine of the
        position-scaled angle and odd columns hold the cosine.
    """
    positions = np.arange(seq_len)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]

    # Each (2k, 2k+1) column pair shares the same frequency
    exponents = (2 * (dims // 2)) / np.float32(d_model)
    angles = positions * (1 / np.power(10000, exponents))

    even, odd = slice(0, None, 2), slice(1, None, 2)
    angles[:, even] = np.sin(angles[:, even])
    angles[:, odd] = np.cos(angles[:, odd])
    return tf.cast(angles, dtype=tf.float32)

# Transformer Encoder Block
def transformer_encoder(inputs, d_model, num_heads, ffn_units, dropout_rate, l2_reg):
    """Apply one Transformer encoder block (self-attention + feed-forward).

    Each sub-layer is followed by dropout, a residual connection and layer
    normalization.

    Args:
        inputs: Tensor fed to the block; also used as the residual branch.
        d_model: Output width of the feed-forward sub-layer. It is also passed
            as ``key_dim`` (per-head size) to the attention layer.
        num_heads: Number of attention heads.
        ffn_units: Hidden width of the feed-forward sub-layer.
        dropout_rate: Dropout rate applied after each sub-layer.
        l2_reg: L2 penalty for the two feed-forward Dense kernels.

    Returns:
        The normalized output of the feed-forward sub-layer.
    """
    # Numerical-stability constant shared by both normalization layers. The
    # second one used to receive `l2_reg`, which is a regularization strength,
    # not an epsilon.
    norm_epsilon = 1e-6

    # Multi-head self-attention
    attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)(inputs, inputs)
    attention_output = layers.Dropout(dropout_rate)(attention_output)
    attention_output = layers.LayerNormalization(epsilon=norm_epsilon)(inputs + attention_output)

    # Feedforward network
    ffn = layers.Dense(ffn_units, activation="relu", kernel_regularizer=l2(l2_reg))(attention_output)
    ffn = layers.Dense(d_model, kernel_regularizer=l2(l2_reg))(ffn)
    ffn_output = layers.Dropout(dropout_rate)(ffn)
    ffn_output = layers.LayerNormalization(epsilon=norm_epsilon)(attention_output + ffn_output)

    return ffn_output

# Transformer Model
def build_transformer_model(vocab_size, max_length, d_model, num_heads, ffn_units, num_classes, dropout_rate, l2_reg, num_layers):
    """Assemble the token-id -> class-probability Transformer classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        max_length: Fixed input sequence length.
        d_model: Embedding / encoder width.
        num_heads: Attention heads per encoder block.
        ffn_units: Hidden width of each encoder's feed-forward sub-layer.
        num_classes: Size of the softmax output.
        dropout_rate: Dropout rate used in the encoders and before the output.
        l2_reg: L2 penalty for the encoder and output Dense kernels.
        num_layers: How many encoder blocks to stack.

    Returns:
        An uncompiled Keras ``Model`` named ``"Custom_Transformer"``.
    """
    token_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name="inputs")

    # Token embeddings ([batch_size, max_length, d_model]) plus the position table
    token_embeddings = layers.Embedding(input_dim=vocab_size, output_dim=d_model)(token_ids)
    hidden = token_embeddings + positional_encoding(max_length, d_model)

    # Encoder stack
    for _block in range(num_layers):
        hidden = transformer_encoder(hidden, d_model, num_heads, ffn_units, dropout_rate, l2_reg)

    # Average over the sequence axis, regularize, classify
    pooled = layers.GlobalAveragePooling1D()(hidden)
    pooled = layers.Dropout(dropout_rate)(pooled)
    class_probs = layers.Dense(
        num_classes,
        activation="softmax",
        kernel_regularizer=l2(l2_reg),
    )(pooled)

    return Model(inputs=token_ids, outputs=class_probs, name="Custom_Transformer")


In [22]:

# Parameters
dropout_rate = 0.5
l2_reg = 1e-7
num_layers = 1


d_model = 48  # Set the embedding size
num_heads = 36
ffn_units = 64

num_classes = 5
# The embedding table needs one row for every id the tokenizer can emit.
# A hard-coded 10,000 is too small for the tokenizer loaded above (the
# tokenized output contains ids above 10,000), which makes Embedding lookups
# fall out of range. len(tokenizer) is the full vocabulary size, including
# any added tokens.
vocab_size = len(tokenizer)

# Learning rate scheduler: start at 1e-4 and multiply by 0.9 every 1000 steps
learning_rate_schedule = ExponentialDecay(initial_learning_rate=1e-4, decay_steps=1000, decay_rate=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate_schedule)
# Function to reinitialize weights
def reinitialize_weights(model):
    """Re-draw layer variables in place from each layer's own initializers.

    For every layer in ``model.layers`` the pairs kernel/kernel_initializer,
    bias/bias_initializer, gamma/gamma_initializer and beta/beta_initializer
    are considered. A pair is skipped when the layer lacks either attribute or
    when the variable is ``None`` (for example a layer built without a bias),
    so such layers no longer raise. Variables that belong to none of these
    pairs are left untouched.

    Args:
        model: Object exposing a ``layers`` iterable.

    Returns:
        None. Variables are updated through ``assign``.
    """
    variable_initializer_pairs = (
        ('kernel', 'kernel_initializer'),
        ('bias', 'bias_initializer'),
        ('gamma', 'gamma_initializer'),
        ('beta', 'beta_initializer'),
    )
    for layer in model.layers:
        for variable_name, initializer_name in variable_initializer_pairs:
            variable = getattr(layer, variable_name, None)
            initializer = getattr(layer, initializer_name, None)
            if variable is None or initializer is None:
                continue
            # Same call shape a layer uses when it first creates the weight
            variable.assign(initializer(variable.shape, dtype=variable.dtype))

# Build the model
custom_transformer_model = build_transformer_model(
    vocab_size, max_length, d_model, num_heads, ffn_units, num_classes, dropout_rate, l2_reg, num_layers
)

weights_before = [layer.get_weights() for layer in custom_transformer_model.layers]

# Helper that prints mean / std statistics of layer kernels and biases
def print_weights_summary(model, message):
    """Print a banner followed by kernel and bias statistics per layer.

    Args:
        model: Object exposing a ``layers`` iterable.
        message: Text placed inside the ``--- ... ---`` banner line.

    Layers with a ``kernel`` attribute get their name and the kernel mean/std
    printed; layers with a ``bias`` attribute get the bias mean/std printed.
    """
    print(f"--- {message} ---")
    for current in model.layers:
        if hasattr(current, 'kernel'):
            print(f"Layer: {current.name}")
            kernel_mean = tf.reduce_mean(current.kernel).numpy()
            kernel_std = tf.math.reduce_std(current.kernel).numpy()
            print(f"Kernel Mean: {kernel_mean}, Std: {kernel_std}")
        if hasattr(current, 'bias'):
            bias_mean = tf.reduce_mean(current.bias).numpy()
            bias_std = tf.math.reduce_std(current.bias).numpy()
            print(f"Bias Mean: {bias_mean}, Std: {bias_std}")
# Report statistics, re-draw the weights, report again
print_weights_summary(custom_transformer_model, "Before Reinitialization")
reinitialize_weights(custom_transformer_model)
print_weights_summary(custom_transformer_model, "After Reinitialization")
weights_after = [built_layer.get_weights() for built_layer in custom_transformer_model.layers]


# Check if weights are different: true as soon as any array of any layer
# differs from its snapshot
weights_changed = any(
    not np.array_equal(old_array, new_array)
    for layer_before, layer_after in zip(weights_before, weights_after)
    for old_array, new_array in zip(layer_before, layer_after)
)
print("Weights were reinitialized:", weights_changed)

# Compile the model
custom_transformer_model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# Print model summary
custom_transformer_model.summary()


--- Before Reinitialization ---
Layer: dense_20
Kernel Mean: -0.0027202710043638945, Std: 0.13395832479000092
Bias Mean: 0.0, Std: 0.0
Layer: dense_21
Kernel Mean: -0.0002069069305434823, Std: 0.1332174390554428
Bias Mean: 0.0, Std: 0.0
Layer: dense_22
Kernel Mean: -0.0028308317996561527, Std: 0.1948351114988327
Bias Mean: 0.0, Std: 0.0
--- After Reinitialization ---
Layer: dense_20
Kernel Mean: 0.0043540578335523605, Std: 0.13398945331573486
Bias Mean: 0.0, Std: 0.0
Layer: dense_21
Kernel Mean: 0.0008082825806923211, Std: 0.13301114737987518
Bias Mean: 0.0, Std: 0.0
Layer: dense_22
Kernel Mean: 0.014705296605825424, Std: 0.19281302392482758
Bias Mean: 0.0, Std: 0.0
Weights were reinitialized: True
Model: "Custom_Transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inputs (InputLayer)             [(None, 128)]        0            

In [12]:


# # Build the model
# custom_transformer_model = build_transformer_model(
#     vocab_size, max_length, d_model, num_heads, ffn_units, num_classes, dropout_rate, l2_reg, num_layers
# )



# # Compile the model
# custom_transformer_model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

# # Print model summary
# custom_transformer_model.summary()

# Early stopping callback: monitors val_loss with a patience of 5 epochs and
# restores the best weights seen when training stops.
early_stopping_callback = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

In [23]:
# Train the model
# The held-out split doubles as validation data, so early stopping is driven
# by its val_loss.
history = custom_transformer_model.fit(
    x=X_train_tokenized.numpy(),
    y=y_train,
    epochs=20,
    batch_size=16,
    validation_data=(X_test_tokenized.numpy(), y_test),
    callbacks=[early_stopping_callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
