<h2> Installing necessary libraried </h2>

In [4]:
!unzip '/content/archive.zip'

Archive:  /content/archive.zip
  inflating: test.csv                
  inflating: train.csv               


In [1]:
!!pip3 install transformers

['Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/',
 'Collecting transformers',
 '  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)',
 '\x1b[?25l     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.0/6.3 MB\x1b[0m \x1b[31m?\x1b[0m eta \x1b[36m-:--:--\x1b[0m',
 '\x1b[2K     \x1b[91m━━\x1b[0m\x1b[90m╺\x1b[0m\x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.3/6.3 MB\x1b[0m \x1b[31m9.3 MB/s\x1b[0m eta \x1b[36m0:00:01\x1b[0m',
 '\x1b[2K     \x1b[91m━━━━━━━━━━━━━━━━━━━\x1b[0m\x1b[90m╺\x1b[0m\x1b[90m━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m3.1/6.3 MB\x1b[0m \x1b[31m44.7 MB/s\x1b[0m eta \x1b[36m0:00:01\x1b[0m',
 '\x1b[2K     \x1b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m\x1b[91m╸\x1b[0m \x1b[32m6.3/6.3 MB\x1b[0m \x1b[31m72.4 MB/s\x1b[0m eta \x1b[36m0:00:01\x1b[0m',
 '\x1b[2K     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m6.3/6.3 MB\x1b[0m \x1b[31m54.5 MB/s\x1b[0m eta \x1b[36m0:00

<h2> Importing necessary libraries </h2>

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras.initializers import GlorotUniform
import pandas as pd
import numpy as np
import re

<h2> Read the data </h2>

In [5]:
# Load the AG News data
train_df = pd.read_csv('train.csv', header=None, names=['label', 'title', 'description'])
test_df = pd.read_csv('test.csv', header=None, names=['label', 'title', 'description'])

In [6]:
train_df.head()

Unnamed: 0,label,title,description
0,Class Index,Title,Description
1,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
2,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
3,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
4,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...


In [7]:
train_df = train_df.iloc[1: , :]
test_df = test_df.iloc[1: , :]
train_df['label'].unique()

array(['3', '4', '2', '1'], dtype=object)

In [8]:
train_df['label'] = train_df['label'].replace(['1', '2', '3', '4'], ['0', '1', '2', '3'])
train_df['label'].unique()
test_df['label'] = test_df['label'].replace(['1', '2', '3', '4'], ['0', '1', '2', '3'])
test_df['label'].unique()

array(['2', '3', '1', '0'], dtype=object)

In [9]:
TEXT_LABELS = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tech"}

In [10]:
# Check the class distribution
train_df.groupby('label').count()

Unnamed: 0_level_0,title,description
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,30000,30000
1,30000,30000
2,30000,30000
3,30000,30000


<h2>1. Data preprocessing </h2>

In [11]:
# Define a function to preprocess the data
def preprocess_data(text):
    # Remove non-alphanumeric characters
    text = re.sub('[^0-9a-zA-Z]+', ' ', text)
    # Lowercase the text
    text = text.lower()
    return text



# Preprocess the data
train_df['text'] = train_df['title'] + ' ' + train_df['description']
test_df['text'] = test_df['title'] + ' ' + test_df['description']
train_df['text'] = train_df['text'].apply(preprocess_data)
test_df['text'] = test_df['text'].apply(preprocess_data)


In [12]:
train_df.shape, test_df.shape

((120000, 4), (7600, 4))

<h2> Creating BERT model </h2>

In [13]:
def bert_encode(texts, tokenizer, max_len=512):
    """Encode the text using the BERT tokenizer"""
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


In [14]:
# Load the pre-trained BERT model from TensorFlow Hub
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

max_seq_length = 80

input_word_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])  #pooled_output passed to neural netowrk head.



Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [15]:
# Convert the labels to one-hot encoded vectors
num_classes = len(train_df['label'].unique())


train_labels = tf.keras.utils.to_categorical(train_df['label'], num_classes=num_classes)

<h2> Training a Neural Network with 768 features </h2>

In [16]:
tf.keras.backend.clear_session() 

# Define the neural network model

layer1 = Dense(32, activation='relu', kernel_initializer='he_normal')(pooled_output)
layer2 = Dense(128, activation='relu', kernel_initializer=GlorotUniform(seed=42))(layer1)
layer3 = Dense(128, activation='relu', kernel_initializer=GlorotUniform(seed=42))(layer2)
output = Dense(num_classes, activation='softmax')(layer3)
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=output)

# Compile the model for fine-tuning
initial_learning_rate = 1e-5
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=10000,
    decay_rate=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=initial_learning_rate, epsilon=1e-08)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

print(model.summary())

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 80)]         0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 80)]         0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 80)]         0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 80, 768)]                 'input_mask[0][0]',         

In [17]:
# Convert the training and test data to BERT inputs
train_input = bert_encode(train_df['text'].values, tokenizer, max_seq_length)
test_input = bert_encode(test_df['text'].values, tokenizer, max_seq_length)

In [18]:
# Train the model
history = model.fit(train_input, train_labels, validation_split=0.2, epochs=2, batch_size=32)

Epoch 1/2


  output, from_logits = _get_logits(


Epoch 2/2


<h2>Evaluate the model on the test data </h2>

In [19]:
# Evaluate the model on the test data
test_labels = tf.keras.utils.to_categorical(test_df['label'], num_classes=num_classes)
test_loss, test_accuracy = model.evaluate(test_input, test_labels)
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

Test loss: 0.1917852908372879
Test accuracy: 0.9360526204109192


<h2>Using trained model to give prediction on test data </h2>

In [20]:
#Predict on test data
y_pred = model.predict(test_input)

# Convert predicted probabilities to predicted class labels
y_pred_labels = np.argmax(y_pred, axis=1)

# Print predicted labels for first 10 samples in test set
print(y_pred_labels[:10])

[2 3 3 3 3 3 3 3 3 3]


<h2> Model predictions with explanation </h2>
To explain the prediction made by the model, we can look at the predicted class label and the corresponding probability score for that class. The probability score represents the confidence of the model in its prediction.

For example, if the predicted class label is "Science/Technology" and the corresponding probability score is 0.99, it means that the model is 99% confident that the input text belongs to the "Science/Technology" category.

In [64]:
TEXT_LABELS = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

for i in range(5):
  max_probability = max(y_pred[i])
  class_index = np.where(y_pred[i] == max_probability)
  class_index = class_index[0][0]
  predicted_class = TEXT_LABELS[class_index]

  actual_class = test_df['label'].values[i]
  print("For text ==>", test_df['text'].values[i])
  print("Actual class is {0}, that is, '{1}' whereas predicted class is '{2}'".format(actual_class,TEXT_LABELS[int(actual_class)], predicted_class))
  
  #print()
  percent=np.round(max_probability,2)
  print("This is because the model ", percent, '% is confident in its prediction that the text belongs to class ',predicted_class,)
  print()

For text ==> fears for t n pension after talks unions representing workers at turner newall say they are disappointed after talks with stricken parent firm federal mogul 
Actual class is 2, that is, 'Business' whereas predicted class is 'Business'
This is because the model  1.0 % is confident in its prediction that the text belongs to class  Business

For text ==> the race is on second private team sets launch date for human spaceflight space com space com toronto canada a second team of rocketeers competing for the 36 10 million ansari x prize a contest for privately funded suborbital space flight has officially announced the first launch date for its manned rocket 
Actual class is 3, that is, 'Sci/Tech' whereas predicted class is 'Sci/Tech'
This is because the model  0.99 % is confident in its prediction that the text belongs to class  Sci/Tech

For text ==> ky company wins grant to study peptides ap ap a company founded by a chemistry researcher at the university of louisville won a