<a href="https://colab.research.google.com/github/ashaduzzaman-sarker/Text-classification-Sentiment-Analysis/blob/main/Text_classification_on_the_Newsgroup20_dataset_using_pre_trained_GloVe_word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification on the Newsgroup20 dataset using pre-trained GloVe word embeddings

## Introduction

This example shows how to train a text classification model using pre-trained [GloVe word embeddings](https://nlp.stanford.edu/projects/glove/).

# Imports

In [1]:
# Upgrade Keras and TensorFlow in the current runtime (versions are not pinned).
!pip install --upgrade keras tensorflow



In [2]:
import os

# Only the TensorFlow backend supports string inputs.
# The backend is selected here, before `keras` is imported below.
os.environ['KERAS_BACKEND'] = 'tensorflow'

import pathlib
import numpy as np
import tensorflow.data as tf_data
import keras
from keras import layers

## Download the Newsgroup20 Dataset

**The Newsgroup20 dataset :**

A set of roughly 20,000 message-board posts belonging to 20 different topic categories.


In [3]:
# Download the Newsgroup20 archive into the Keras cache and unpack it.
data_path = keras.utils.get_file(
    fname = 'news20.tar.gz',
    origin = 'http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz',
    untar = True,
)

Downloading data from http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz
[1m17329808/17329808[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step


## Let's take a look at the Dataset

In [4]:
# Locate the extracted '20_newsgroup' folder.
# NOTE(review): depending on the installed Keras release, get_file(untar=True)
# appears to return either the archive path (dataset sits next to it) or the
# extraction directory (dataset sits inside it) — confirm against the installed
# version. Both locations are checked; the first one is the fallback so a
# missing dataset still fails with the original path in the error message.
download_path = pathlib.Path(data_path)
data_dir_candidates = [
    download_path.parent / '20_newsgroup',
    download_path / '20_newsgroup',
]
data_dir = next(
    (candidate for candidate in data_dir_candidates if candidate.is_dir()),
    data_dir_candidates[0],
)

# One sub-directory per newsgroup (class).
dirnames = os.listdir(data_dir)
print('Number of directories:', len(dirnames))
print('Directory names:', dirnames)

# Each file inside a class directory is one post.
fnames = os.listdir(data_dir / 'comp.graphics')
print('Number of files in comp.graphics:', len(fnames))
print('Some example filenames:', fnames[:5])

Number of directories: 20
Directory names: ['soc.religion.christian', 'sci.crypt', 'comp.sys.ibm.pc.hardware', 'rec.sport.hockey', 'talk.religion.misc', 'talk.politics.misc', 'talk.politics.guns', 'rec.autos', 'sci.space', 'comp.windows.x', 'comp.os.ms-windows.misc', 'comp.graphics', 'sci.electronics', 'misc.forsale', 'comp.sys.mac.hardware', 'rec.motorcycles', 'rec.sport.baseball', 'sci.med', 'talk.politics.mideast', 'alt.atheism']
Number of files in comp.graphics: 1000
Some example filenames: ['37956', '39626', '38720', '38874', '38837']


In [5]:
# Example of what one file contains.
# read_text() opens, reads and closes the file in one call, so no handle is
# left open; latin-1 is the same encoding the corpus-loading loop uses.
print((data_dir / 'comp.graphics' / '38360').read_text(encoding = 'latin-1'))

Newsgroups: comp.graphics
Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!bb3.andrew.cmu.edu!news.sei.cmu.edu!cis.ohio-state.edu!zaphod.mps.ohio-state.edu!darwin.sura.net!haven.umd.edu!uunet!newsgate.watson.ibm.com!yktnews.watson.ibm.com!cliff
From: cliff@watson.ibm.com (cliff)
Subject: Reprints
Sender: news@watson.ibm.com (NNTP News Poster)
Message-ID: <C5LH05.Dv1@watson.ibm.com>
Date: Fri, 16 Apr 1993 21:00:05 GMT
Disclaimer: This posting represents the poster's views, not necessarily those of IBM.
Nntp-Posting-Host: cliff.watson.ibm.com
Organization: A
Lines: 17

I have a few reprints left of chapters from my book "Visions of the             
Future".  These include reprints of 3 chapters probably of interest to          
readers of this forum, including:                                               
                                                                                
1. Current Techniques and Development of Computer Art, by Franz Szabo           
             

In [6]:
# Read every post, strip its header block, and record the class index.
# Classes are numbered by the sorted order of their directory names.
samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    fnames = os.listdir(dirpath)
    print('Processing %s, %d, files found' % (dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        # The context manager releases each file handle as soon as it is read,
        # instead of leaving one open handle per post.
        with open(fpath, encoding = 'latin-1') as f:
            content = f.read()
        # Let's get rid of the headers: the first 10 lines of every file are
        # dropped (a fixed-size heuristic, not a header parser).
        lines = content.split('\n')
        lines = lines[10:]
        content = '\n'.join(lines)
        samples.append(content)
        labels.append(class_index)
    class_index += 1

print('Classes:', class_names)
print('Number of samples:', len(samples))

Processing alt.atheism, 1000, files found
Processing comp.graphics, 1000, files found
Processing comp.os.ms-windows.misc, 1000, files found
Processing comp.sys.ibm.pc.hardware, 1000, files found
Processing comp.sys.mac.hardware, 1000, files found
Processing comp.windows.x, 1000, files found
Processing misc.forsale, 1000, files found
Processing rec.autos, 1000, files found
Processing rec.motorcycles, 1000, files found
Processing rec.sport.baseball, 1000, files found
Processing rec.sport.hockey, 1000, files found
Processing sci.crypt, 1000, files found
Processing sci.electronics, 1000, files found
Processing sci.med, 1000, files found
Processing sci.space, 1000, files found
Processing soc.religion.christian, 997, files found
Processing talk.politics.guns, 1000, files found
Processing talk.politics.mideast, 1000, files found
Processing talk.politics.misc, 1000, files found
Processing talk.religion.misc, 1000, files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',

## Data Preprocessing

In [7]:
# Shuffle samples and labels in place with identically seeded generators,
# so both lists receive the same permutation and stay aligned.
seed = 1337
for sequence in (samples, labels):
    rng = np.random.RandomState(seed)
    rng.shuffle(sequence)

In [8]:
# Extract a training & validation split: the last `validation_split` fraction
# of the (already shuffled) data is held out for validation.
validation_split = 0.2
num_validation_samples = int(validation_split * len(samples))
# Slice with an explicit boundary: a negative-count slice such as
# samples[:-0] would return an empty training set when the validation
# count rounds down to zero.
split_at = len(samples) - num_validation_samples
train_samples = samples[:split_at]
val_samples = samples[split_at:]
train_labels = labels[:split_at]
val_labels = labels[split_at:]

## Create a vocabulary index

- Index vocabulary in the dataset using `TextVectorization`
- In this example, only the top 20k words are kept, and every sequence is truncated or padded to exactly 200 tokens

In [9]:
# Batch the training texts and let the vectorizer learn its vocabulary from them.
text_ds = tf_data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer = layers.TextVectorization(max_tokens = 20000, output_sequence_length = 200)
vectorizer.adapt(text_ds)

In [10]:
# Peek at the first five entries of the learned vocabulary
vocabulary_head = vectorizer.get_vocabulary()[:5]
vocabulary_head

['', '[UNK]', 'the', 'to', 'of']

In [11]:
# Encode one sample sentence and show its first six token indices
output = vectorizer(['the cat sat on the mat'])
output.numpy()[0, :6]

array([   2, 3842, 1745,   15,    2, 7690])

- Here `index 0` is reserved for padding &
- `index 1` is reserved for "out of vocabulary" tokens

In [12]:
# Dict mapping each vocabulary word to its index (its position in the vocabulary).
# Looking up a word that is not in the vocabulary raises KeyError.
voc = vectorizer.get_vocabulary()
word_index = {word: position for position, word in enumerate(voc)}

In [13]:
# Look the sample words up directly in word_index
test = ['the', 'cat', 'sat', 'on', 'the', 'mat']
result = list(map(word_index.__getitem__, test))
print(result)

[2, 3842, 1745, 15, 2, 7690]


As we can see, we obtain the same encoding as above for our test sentence.

## Load pre-trained GloVe word embeddings

In [14]:
!wget http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q glove.6B.zip

--2024-08-01 03:58:33--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2024-08-01 04:01:14 (5.12 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [15]:
# The 100-dimensional text-encoded vectors from the archive are used here
path_to_glove_file = 'glove.6B.100d.txt'

# Maps each word to its vector of float32 coefficients
embeddings_index = {}
# Decode explicitly as UTF-8 rather than relying on the platform default encoding.
with open(path_to_glove_file, encoding = 'utf-8') as f:
    for line in f:
        # Split off the leading word; the rest of the line holds the coefficients.
        word, coefs = line.split(maxsplit = 1)
        coefs = np.fromstring(coefs, 'f', sep = ' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


## Prepare embedding layer

In [16]:
embedding_dim = 100
# Two more rows than there are vocabulary entries; word_index only addresses
# rows 0 .. len(voc) - 1, so the last two rows stay zero.
num_tokens = len(voc) + 2

# Prepare embedding matrix.
# It starts as all zeros, so words not found in the embedding index keep an
# all-zero row. This includes the representation for 'padding' and 'OOV'.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
hits = 0
misses = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print('Converted %d words (%d misses)' % (hits, misses))

Converted 17975 words (2025 misses)


In [17]:
# Load the pre-trained word embeddings matrix into an Embedding layer.
# `layers` is already imported at the top of the notebook, so no extra
# mid-notebook import is needed. trainable = False is passed so the layer's
# weights are not updated during training.
embedding_layer = layers.Embedding(
    num_tokens,
    embedding_dim,
    trainable = False,
)

# Build the layer first so its weight exists, then overwrite it with the matrix.
embedding_layer.build((1,))
embedding_layer.set_weights([embedding_matrix])

## Build the Model

In [18]:
# 1D convnet over the embedded token sequence, ending in a softmax over the classes.
int_sequences_input = layers.Input(shape = (None,), dtype = 'int32')
embedded_sequences = embedding_layer(int_sequences_input)

# Two identical Conv1D + MaxPooling1D stages
x = embedded_sequences
for conv_stage in range(2):
    x = layers.Conv1D(128, 5, activation = 'relu')(x)
    x = layers.MaxPooling1D(5)(x)

# Final convolution, pooled globally over the sequence axis
x = layers.Conv1D(128, 5, activation = 'relu')(x)
x = layers.GlobalMaxPooling1D()(x)

# Classifier head
x = layers.Dense(128, activation = 'relu')(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense(len(class_names), activation = 'softmax')(x)

model = keras.Model(int_sequences_input, preds)
model.summary()

## Train the Model

In [19]:
# Convert list of strings to right padded NumPy arrays of integer indices
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()

y_train = np.array(train_labels)
y_val = np.array(val_labels)

In [23]:
# Labels are integer class indices, hence the sparse categorical cross-entropy loss.
model.compile(optimizer = 'rmsprop', loss = 'sparse_categorical_crossentropy', metrics = ['acc'])

# Train for 20 epochs, evaluating on the held-out validation arrays after each one.
model.fit(
    x = x_train,
    y = y_train,
    validation_data = (x_val, y_val),
    epochs = 20,
    batch_size = 128,
)

Epoch 1/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 347ms/step - acc: 0.9253 - loss: 0.2556 - val_acc: 0.6972 - val_loss: 1.4424
Epoch 2/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 345ms/step - acc: 0.9492 - loss: 0.1549 - val_acc: 0.7087 - val_loss: 1.3922
Epoch 3/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 329ms/step - acc: 0.9524 - loss: 0.1413 - val_acc: 0.6839 - val_loss: 1.5839
Epoch 4/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 323ms/step - acc: 0.9488 - loss: 0.1567 - val_acc: 0.7027 - val_loss: 1.4116
Epoch 5/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 289ms/step - acc: 0.9528 - loss: 0.1451 - val_acc: 0.7044 - val_loss: 1.5376
Epoch 6/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 338ms/step - acc: 0.9572 - loss: 0.1222 - val_acc: 0.6672 - val_loss: 1.9864
Epoch 7/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7bd2a40b2350>

## Export an End-to-End Model

In [24]:
# Chain the vectorizer and the trained classifier so the exported model
# accepts raw strings of arbitrary length.
string_input = keras.Input(shape = (1,), dtype = 'string')
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

# Classify two sample sentences
example_batch = keras.ops.convert_to_tensor([
    'the cat sat on the mat',
    'the dog ate my homework'
])
probabilities = end_to_end_model.predict(example_batch)

# Report the most probable class for each sentence
print(class_names[np.argmax(probabilities[0])])
print(class_names[np.argmax(probabilities[1])])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432ms/step
comp.windows.x
sci.med
