In [20]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [21]:
data_path = keras.utils.get_file(
    "news20.tar.gz",
    "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz",
    untar=True,
)

In [22]:
import os
import pathlib

data_dir = pathlib.Path(data_path).parent / "20_newsgroup"
dirnames = os.listdir(data_dir)
print("Number of directories:", len(dirnames))
print("Directory names:", dirnames)

Number of directories: 20
Directory names: ['comp.graphics', 'rec.sport.hockey', 'soc.religion.christian', 'talk.politics.guns', 'talk.religion.misc', 'rec.sport.baseball', 'rec.motorcycles', 'sci.med', 'comp.sys.ibm.pc.hardware', 'talk.politics.mideast', 'talk.politics.misc', 'alt.atheism', 'sci.crypt', 'comp.os.ms-windows.misc', 'rec.autos', 'misc.forsale', 'comp.sys.mac.hardware', 'sci.space', 'comp.windows.x', 'sci.electronics']


In [23]:
fnames = os.listdir(data_dir / "comp.graphics")
print("Number of files in comp.graphics:", len(fnames))
print("Some example filenames:", fnames[:5])

Number of files in comp.graphics: 1000
Some example filenames: ['38810', '38454', '38433', '37948', '38380']


In [24]:
print(open(data_dir / "comp.graphics" / "38987").read())


Newsgroups: comp.graphics
Path: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!noc.near.net!howland.reston.ans.net!agate!dog.ee.lbl.gov!network.ucsd.edu!usc!rpi!nason110.its.rpi.edu!mabusj
From: mabusj@nason110.its.rpi.edu (Jasen M. Mabus)
Subject: Looking for Brain in CAD
Message-ID: <c285m+p@rpi.edu>
Nntp-Posting-Host: nason110.its.rpi.edu
Reply-To: mabusj@rpi.edu
Organization: Rensselaer Polytechnic Institute, Troy, NY.
Date: Thu, 29 Apr 1993 23:27:20 GMT
Lines: 7

Jasen Mabus
RPI student

	I am looking for a hman brain in any CAD (.dxf,.cad,.iges,.cgm,etc.) or picture (.gif,.jpg,.ras,etc.) format for an animation demonstration. If any has or knows of a location please reply by e-mail to mabusj@rpi.edu.

Thank you in advance,
Jasen Mabus  



In [25]:
import re
import string

def clean_text(text):
    text = text.lower()
    #text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('[''’“”_]', '', text)
    text = re.sub('\d', '', text)
    return text

In [26]:
samples = []
labels = []
class_names = []
class_index = 0
for dirname in sorted(os.listdir(data_dir)):
    class_names.append(dirname)
    dirpath = data_dir / dirname
    fnames = os.listdir(dirpath)
    print("Precessing {}, {} files found".format(dirname, len(fnames)))
    for fname in fnames:
        fpath = dirpath / fname
        f = open(fpath, encoding="latin-1")
        content = f.read()
        lines = content.split("\n")
        lines = lines[10:]
        content = "\n".join(lines)
        content = clean_text(content)
        samples.append(content)
        labels.append(class_index)
    class_index += 1
    
print("Classes:", class_names)
print("Number of samples:", len(samples))

Precessing alt.atheism, 1000 files found
Precessing comp.graphics, 1000 files found
Precessing comp.os.ms-windows.misc, 1000 files found
Precessing comp.sys.ibm.pc.hardware, 1000 files found
Precessing comp.sys.mac.hardware, 1000 files found
Precessing comp.windows.x, 1000 files found
Precessing misc.forsale, 1000 files found
Precessing rec.autos, 1000 files found
Precessing rec.motorcycles, 1000 files found
Precessing rec.sport.baseball, 1000 files found
Precessing rec.sport.hockey, 1000 files found
Precessing sci.crypt, 1000 files found
Precessing sci.electronics, 1000 files found
Precessing sci.med, 1000 files found
Precessing sci.space, 1000 files found
Precessing soc.religion.christian, 997 files found
Precessing talk.politics.guns, 1000 files found
Precessing talk.politics.mideast, 1000 files found
Precessing talk.politics.misc, 1000 files found
Precessing talk.religion.misc, 1000 files found
Classes: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.ha

In [27]:
seed = 123
rng = np.random.RandomState(seed)
rng.shuffle(samples)
rng = np.random.RandomState(seed)
rng.shuffle(labels)

In [28]:
train_split = 0.85
num_train_samples = int(train_split * len(samples))
train_X, train_Y = samples[:num_train_samples], labels[:num_train_samples]
val_X, val_Y = samples[num_train_samples:], labels[num_train_samples:]
print("train_X {}, train_Y {}".format(len(train_X), len(train_Y)))
print("val_X {}, val_Y {}".format(len(val_X), len(val_Y)))

train_X 16997, train_Y 16997
val_X 3000, val_Y 3000


In [29]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_X).batch(128)
vectorizer.adapt(text_ds)

In [30]:
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'to', 'of']

In [34]:
output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]


array([   2, 3229, 1906,   18,    2, 5308])

In [32]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [33]:
test = ["the", "cat", "sat", "on", "the", "mat"]
[word_index[w] for w in test] 

[2, 3229, 1906, 18, 2, 5308]

### Load pre-trained word embeddings GloVe

In [36]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

OSError: [Errno 12] Cannot allocate memory