In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from tensorflow.keras.layers import TextVectorization




In [2]:
def readData(fname):
    """Read a labelled text file and split it into texts and labels.

    Each line is expected to have the format ``label text`` where ``label``
    is ``__label__1`` (stored as class 0) or ``__label__2`` (stored as
    class 1). Reading stops at the first empty line, which is normally the
    one produced by the trailing newline at the end of the file.

    Args:
        fname: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(textData, textLabel, seqLength)`` where

        * ``textData`` is a list with the text of every parsed line (label
          removed),
        * ``textLabel`` is a list with the matching class ids (0 or 1),
        * ``seqLength`` is ``int(mean + 2 * std)`` of the number of
          space-separated tokens per parsed line (the label token is
          counted), or 0 when no line was parsed.

    Raises:
        ValueError: If a non-empty line does not start with a known label.
    """
    labelToClass = {"__label__1": 0, "__label__2": 1}

    # The with-statement closes the file, so no explicit close() is needed.
    with open(fname, 'r', encoding="utf-8") as f:
        lines = f.read().split("\n")

    textData = []
    textLabel = []
    # Only lengths of lines that were actually parsed are collected, so the
    # trailing empty line no longer contributes a zero to the statistics.
    lineLength = []

    for i, aLine in enumerate(lines):
        if not aLine:
            break
        # Split once at the first space: label on the left, text on the right.
        # A line holding only a label yields an empty text.
        label, _, text = aLine.partition(" ")
        if label not in labelToClass:
            # Raise instead of calling exit(), which would stop the kernel.
            raise ValueError(f"Error in readData: line {i}: {aLine!r}")
        textLabel.append(labelToClass[label])
        textData.append(text)
        lineLength.append(len(aLine.split(" ")))

    if not lineLength:
        # Nothing parsed: avoid computing statistics on an empty sequence.
        return textData, textLabel, 0

    return textData, textLabel, int(np.average(lineLength) + 2 * np.std(lineLength))

In [3]:
# Training split: texts, labels and the sequence-length value returned by
# readData for this file.
x_train, y_train, seqLength = readData(fname="amazon/train_small.txt")
# Test split: only texts and labels are kept, the third value is discarded.
x_test, y_test, _ = readData(fname="amazon/test_small.txt")

In [6]:
def count_unique_words(texts):
    """Return the set of distinct whitespace-separated words in ``texts``.

    Args:
        texts: Iterable of strings.

    Returns:
        A set containing every word produced by ``str.split()`` on each text.
        Words are compared exactly as written (case and punctuation kept).
    """
    return {token for sentence in texts for token in sentence.split()}

# Distinct words found in each split
unique_words_train = count_unique_words(x_train)
unique_words_test = count_unique_words(x_test)

# Vocabulary size of the training split, then of the test split
print(len(unique_words_train))
print(len(unique_words_test))

# Words that occur in at least one of the two splits
total_unique_words = unique_words_train | unique_words_test

print("Total number of unique words in dataset:", len(total_unique_words))


147917
159418
Total number of unique words in dataset: 248520


In [8]:
from collections import Counter

def word_frequencies(texts):
    """Count how often each whitespace-separated word occurs in ``texts``.

    Args:
        texts: Iterable of strings.

    Returns:
        A ``collections.Counter`` mapping each word (exactly as written) to
        its number of occurrences across all texts.
    """
    return Counter(token for sentence in texts for token in sentence.split())

# Word counts for each split
freq_train = word_frequencies(x_train)
freq_test = word_frequencies(x_test)

# Combined counts over both splits
total_freq = freq_train + freq_test

# List the most frequent words, one "word: count" pair per line
print("Top 10000:")
for word, freq in total_freq.most_common(10000):
    print(word, freq, sep=": ")



Top 10000:
the: 172436
and: 103741
a: 94816
I: 93671
to: 93056
of: 82640
is: 69845
this: 54780
it: 51778
in: 44297
for: 38243
that: 37762
was: 34741
with: 26785
you: 26241
on: 25161
The: 25032
not: 24849
but: 24765
have: 24336
book: 21564
are: 20956
my: 20341
as: 20049
be: 17360
This: 17209
one: 14742
like: 13485
so: 13000
It: 12614
all: 12386
at: 12351
from: 12027
very: 11882
just: 11760
about: 11155
an: 11130
or: 11077
has: 11001
would: 10868
they: 10711
good: 10162
by: 9659
will: 9379
had: 9346
read: 9262
out: 9077
more: 9020
great: 8984
his: 8959
movie: 8882
if: 8733
get: 8477
what: 8161
can: 8075
only: 8064
me: 7888
your: 7876
really: 7612
when: 7545
who: 7438
than: 7403
up: 7358
some: 7356
he: 7092
no: 6932
it.: 6881
A: 6816
because: 6426
other: 6351
even: 6127
much: 6102
her: 6071
time: 6032
first: 6010
were: 5949
don't: 5933
been: 5831
it's: 5702
-: 5626
do: 5583
If: 5534
i: 5529
there: 5258
love: 5209
their: 5208
how: 5132
which: 5130
any: 5075
could: 4945
am: 4926
think: 4817

In [10]:
# For each threshold, report how many distinct words occur more than t times
for t in (25, 50, 75, 100, 500, 1000, 2000, 5000):
    # Each True counts as 1, so the sum is the number of words above t
    count = sum(occurrences > t for occurrences in total_freq.values())
    print(f"Words with frequency > {t}: {count}")


Words with frequency > 25: 9361
Words with frequency > 50: 5383
Words with frequency > 75: 3827
Words with frequency > 100: 2993
Words with frequency > 500: 723
Words with frequency > 1000: 400
Words with frequency > 2000: 208
Words with frequency > 5000: 89
