# Download the data

In [None]:
%%bash
# wget -O does not create missing parent directories, so make sure data/ exists
# before downloading; otherwise the cell fails on a fresh checkout.
mkdir -p data
# -nc: do not fetch again when the file is already present.
wget -nc -O data/bbc_text_cls.csv https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize as nltk_word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en import English

# Build spaCy's English language object and keep a reference to its tokenizer,
# which is the only part of it the cells shown here call.
spacy_nlp = English()
spacy_word_tokenizer = spacy_nlp.tokenizer

In [None]:
# word_tokenize relies on the Punkt sentence models. Recent NLTK releases load
# them from the "punkt_tab" resource, while older ones use the pickled "punkt"
# resource, so fetch both to keep the notebook working with either version.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# Read data

In [None]:
# Load the CSV written by the download cell; later cells read its "text" and
# "labels" columns.
df = pd.read_csv("data/bbc_text_cls.csv")
# Show the first rows as a quick check of what was loaded.
df.head()

# Tokenize the document and build Word -> Index mapping

In [None]:
# Run both tokenizers on the same sentence to compare what they return.
sample_sentence = "This is a test sentence."

# NLTK hands back the tokens as printed here.
nltk_tokens = nltk_word_tokenize(sample_sentence)
print(nltk_tokens)

# The spaCy tokenizer result has a length and yields items with a .text attribute.
spacy_tokens = spacy_word_tokenizer(sample_sentence)
print(len(spacy_tokens))
print([tok.text for tok in spacy_tokens])

In [None]:
def get_word2idx_mapping(docs, tokenizer):
    """Tokenize every document and assign each distinct token an integer id.

    Each document is lower-cased before being passed to ``tokenizer``. Tokens
    that expose a ``text`` attribute are replaced by that attribute; all other
    tokens are used as returned. Ids are handed out in order of first
    appearance, starting at 0.

    Args:
        docs: Iterable of strings.
        tokenizer: Callable taking a string and returning an iterable of tokens.

    Returns:
        A tuple ``(w2i, i2w, docs_tokenized)``: the token -> id dict, the
        inverse id -> token dict, and one list of token ids per document.
    """
    w2i, i2w = {}, {}
    docs_tokenized = []

    for doc in docs:
        token_ids = []
        for raw_token in tokenizer(doc.lower()):
            # Unwrap token objects that carry their string in `.text`.
            word = getattr(raw_token, "text", raw_token)

            word_id = w2i.get(word)
            if word_id is None:
                # First occurrence: the next free id equals the current vocabulary size.
                word_id = len(w2i)
                w2i[word] = word_id
                i2w[word_id] = word

            token_ids.append(word_id)

        docs_tokenized.append(token_ids)

    return w2i, i2w, docs_tokenized


# Build one vocabulary and one id-encoded corpus per tokenizer from the "text" column.
w2i_nltk, i2w_nltk, docs_tok_nltk = get_word2idx_mapping(df["text"], nltk_word_tokenize)
w2i_spacy, i2w_spacy, docs_tok_spacy = get_word2idx_mapping(
    df["text"], spacy_word_tokenizer
)

# Report how many distinct tokens each tokenizer produced.
print("Vocabulary size (nltk):", len(w2i_nltk))
print("Vocabulary size (spacy):", len(w2i_spacy))

# Calculate the term-frequency matrix (TF)

There are actually many ways to calculate the TF matrix. See [wikipedia](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Term_frequency). Here, we use the simple count of the terms as TF.

In [None]:
def get_tf_matrix(docs_tokenized, word2idx):
    """Count how often each token id occurs in each document.

    Args:
        docs_tokenized: Sequence of documents, each a sequence of integer
            token ids used as column indices.
        word2idx: Vocabulary mapping; only its length is used, to size the
            matrix.

    Returns:
        Array of shape ``(len(docs_tokenized), len(word2idx))`` whose entry
        ``[d, t]`` is the number of occurrences of id ``t`` in document ``d``.
    """
    n_docs, n_terms = len(docs_tokenized), len(word2idx)
    tf = np.zeros((n_docs, n_terms))

    # Iterating over the matrix yields row views, so the increments land in `tf`.
    for counts, token_ids in zip(tf, docs_tokenized):
        for token_id in token_ids:
            counts[token_id] += 1

    return tf


# One term-frequency matrix per tokenizer, each sized by its own vocabulary.
tf_nltk = get_tf_matrix(docs_tok_nltk, w2i_nltk)
tf_spacy = get_tf_matrix(docs_tok_spacy, w2i_spacy)

# Calculate the inverse document-frequency (IDF)

As with TF, there are multiple definitions of the inverse document frequency (IDF). See [wikipedia](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency).
Here, we use the smooth variant, $\log\left(\frac{N}{1 + n_t}\right) + 1$, which yields one weight per term.

In [None]:
def get_idf(tf):
    """Compute the smoothed inverse document frequency of every term.

    Uses ``idf_i = ln(n / (1 + n_i)) + 1``, where ``n`` is the number of rows
    (documents) of ``tf`` and ``n_i`` is the number of rows in which column
    ``i`` is greater than zero.

    Args:
        tf: 2-D term-frequency matrix with one row per document and one
            column per term.

    Returns:
        1-D array of shape ``(n_terms,)`` with one IDF weight per column of
        ``tf``.
    """
    # Total number of documents
    n = tf.shape[0]

    # Number of documents containing each word
    ni = (tf > 0).sum(axis=0)

    # Inverse document frequency (smooth)
    idf = np.log(n / (1 + ni)) + 1

    # Flatten to one dimension. ravel() is used instead of squeeze() because
    # squeeze() would also drop the term axis when there is exactly one term,
    # returning a 0-d array instead of shape (1,).
    return np.asarray(idf).ravel()


# One IDF vector per tokenizer, derived from the matching TF matrix.
idf_nltk = get_idf(tf_nltk)
idf_spacy = get_idf(tf_spacy)
# Print the shapes to see how many IDF weights each vocabulary produced.
print(idf_nltk.shape)
print(idf_spacy.shape)

# Combine TF and IDF

In [None]:
def get_tfidf(tf, idf):
    """Weight term frequencies by inverse document frequency.

    Args:
        tf: Term-frequency values.
        idf: Inverse-document-frequency values.

    Returns:
        The result of ``tf * idf``.
    """
    weighted = tf * idf
    return weighted

In [None]:
# Combine each tokenizer's TF matrix with its own IDF vector.
tfidf_nltk = get_tfidf(tf_nltk, idf_nltk)
tfidf_spacy = get_tfidf(tf_spacy, idf_spacy)

# Use sklearn's implementation

In [None]:
# Reference implementation. Its numbers are not expected to match the manual
# ones exactly: TfidfVectorizer applies its own tokenization and its own IDF
# smoothing, so both the vocabulary and the weights can differ.
sk_tfidf = TfidfVectorizer(lowercase=True, norm=None)
# toarray() yields a plain ndarray directly; no np.matrix round-trip is needed.
tfidf_sklearn = sk_tfidf.fit_transform(df["text"]).toarray()
# Column index -> term, mirroring the i2w_* mappings built by hand above.
i2w_sklearn = dict(enumerate(sk_tfidf.get_feature_names_out()))
# Keep the shape as the cell's last expression so the notebook displays it.
tfidf_sklearn.shape

# Test the system

In [None]:
def get_top5(i, i2w, tfidf):
    """Print the label, first text line and five top-scoring terms of one document.

    Reads the document from the module-level ``df`` (columns ``"labels"`` and
    ``"text"``).

    Args:
        i: Positional row index, used for both ``df`` and ``tfidf``.
        i2w: Mapping from column index to term.
        tfidf: Score matrix; row ``i`` holds the scores of document ``i``.
    """
    document = df.iloc[i]
    print("Label:", document["labels"])
    # Only the text up to the first newline is shown.
    print("Text:", document["text"].split("\n", 1)[0])
    print("Top 5 terms:")

    scores = tfidf[i]
    # Sorting the negated scores ascending gives the highest scores first.
    best_first = (-scores).argsort()[:5]

    for j in best_first:
        print(i2w[j], scores[j])


# Pick one random document and show its top terms under each implementation.
i = np.random.choice(len(df))

top5_inputs = (
    (i2w_nltk, tfidf_nltk),
    (i2w_spacy, tfidf_spacy),
    (i2w_sklearn, tfidf_sklearn),
)
for position, (index_to_word, weights) in enumerate(top5_inputs):
    # Blank line between reports, but not before the first one.
    if position:
        print()
    get_top5(i, index_to_word, weights)