In [1]:
import pandas as pd
from google.colab import files

# Upload the dataset
uploaded = files.upload()
data = pd.read_csv("bbc-text.csv", encoding="latin1")

print(data.head())


# ============================
# TEXT PREPROCESSING
# ============================

import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

def clean_text(sentence):
    sentence = sentence.translate(str.maketrans("", "", string.punctuation))

    sentence = sentence.lower()

    sentence = re.sub(r"\d+", " ", sentence)

    words = sentence.split()

    filtered = []
    for w in words:
        if w not in stop_words:
            filtered.append(w)

    return filtered

data["tokens"] = data["text"].apply(clean_text)
print(data.head())


# ============================
# BUILD VOCABULARY
# ============================

all_words = set()

for token_list in data["tokens"]:
    for w in token_list:
        all_words.add(w)

sorted_vocab = sorted(all_words)

vocab = {}
for i, w in enumerate(sorted_vocab):
    vocab[w] = i

print("Vocabulary size:", len(vocab))
print("First 20 words:", list(vocab.items())[:20])


# ============================
# BAG OF WORDS CREATION
# ============================

import numpy as np

bow_vectors = []

for token_list in data["tokens"]:
    vec = [0] * len(vocab)

    for w in token_list:
        if w in vocab:
            index = vocab[w]
            vec[index] += 1

    bow_vectors.append(vec)

bow_vectors = np.array(bow_vectors)
data["bow_vector"] = list(bow_vectors)

index_to_word_bow = {}
for word, idx in vocab.items():
    index_to_word_bow[idx] = word

first_bow = bow_vectors[0]

print("\nFirst 20 words in BOW of first document:")
for i in range(20):
    print(index_to_word_bow[i], first_bow[i])


# ============================
# TERM FREQUENCY
# ============================

def tf_calculation(tokens, vocab):
    length = len(vocab)
    tf_vec = np.zeros(length)
    total = len(tokens)

    for w in tokens:
        if w in vocab:
            idx = vocab[w]
            tf_vec[idx] += 1

    if total != 0:
        tf_vec = tf_vec / total

    return tf_vec

data["tf_vector"] = data["tokens"].apply(lambda x: tf_calculation(x, vocab))

index_to_word = {}
for w, j in vocab.items():
    index_to_word[j] = w

first_tf = data["tf_vector"].iloc[0]

print("\nFirst 20 TF values of first document:")
for i in range(20):
    print(index_to_word[i], round(first_tf[i], 4))


# ============================
# IDF CALCULATION
# ============================

import math

def idf_calculation(all_docs, vocab):
    N = len(all_docs)
    idf_list = []

    for word in vocab:
        df_count = 0
        for tokens in all_docs:
            if word in tokens:
                df_count += 1

        idf_score = math.log((N + 1) / (df_count + 1)) + 1
        idf_list.append(idf_score)

    return np.array(idf_list)

idf_vector = idf_calculation(data["tokens"], vocab)

print("\nFirst 20 IDF values:")
for i in range(20):
    print(index_to_word[i], round(idf_vector[i], 4))


# ============================
# TF-IDF
# ============================

def tfidf(tf_vec, idf_vec):
    return tf_vec * idf_vec

data["tfidf_vector"] = data["tf_vector"].apply(lambda x: tfidf(x, idf_vector))

first_tfidf = data["tfidf_vector"].iloc[0]

top_idx = np.argsort(first_tfidf)[-20:][::-1]

print("\nTop 20 TF-IDF words in first document:")
for idx in top_idx:
    print(index_to_word[idx], round(first_tfidf[idx], 4))


# ============================
# CATEGORY-BASED TF-IDF
# ============================

categories = data["category"].unique()

for cat in categories:
    print("\nCategory:", cat)

    group = data[data["category"] == cat]

    tfidf_matrix = np.stack(group["tfidf_vector"].values)

    avg_tfidf = tfidf_matrix.mean(axis=0)

    sorted_idx = np.argsort(avg_tfidf)[::-1]

    for i in range(10):
        w = index_to_word[sorted_idx[i]]
        s = round(avg_tfidf[sorted_idx[i]], 4)
        print(" ", w, s)


# ============================
# GLOBAL TF & SPECIAL WORDS
# ============================

bow_matrix = np.stack(data["bow_vector"].values)
global_tf = np.sum(bow_matrix, axis=0) / np.sum(bow_matrix)

tfidf_df = pd.DataFrame({
    "word": [index_to_word[i] for i in range(len(vocab))],
    "global_tf": global_tf,
    "idf": idf_vector
})

high_tf_low_idf = tfidf_df.sort_values(["global_tf", "idf"], ascending=[False, True]).head(10)
low_tf_high_idf = tfidf_df.sort_values(["global_tf", "idf"], ascending=[True, False]).head(10)

print("\nHigh TF & Low IDF:")
for _, row in high_tf_low_idf.iterrows():
    print(row["word"], row["global_tf"], row["idf"])

print("\nLow TF & High IDF:")
for _, row in low_tf_high_idf.iterrows():
    print(row["word"], row["global_tf"], row["idf"])


# ============================
# SAVE OUTPUT
# ============================

output = data[["category", "text", "tokens"]].copy()

high_words = [w for w in high_tf_low_idf["word"]]
low_words = [w for w in low_tf_high_idf["word"]]

output["High TF/Low IDF"] = ", ".join(high_words)
output["Low TF/High IDF"] = ", ".join(low_words)

output.to_csv("cleaned_bbc_text.csv", index=False, encoding="utf8")

print("\nFile saved: cleaned_bbc_text.csv")


Saving bbc-text.csv to bbc-text.csv
        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


        category                                               text  \
0           tech  tv future in the hands of viewers with home th...   
1       business  worldcom boss  left books alone  former worldc...   
2          sport  tigers wary of farrell  gamble  leicester say ...   
3          sport  yeading face newcastle in fa cup premiership s...   
4  entertainment  ocean s twelve raids box office ocean s twelve...   

                                              tokens  
0  [tv, future, hands, viewers, home, theatre, sy...  
1  [worldcom, boss, left, books, alone, former, w...  
2  [tigers, wary, farrell, gamble, leicester, say...  
3  [yeading, face, newcastle, fa, cup, premiershi...  
4  [ocean, twelve, raids, box, office, ocean, twe...  
Vocabulary size: 30171
First 20 words: [('aa', 0), ('aaa', 1), ('aaas', 2), ('aac', 3), ('aadc', 4), ('aaliyah', 5), ('aaltra', 6), ('aamir', 7), ('aan', 8), ('aara', 9), ('aarhus', 10), ('aaron', 11), ('abacus', 12), ('abandon', 13), ('abando