In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Users\lilit\PycharmProjects\nlp-course-2025.1\Lilit Mnatsakanyan\data\bbc-text.csv")

In [2]:
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


##### 1.	Data Preprocessing:
###### a.	Clean the text data by removing punctuation, converting to lowercase, and removing stop words.
###### b.	Tokenize the text into individual words.


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lilit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lilit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lilit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

In [6]:
df['tokens'] = df['text'].apply(preprocess_text)


In [7]:
df

Unnamed: 0,category,text,tokens
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, say..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raids, box, office, ocean, twe..."
...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,"[cars, pull, us, retail, figures, us, retail, ..."
2221,politics,kilroy unveils immigration policy ex-chatshow ...,"[kilroy, unveils, immigration, policy, exchats..."
2222,entertainment,rem announce new glasgow concert us band rem h...,"[rem, announce, new, glasgow, concert, us, ban..."
2223,politics,how political squabbles snowball it s become c...,"[political, squabbles, snowball, become, commo..."


##### 2.	Implement Bag of Words:
###### a.	Create a function to build a vocabulary from the training set.
###### b.	Implement a function that converts a document into a BoW vector.
###### c.	Use your implementation to represent each document in both training and testing sets as a BoW vector.


In [8]:
from collections import Counter


In [9]:
def build_vocabulary(tokenized_texts):
    vocab = set()
    for tokens in tokenized_texts:
        vocab.update(tokens)
    return sorted(vocab)

vocab = build_vocabulary(df['tokens'])
vocab_index = {word: i for i, word in enumerate(vocab)}

In [10]:
len(vocab)

30190

In [11]:
def text_to_bow(tokens, vocab_index):
    vec = [0] * len(vocab_index)
    token_counts = Counter(tokens)
    for token, count in token_counts.items():
        if token in vocab_index:
            vec[vocab_index[token]] = count
    return vec

In [12]:
sample_vec = text_to_bow(df['tokens'][0], vocab_index)


In [13]:
sample_vec[:20]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [14]:
bow_vectors = [text_to_bow(tokens, vocab_index) for tokens in df['tokens']]


##### 3.	Implement TF-IDF:
###### a.	Create a function to calculate term frequency (TF) for each term in a document.
###### b.	Implement a function to calculate inverse document frequency (IDF) for each term in the corpus.
###### c.	Combine TF and IDF to create TF-IDF vectors for each document.


In [15]:
def compute_tf(tokens):
    tf = Counter(tokens)
    total_terms = len(tokens)
    return {word: count / total_terms for word, count in tf.items()}


In [16]:
from tqdm import tqdm
from math import log

def compute_idf(corpus):
    N = len(corpus)
    idf_dict = {}
    all_tokens = set([word for tokens in corpus for word in tokens])

    for word in tqdm(all_tokens, desc="Computing IDF"):
        df = sum(1 for tokens in corpus if word in tokens)
        idf_dict[word] = log(N / (1 + df))
    return idf_dict

In [17]:
idf_scores = compute_idf(df['tokens'])


Computing IDF: 100%|██████████| 30190/30190 [02:21<00:00, 213.69it/s]


In [18]:
def compute_tfidf(tokens, idf_scores):
    tf = compute_tf(tokens)
    tfidf = {word: tf[word] * idf_scores.get(word, 0) for word in tf}
    return tfidf

In [19]:
sample_tfidf = compute_tfidf(df['tokens'][0], idf_scores)


In [20]:
sample_tfidf

{'tv': 0.07044891009600367,
 'future': 0.010327813898301442,
 'hands': 0.016465864215777636,
 'viewers': 0.027274046752097318,
 'home': 0.008802506791119928,
 'theatre': 0.009425826004158352,
 'systems': 0.00826463580527434,
 'plasma': 0.013828183422409102,
 'highdefinition': 0.04265568908711416,
 'tvs': 0.02456738080343062,
 'digital': 0.012844039281540926,
 'video': 0.013028336065120553,
 'recorders': 0.02425023819341798,
 'moving': 0.008429597237307827,
 'living': 0.008606436302725735,
 'room': 0.009003589500653048,
 'way': 0.0069944866512508036,
 'people': 0.0176359341033261,
 'watch': 0.0335828280437538,
 'radically': 0.013045741822609263,
 'different': 0.006137656122257788,
 'five': 0.00434883535508087,
 'years': 0.0058833113222466395,
 'time': 0.005363375967899377,
 'according': 0.004659388701777986,
 'expert': 0.01066392227177854,
 'panel': 0.018364718825789772,
 'gathered': 0.010065342216108393,
 'annual': 0.006977832295195966,
 'consumer': 0.014390116469232488,
 'electronics'

##### 4.	Analysis:
###### a.	For a given category, find the top 10 words with the highest average TF-IDF scores.
###### b.	Identify words that have high TF scores but low IDF scores, and vice versa.


In [21]:
tqdm.pandas(desc="Computing TF-IDF vectors")


In [22]:
df['tfidf_scores'] = df['tokens'].progress_apply(lambda tokens: compute_tfidf(tokens, idf_scores))


Computing TF-IDF vectors: 100%|██████████| 2225/2225 [00:00<00:00, 9932.65it/s]


In [23]:
grouped_by_category = df.groupby('category')


In [24]:
for category_name, group in grouped_by_category:
    category_scores = Counter()
    for tfidf_dict in group['tfidf_scores']:
        category_scores.update(tfidf_dict)

    top_10_words = category_scores.most_common(10)

    print(f"category: {category_name.upper()}")
    for word, score in top_10_words:
        print(f"  - {word} (score: {score:.4f})")
    print("-" * 30)

category: BUSINESS
  - bn (score: 8.1105)
  - bank (score: 5.0100)
  - growth (score: 4.8450)
  - oil (score: 4.6176)
  - economy (score: 4.5411)
  - sales (score: 4.4416)
  - shares (score: 4.4120)
  - company (score: 4.2646)
  - us (score: 4.1522)
  - market (score: 4.1483)
------------------------------
category: ENTERTAINMENT
  - film (score: 9.4480)
  - best (score: 4.8221)
  - awards (score: 3.9099)
  - show (score: 3.8645)
  - music (score: 3.8244)
  - band (score: 3.7947)
  - festival (score: 3.5984)
  - award (score: 3.5448)
  - album (score: 3.4817)
  - actor (score: 3.2186)
------------------------------
category: POLITICS
  - mr (score: 6.9267)
  - labour (score: 6.9222)
  - blair (score: 5.8548)
  - party (score: 5.6749)
  - election (score: 5.6741)
  - government (score: 4.3815)
  - brown (score: 4.0844)
  - howard (score: 3.6134)
  - minister (score: 3.4789)
  - lord (score: 3.0356)
------------------------------
category: SPORT
  - england (score: 5.1521)
  - game (scor

In [25]:
idf_series = pd.Series(idf_scores)

low_idf_words = idf_series.sort_values(ascending=True).head(10)


In [26]:
low_idf_words

said     0.163709
also     0.563895
would    0.662607
one      0.775064
year     0.798757
new      0.820981
last     0.895167
could    0.932146
us       0.980080
two      1.004324
dtype: float64

In [27]:
high_idf_words = idf_series.sort_values(ascending=False).head(10)


In [28]:
high_idf_words

stalls         7.014365
lapete         7.014365
numerically    7.014365
thick          7.014365
phi            7.014365
strangers      7.014365
muggings       7.014365
revolving      7.014365
milverton      7.014365
kilobit        7.014365
dtype: float64

### for me

In [29]:
import numpy as np

def scores_to_vector(scores, vocab_index):
    vec = np.zeros(len(vocab_index))
    for word, score in scores.items():
        if word in vocab_index:
            vec[vocab_index[word]] = score
    return vec

In [30]:
tqdm.pandas(desc="Converting scores to vectors")
df['tfidf_vector'] = df['tfidf_scores'].progress_apply(lambda s: scores_to_vector(s, vocab_index))

Converting scores to vectors: 100%|██████████| 2225/2225 [00:00<00:00, 9418.76it/s]


In [31]:
category_centroids = {}
for category_name, group in df.groupby('category'):
    category_vectors = np.array(group['tfidf_vector'].tolist())
    centroid = np.mean(category_vectors, axis=0)
    category_centroids[category_name] = centroid

In [32]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0

    return dot_product / (norm_vec1 * norm_vec2)

In [33]:
def predict_topic(new_text, vocab_index, idf_scores, category_centroids):
    processed_tokens = preprocess_text(new_text)

    tfidf_scores_new = compute_tfidf(processed_tokens, idf_scores)
    new_vector = scores_to_vector(tfidf_scores_new, vocab_index)

    similarities = {}
    for category, centroid_vec in category_centroids.items():
        sim = cosine_similarity(new_vector, centroid_vec)
        similarities[category] = sim

    if not similarities:
        return "Unknown"

    predicted_category = max(similarities, key=similarities.get)

    return predicted_category

In [34]:
text_sport = "The 18-year-old has dealt with injury issues ever since, resulting in Hansi Flick publicly slamming Spain’s handling of Yamal’s fitness. Now, Barcelona’s No. 10 won’t be able to join his fellow clubmates in La Roja’s October camp."
text_tech = "Intel claims the first configurations will ship before the end of the year and then more broadly starting in January 2026. We don't have a complete lineup yet, but Panther Lake will include up to 16-core CPUs with a \"more than 50 percent faster CPU\" performance over the previous generation. Intel claims that the new integrated GPU with have up to 12 GPU cores that are also 50 percent faster than the prior generation, boosted by a new architecture."
text_politics = "President Donald Trump said Saturday his administration will make sure military personnel get paid Wednesday when their checks are due, despite the partial government shutdown. The Oct. 15 pay date for about 2.1 million troops had become one of the biggest near-term inflection points in the shutdown standoff, which has gone on for 11 days so far. In prior funding lapses that dragged on for weeks, military pay wasn’t an issue because the troops were paid through separate spending bills. "


In [35]:
predicted_cat_sport = predict_topic(text_sport, vocab_index, idf_scores, category_centroids)
predicted_cat_tech = predict_topic(text_tech, vocab_index, idf_scores, category_centroids)
predicted_cat_politics = predict_topic(text_politics, vocab_index, idf_scores, category_centroids)


In [36]:

print(f"Text: '{text_sport}'")
print(f"Predicted Topic: {predicted_cat_sport.upper()}\n")


Text: 'The 18-year-old has dealt with injury issues ever since, resulting in Hansi Flick publicly slamming Spain’s handling of Yamal’s fitness. Now, Barcelona’s No. 10 won’t be able to join his fellow clubmates in La Roja’s October camp.'
Predicted Topic: SPORT



In [37]:

print(f"Text: '{text_tech}'")
print(f"Predicted Topic: {predicted_cat_tech.upper()}\n")


Text: 'Intel claims the first configurations will ship before the end of the year and then more broadly starting in January 2026. We don't have a complete lineup yet, but Panther Lake will include up to 16-core CPUs with a "more than 50 percent faster CPU" performance over the previous generation. Intel claims that the new integrated GPU with have up to 12 GPU cores that are also 50 percent faster than the prior generation, boosted by a new architecture.'
Predicted Topic: TECH



In [38]:

print(f"Text: '{text_politics}'")
print(f"Predicted Topic: {predicted_cat_politics.upper()}\n")

Text: 'President Donald Trump said Saturday his administration will make sure military personnel get paid Wednesday when their checks are due, despite the partial government shutdown. The Oct. 15 pay date for about 2.1 million troops had become one of the biggest near-term inflection points in the shutdown standoff, which has gone on for 11 days so far. In prior funding lapses that dragged on for weeks, military pay wasn’t an issue because the troops were paid through separate spending bills. '
Predicted Topic: POLITICS

