In [48]:
#Setting up Environment
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the punkt_tab model



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [49]:
# Loading dictionary
# One entry per line. Surrounding whitespace is stripped and blank lines are
# skipped so the empty string can never become a dictionary entry. The encoding
# is explicit so the result does not depend on the platform default.
with open('dictionary.txt', 'r', encoding='utf-8') as file:
    dictionary = {entry for entry in map(str.strip, file) if entry}
print(f"Loaded {len(dictionary)} words into the dictionary.")


Loaded 1000 words into the dictionary.


In [50]:
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Shared across calls: one stemmer instance and a set for O(1) stop-word lookups.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_text(text, dictionary):
    """
    Convert raw text into a list of stemmed tokens that are dictionary entries.

    Steps, in order: lowercase, remove ASCII punctuation, tokenize, drop
    English stop words, Porter-stem, keep only stems found in the dictionary.

    Args:
        text (str): Raw document text.
        dictionary (set): Allowed vocabulary; only membership tests are performed.
    Returns:
        list: Stemmed tokens in document order (duplicates kept), each one a
        member of `dictionary`.
    """
    text = text.lower()

    # Delete every character listed in string.punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = nltk.word_tokenize(text)

    stems = [stemmer.stem(word) for word in tokens if word not in stop_words]

    # Filter AFTER stemming. Filtering the raw word first would emit stems that
    # are not dictionary entries and would drop inflected forms whose stem is
    # in the dictionary. Downstream TF-IDF looks scores up by dictionary word,
    # so every returned token must itself be a dictionary entry.
    processed_words = [stem for stem in stems if stem in dictionary]

    return processed_words


In [51]:
# Sample Testing
# Smoke test: the sentence mixes stop words, numbers, doubled spaces and
# repeated punctuation so the effect of each preprocessing step is visible.
sample_text = "The quick brown fox buy jumps apple over the  16 loss lazy dog.  campaign,, Foxes are faster than 50 dogs!"
preprocessed_words = preprocess_text(sample_text, dictionary)
print("Preprocessed Words:", preprocessed_words)


Preprocessed Words: ['brown', 'buy', '16', 'loss', 'campaign', '50']


In [52]:
import pandas as pd

# Load the CSV file into a DataFrame (path is relative to the working directory)
dataset = pd.read_csv('24_train_3.csv')

# Preview the first rows to check the columns that were read
print(dataset.head())


   ArticleId                                               Text  Category
0       1429  sfa awaits report over mikoliunas the scottish...     sport
1       1896  parmalat to return to stockmarket parmalat  th...  business
2       1633  edu blasts arsenal arsenal s brazilian midfiel...     sport
3       2178  henman decides to quit davis cup tim henman ha...     sport
4        194  french suitor holds lse meeting european stock...  business


In [53]:
# Run the preprocessing on every article; each cell of the new column holds
# the token list returned for that row's 'Text'.
dataset['Preprocessed_Text'] = dataset['Text'].apply(preprocess_text, args=(dictionary,))

# Preview the identifying columns next to the processed tokens
print(dataset[['ArticleId', 'Category', 'Preprocessed_Text']].head())


   ArticleId  Category                                  Preprocessed_Text
0       1429     sport  [report, scottish, report, 20, award, defeat, ...
1       1896  business  [return, went, back, stock, firm, 2003, eight,...
2       1633     sport  [hit, club, new, contract, deal, next, summer,...
3       2178     sport  [quit, cup, great, britain, cup, team, made, c...
4        194  business  [french, european, stock, market, met, london,...


In [37]:
# Saving the preprocessed data (all columns, without the DataFrame index).
# NOTE(review): 'Preprocessed_Text' holds Python lists, which CSV presumably
# stores as their string form — confirm readers of this file parse them back.
dataset.to_csv('preprocessed_dataset.csv', index=False)


Define TFIDF Functions

Compute Term Frequency (TF):


In [54]:
import numpy as np
from collections import Counter

def compute_tf(word_list):
    """
    Compute max-normalized term frequency for a document.

    Each word's count is divided by the count of the most frequent word in
    the same document, so the most frequent word(s) get a TF of 1.0.

    Args:
        word_list (list): List of preprocessed words in a document.
    Returns:
        dict: Dictionary of word -> TF value. Empty when `word_list` is empty.
    """
    term_count = Counter(word_list)
    if not term_count:
        # max() raises ValueError on an empty sequence; a document with no
        # retained tokens simply has no term frequencies.
        return {}
    max_count = max(term_count.values())
    return {word: count / max_count for word, count in term_count.items()}


Compute Inverse Document Frequency (IDF):

In [55]:
def compute_idf(documents):
    """
    Compute the inverse document frequency of every word in the corpus.

    IDF is log10(N / df), where N is the number of documents and df is the
    number of documents containing the word at least once.

    Args:
        documents (list): List of lists, where each sublist contains words in a document.
    Returns:
        dict: Dictionary of word -> IDF value.
    """
    total_docs = len(documents)
    # set(doc) collapses repeats so a word counts once per document.
    doc_frequency = Counter(term for doc in documents for term in set(doc))

    idf = {}
    for term, containing_docs in doc_frequency.items():
        idf[term] = np.log10(total_docs / containing_docs)
    return idf


Compute TFIDF Matrix:

In [56]:
def compute_tfidf(documents, dictionary):
    """
    Compute the TFIDF matrix for all documents.

    Row i corresponds to documents[i] and column j to dictionary[j]. Each
    cell is TF * IDF rounded to 4 decimals; words absent from a document,
    or absent from the whole corpus, score 0.

    Args:
        documents (list): List of preprocessed documents (lists of words).
        dictionary (list): Ordered list of words defining the matrix columns.
    Returns:
        np.array: Float TFIDF matrix with shape (num_documents, num_words),
        including when `documents` is empty.
    """
    idf = compute_idf(documents)
    tfidf_matrix = []

    for doc in documents:
        # A document with no retained words has no term frequencies; give it
        # an all-zero row rather than letting the TF computation fail.
        tf = compute_tf(doc) if len(doc) else {}
        row = [round(tf.get(word, 0) * idf.get(word, 0), 4) for word in dictionary]
        tfidf_matrix.append(row)

    # Explicit dtype and shape keep the result a 2-D float array even when
    # there are no documents or every product is the integer 0.
    return np.array(tfidf_matrix, dtype=float).reshape(len(documents), len(dictionary))


Compute the TFIDF Matrix

In [57]:
# Extract preprocessed documents as a list
documents = dataset['Preprocessed_Text'].tolist()

# Fix a deterministic column order for the matrix (a set has no stable order).
# NOTE(review): this is sorted order, not the line order of dictionary.txt —
# confirm which order consumers of matrix.txt expect.
dictionary_list = sorted(dictionary)

# Compute TFIDF matrix
tfidf_matrix = compute_tfidf(documents, dictionary_list)

# Saving the matrix to a file: comma-separated, 4 decimals per value
np.savetxt('matrix.txt', tfidf_matrix, delimiter=',', fmt='%.4f')


Frequency Analysis

In [58]:
from collections import defaultdict

def compute_word_frequencies_by_category(dataset):
    """
    Count how often each word occurs within every category.

    Args:
        dataset (pd.DataFrame): DataFrame with columns 'Category' and 'Preprocessed_Text'.
    Returns:
        dict: A defaultdict mapping each category to a Counter of word -> count,
        accumulated over all documents of that category.
    """
    category_frequencies = defaultdict(Counter)

    # Walk the two columns in lockstep; each document adds its words to the
    # running Counter of its own category.
    labels = dataset['Category']
    token_lists = dataset['Preprocessed_Text']
    for label, tokens in zip(labels, token_lists):
        category_frequencies[label].update(tokens)

    return category_frequencies

# Computing word frequencies
word_frequencies_by_category = compute_word_frequencies_by_category(dataset)

# Extracting top 3 most frequent words for each category
# (most_common returns (word, count) pairs, highest count first; dict() keeps that order)
top_frequent_words = {
    category: dict(frequencies.most_common(3))
    for category, frequencies in word_frequencies_by_category.items()
}

# Saving to JSON as {category: {word: count}}
import json
with open('frequency.json', 'w') as f:
    json.dump(top_frequent_words, f, indent=4)


TFIDF Analysis

In [59]:
def compute_tfidf_scores_by_category(tfidf_matrix, dataset, dictionary_list):
    """
    Find the three words with the highest mean TFIDF score in each category.

    Args:
        tfidf_matrix (np.array): The TFIDF matrix, one row per document in
            the same order as the rows of `dataset`.
        dataset (pd.DataFrame): DataFrame with 'Category'.
        dictionary_list (list): Words labelling the matrix columns, in column order.
    Returns:
        dict: Maps each category to {word: mean score rounded to 4 decimals}
        for its top three words, highest score first.
    """
    # Group row positions (not index labels) by category.
    rows_by_category = {}
    for position, label in enumerate(dataset['Category']):
        rows_by_category.setdefault(label, []).append(position)

    def score_of(word_and_score):
        return word_and_score[1]

    category_scores = {}
    for label, positions in rows_by_category.items():
        # Column-wise mean over just this category's documents.
        mean_scores = tfidf_matrix[positions].mean(axis=0)
        # The sort is stable, so equal scores keep dictionary order.
        ranked = sorted(zip(dictionary_list, mean_scores), key=score_of, reverse=True)
        category_scores[label] = {word: round(score, 4) for word, score in ranked[:3]}

    return category_scores

# Computing TFIDF scores: top three words per category by mean TFIDF
top_tfidf_words = compute_tfidf_scores_by_category(tfidf_matrix, dataset, dictionary_list)


# Saving to JSON as {category: {word: score}}
with open('scores.json', 'w') as f:
    json.dump(top_tfidf_words, f, indent=4)
