# BDA Assignment #2: MapReduce
## Calculating the Term Frequency (TF), and Inverse Document Frequency (IDF).


#### Group members: 
Aaqib Ahmed Nazir (i22-1920),  
Arhum Khan (i22-1967), 
Ammar Khasif (i22-1968)

##### Section: DS-D   

#### Libraries Used

In [22]:
import nltk
import pandas as pd
from nltk.stem import WordNetLemmatizer


### Reading the preprocessed data

In [3]:
# Taking a chunk of 10000 rows from the dataset
file_path = "./Dataset/simplified_data.csv"
dataset = pd.read_csv(file_path)

display(dataset.head())

Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TITLE,SECTION_TEXT
0,0,Anarchism,Introduction,anarchism anarchism is the only thing that mat...
1,0,Anarchism,Origins,the only thing matters except for anarchism is...
2,0,Anarchism,History,historical anarchism does not matter in this w...
3,1,World,Origins,world is the only thing that matters to people
4,1,World,History,world history is not the same as historical an...


### Creating a vocabulary of all the words in the documents and assigning a unique ID to each word

In [4]:
# creating a vocabulary
vocabulary = set()

for index, row in dataset.iterrows():
    words = row['SECTION_TEXT'].split()

    for word in words:
        if word not in vocabulary:
            vocabulary.add(word)

# saving the vocabulary to a file
with open("./Dataset/vocabulary.txt", 'w', encoding='utf-8') as f:
    for word in vocabulary:
        f.write(word + '\n')

# assigning an ID to each word
word_to_id = {word: idx for idx, word in enumerate(vocabulary)}

# saving the word_to_id mapping to a file
with open("./Dataset/vocab_id.csv", 'w', encoding='utf-8') as f:
    for word, idx in word_to_id.items():
        f.write(f"{word},{idx}\n")

### Term Frequency:

Term Frequency (TF) is a measure of how often a word appears in a document. It is calculated as the number of times a word appears in a document divided by the total number of words in the document.

In [5]:
article_word_counts = {}
prev_article_id = None
word_count = {}

for index, row in dataset.iterrows():
    # get the word counts for each article
    article_id = row['ARTICLE_ID']
    words = row['SECTION_TEXT'].split()

    if article_id != prev_article_id:
        word_count = {}
        prev_article_id = article_id
    
    for word in words:
        word_id = word_to_id[word]
        if word_id in word_count:
            word_count[word_id] += 1
        else:
            word_count[word_id] = 1
    
    # save the word counts for each article
    article_word_counts[article_id] = [(word_id, count) for word_id, count in word_count.items()]

    
# saving term frequencies to a csv file
article_word_counts_df = pd.DataFrame([(article_id, word_id, count) for article_id, word_counts in article_word_counts.items() for word_id, count in word_counts], columns=['article_id', 'word_id', 'frequency'])
article_word_counts_df.to_csv('./Dataset/term_frequencies.csv', index=False)

### IDF:

For a given word, the Inverse Document Frequency (IDF) is the logarithm of the total number of documents divided by the number of documents containing the word.

In [6]:
word_frequency = {}

# count the number of articles in which each word appears
for article_id, word_counts in article_word_counts.items():
    for word_id, count in word_counts:
        if word_id in word_frequency:
            word_frequency[word_id] += 1
        else:
            word_frequency[word_id] = 1
            
# save the word_frequency dictionary to a csv file
word_frequency_df = pd.DataFrame(list(word_frequency.items()), columns=['word_id', 'frequency'])
word_frequency_df.to_csv('./Dataset/inverse_doc_frequencies.csv', index=False)

#### Calculating the TF-IDF Weights:

In [7]:
# load the tf and idf files
tf_df = pd.read_csv("./Dataset/term_frequencies.csv")
idf_df = pd.read_csv("./Dataset/inverse_doc_frequencies.csv")

weights = []
for index, row in tf_df.iterrows():
    # get tf and idf values for each word
    article_id = row['article_id']
    word_id = row['word_id']
    tf = row['frequency']
    idf = idf_df[idf_df['word_id'] == word_id]['frequency'].values[0]

    weights.append((article_id, word_id, tf/idf))
    
# save the tf_idf values to a csv file
tf_idf_df = pd.DataFrame(weights, columns=['article_id', 'word_id', 'tf_idf'])
tf_idf_df.to_csv('./Dataset/weights.csv', index=False)

### Calculating the sparse vector representation for each document

In [8]:
# initialize the sparse vector dictionary
sparse_vectors = {}

# iterate over the tf_idf dataframe
for index, row in tf_idf_df.iterrows():
    article_id = row['article_id']
    word_id = row['word_id']
    tf_idf = row['tf_idf']

    # if the article_id is not in the dictionary, add it
    if article_id not in sparse_vectors:
        sparse_vectors[article_id] = {}

    # add the word_id and its tf_idf weight to the inner dictionary
    sparse_vectors[article_id][word_id] = tf_idf

# print the sparse vector representation for each document
for article_id, vector in sparse_vectors.items():
    print(f"Article ID: {article_id}, Vector: {vector}")


Article ID: 0.0, Vector: {8.0: 2.0, 10.0: 1.0, 21.0: 1.5, 12.0: 1.5, 9.0: 1.5, 22.0: 1.0, 4.0: 1.5, 2.0: 1.0, 0.0: 1.0, 11.0: 2.0, 18.0: 1.0, 6.0: 1.0, 13.0: 0.5, 1.0: 1.0, 17.0: 0.5, 7.0: 1.0, 14.0: 1.0, 23.0: 1.0, 20.0: 0.5}
Article ID: 1.0, Vector: {20.0: 1.0, 10.0: 1.0, 21.0: 1.0, 12.0: 0.5, 9.0: 0.5, 22.0: 0.5, 4.0: 0.5, 5.0: 1.0, 16.0: 1.0, 15.0: 1.0, 17.0: 0.5, 19.0: 1.0, 3.0: 1.0, 13.0: 0.5, 8.0: 0.5}


### Converting query to a vector using the TF-IDF weights

In [23]:
# query
query = "Travels around the wolrd"

# clean the query by removing punctuation, converting to lowercase and splitting into words
query = query.replace(',', '').replace('.', '').lower().split()

lemmatizer = WordNetLemmatizer()

# Lemmatize the query words
lemmatized_query = [lemmatizer.lemmatize(word) for word in query]

# get the word ids for the lemmatized query words
lemmatized_query_word_ids = []
for word in lemmatized_query:
    if word in word_to_id:
        lemmatized_query_word_ids.append(word_to_id[word])

# get the tf_idf values for the lemmatized query words
lemmatized_query_weights = []
for word_id in lemmatized_query_word_ids:
    if word_id in word_frequency:
        idf = word_frequency[word_id]
        lemmatized_query_weights.append((word_id, 1/idf))

print(lemmatized_query_weights)


[(21, 0.5)]


### Calculating the similarity between the query vector and the document vectors using scalar product

In [24]:
# Calculating the similarity between the query vector and the document vectors using scalar product
similarities = {}
for article_id, vector in sparse_vectors.items():
    similarity = 0
    for word_id, weight in lemmatized_query_weights:
        if word_id in vector:
            similarity += weight * vector[word_id]

    similarities[article_id] = similarity

# sort the similarities
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# print the top 5 most similar articles
for article_id, similarity in sorted_similarities[:5]:
    print(f"Article {int(article_id)}: {similarity}")

Article 0: 0.75
Article 1: 0.5
