# BDA Assignment #2: MapReduce
## Calculating the Term Frequency (TF), and Inverse Document Frequency (IDF).


#### Group members: 
Aaqib Ahmed Nazir (i22-1920),  
Arhum Khan (i22-1967), 
Ammar Khasif (i22-1968)

##### Section: DS-D   

#### Libraries Used

In [35]:
import pandas as pd

### Reading the preprocessed data

In [51]:
# Taking a chunk of 10000 rows from the dataset
file_path = "./Dataset/simplified_data.csv"
dataset = pd.read_csv(file_path)

display(dataset.head())

Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TITLE,SECTION_TEXT
0,0,Anarchism,Introduction,anarchism anarchism is the only thing that mat...
1,0,Anarchism,Origins,the only thing matters except for anarchism is...
2,0,Anarchism,History,historical anarchism does not matter in this w...
3,1,World,Origins,world is the only thing that matters to people
4,1,World,History,world history is not the same as historical an...


### Creating a vocabulary of all the words in the documents and assigning a unique ID to each word

In [52]:
# creating a vocabulary
vocabulary = set()

for index, row in dataset.iterrows():
    words = row['SECTION_TEXT'].split()

    for word in words:
        if word not in vocabulary:
            vocabulary.add(word)

# saving the vocabulary to a file
with open("./Dataset/vocabulary.txt", 'w', encoding='utf-8') as f:
    for word in vocabulary:
        f.write(word + '\n')

# assigning an ID to each word
word_to_id = {word: idx for idx, word in enumerate(vocabulary)}

# saving the word_to_id mapping to a file
with open("./Dataset/vocab_id.csv", 'w', encoding='utf-8') as f:
    for word, idx in word_to_id.items():
        f.write(f"{word},{idx}\n")

### Term Frequency:

Term Frequency (TF) is a measure of how often a word appears in a document. It is calculated as the number of times a word appears in a document divided by the total number of words in the document.

In [56]:
article_word_counts = {}
word_count = {}

for index, row in dataset.iterrows():
    # get the word counts for each article
    article_id = row['ARTICLE_ID']
    words = row['SECTION_TEXT'].split()
    
    for word in words:
        word_id = word_to_id[word]
        if word_id in word_count:
            word_count[word_id] += 1
        else:
            word_count[word_id] = 1
    
    # save the word counts for each article
    article_word_counts[article_id] = [(word_id, count) for word_id, count in word_count.items()]

    
# saving term frequencies to a csv file
article_word_counts_df = pd.DataFrame([(article_id, word_id, count) for article_id, word_counts in article_word_counts.items() for word_id, count in word_counts], columns=['article_id', 'word_id', 'frequency'])
article_word_counts_df.to_csv('./Dataset/term_frequencies.csv', index=False)

### IDF:

For a given word, the Inverse Document Frequency (IDF) is the logarithm of the total number of documents divided by the number of documents containing the word.

In [58]:
word_frequency = {}

# count the number of articles in which each word appears
for article_id, word_counts in article_word_counts.items():
    for word_id, count in word_counts:
        if word_id in word_frequency:
            word_frequency[word_id] += 1
        else:
            word_frequency[word_id] = 1
            
# save the word_frequency dictionary to a csv file
word_frequency_df = pd.DataFrame(list(word_frequency.items()), columns=['word_id', 'frequency'])
word_frequency_df.to_csv('./Dataset/inverse_doc_frequencies.csv', index=False)

#### Calculating the TF-IDF Weights:

In [40]:
# load the tf and idf files
tf_df = pd.read_csv("./Dataset/term_frequencies.csv")
idf_df = pd.read_csv("./Dataset/inverset_doc_frequencies.csv")

weights = []
for index, row in tf_df.iterrows():
    # get tf and idf values for each word
    article_id = row['article_id']
    word_id = row['word_id']
    tf = row['frequency']
    idf = idf_df[idf_df['word_id'] == word_id]['frequency'].values[0]

    weights.append((article_id, word_id, tf/idf))
    
# save the tf_idf values to a csv file
tf_idf_df = pd.DataFrame(weights, columns=['article_id', 'word_id', 'tf_idf'])
tf_idf_df.to_csv('./Dataset/weights.csv', index=False)

### Converting query to a vector using the TF-IDF weights

In [60]:
# query
query = "does historical anarchism matter"

# clean the query by removing punctuation, converting to lowercase and splitting into words
query = query.replace(',', '').replace('.', '').lower().split()

# get the word ids for the query words
query_word_ids = []
for word in query:
    if word in word_to_id:
        query_word_ids.append(word_to_id[word])


# get the tf_idf values for the query words
query_weights = []
for word_id in query_word_ids:
    if word_id in word_frequency:
        idf = word_frequency[word_id]

        query_weights.append((word_id, 1/idf))

query_weights

[(9, 0.5), (8, 0.5), (13, 0.5), (15, 0.5)]