# BDA Assignment #2: MapReduce
## Calculating the Term Frequency (TF), and Inverse Document Frequency (IDF).


#### Group members: 
Aaqib Ahmed Nazir (i22-1920),  
Arhum Khan (i22-1967), 
Ammar Khasif (i22-1968)

##### Section: DS-D   

#### Libraries Used

In [1]:
import pandas as pd

### Reading the preprocessed data

In [11]:
# Load the preprocessed sections CSV into a DataFrame. No row limit
# (nrows / chunksize) is passed, so every row in the file is read.
file_path = "./Dataset/data.csv"
dataset = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to confirm the columns loaded as expected.
display(dataset.head(n=5))

Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TITLE,SECTION_TEXT
0,0,Anarchism,Introduction,anarchism political philosophy advocates selfg...
1,0,Anarchism,Etymology and terminology,term anarchism compound word composed word ana...
2,0,Anarchism,History,zzorigins woodcut diggers document william eve...
3,0,Anarchism,Anarchist schools of thought,portrait philosopher pierrejoseph proudhon 180...
4,0,Anarchism,Internal issues and debates,consistent anarchist values controversial subj...


### Creating a vocabulary of all the words in the documents and assigning a unique ID to each word

In [13]:
# Build the vocabulary: every distinct whitespace-separated token that occurs
# in any SECTION_TEXT value.
vocabulary = set()

for section_text in dataset['SECTION_TEXT']:
    # pandas reads an empty CSV field as NaN (a float), which has no .split();
    # skip such rows instead of crashing the whole cell.
    if not isinstance(section_text, str):
        continue
    # set.update() ignores duplicates, so no explicit membership test is needed.
    vocabulary.update(section_text.split())

# A set of strings has no stable iteration order across interpreter runs
# (string hashing is randomised by default), so sort once and use that order
# both for the vocabulary file and for ID assignment. This makes the word IDs
# reproducible under Restart Kernel -> Run All.
sorted_vocabulary = sorted(vocabulary)

# saving the vocabulary to a file (one word per line)
with open("./Dataset/vocabulary.txt", 'w', encoding='utf-8') as f:
    for word in sorted_vocabulary:
        f.write(word + '\n')

# assigning an ID to each word: IDs are 0..len(vocabulary)-1 in sorted order
word_to_id = {word: idx for idx, word in enumerate(sorted_vocabulary)}

# saving the word_to_id mapping to a file as "word,id" lines (no header row)
with open("./Dataset/vocab_id.csv", 'w', encoding='utf-8') as f:
    for word, idx in word_to_id.items():
        f.write(f"{word},{idx}\n")

### Term Frequency:

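Term Frequency (TF) is a measure of how often a word appears in a document. It is calculated as the number of times a word appears in a document divided by the total number of words in the document. Note: the cell below stores the raw occurrence count for each (article, word) pair; it does not divide by the document's total word count.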

In [14]:
# Count how many times each word ID occurs in each article.
#
# The dataset holds one row per *section*, and several rows share the same
# ARTICLE_ID, so counts must be accumulated across all rows of an article.
# (Assigning a fresh per-row dict to article_word_counts[article_id] would keep
# only the last section of every article.)
counts_by_article = {}  # article_id -> {word_id: count}

for section_article_id, section_text in zip(dataset['ARTICLE_ID'], dataset['SECTION_TEXT']):
    # pandas reads an empty CSV field as NaN (a float); such a section
    # contributes no words, so skip it rather than failing on .split().
    if not isinstance(section_text, str):
        continue

    # Reuse the article's running counts if an earlier section created them.
    word_count = counts_by_article.setdefault(section_article_id, {})
    for word in section_text.split():
        word_id = word_to_id[word]
        word_count[word_id] = word_count.get(word_id, 0) + 1

# Same shape as before for downstream cells:
# article_id -> list of (word_id, count) tuples.
article_word_counts = {
    article_id: list(word_count.items())
    for article_id, word_count in counts_by_article.items()
}

# saving term frequencies (raw counts) to a csv file, one row per (article, word)
article_word_counts_df = pd.DataFrame(
    [
        (article_id, word_id, count)
        for article_id, word_counts in article_word_counts.items()
        for word_id, count in word_counts
    ],
    columns=['article_id', 'word_id', 'frequency'],
)
article_word_counts_df.to_csv('./Dataset/term_frequencies.csv', index=False)

### IDF:

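For a given word, the Inverse Document Frequency (IDF) is the logarithm of the total number of documents divided by the number of documents containing the word. Note: the cell below computes and stores only the document frequency (the number of articles that contain each word); no logarithm is applied, and the weighting step later divides the term frequency by this value.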

In [15]:
# Document frequency: for every word ID, the number of articles whose
# (word_id, count) list contains it. Each article lists a word ID at most once,
# so every tuple seen adds exactly one to that word's tally.
word_frequency = {}

for word_counts in article_word_counts.values():
    for word_id, occurrences in word_counts:
        # The per-article count itself is irrelevant here; only presence matters.
        word_frequency[word_id] = word_frequency.get(word_id, 0) + 1

# Persist the tallies as a two-column CSV (word_id, frequency).
word_frequency_df = pd.DataFrame(
    list(word_frequency.items()),
    columns=['word_id', 'frequency'],
)
word_frequency_df.to_csv('./Dataset/inverset_doc_frequencies.csv', index=False)

#### Calculating the TF-IDF Weights:

In [20]:
# load the tf and idf files
tf_df = pd.read_csv("./Dataset/term_frequencies.csv")
idf_df = pd.read_csv("./Dataset/inverset_doc_frequencies.csv")

# Build a word_id -> document-frequency lookup once, instead of scanning the
# whole idf table for every tf row (which is quadratic in the table sizes).
# drop_duplicates keeps the first row per word_id, i.e. the same row that a
# boolean-mask lookup followed by .values[0] would select.
doc_frequency = (
    idf_df
    .drop_duplicates(subset='word_id')
    .set_index('word_id')['frequency']
)

# A word ID present in the tf file but absent from the idf file is an error;
# fail loudly (IndexError, as an empty mask lookup would) rather than emit NaN.
unknown_ids = tf_df.loc[~tf_df['word_id'].isin(doc_frequency.index), 'word_id']
if not unknown_ids.empty:
    raise IndexError(
        f"word_id values missing from the idf file: {sorted(unknown_ids.unique())[:10]}"
    )

# Weight = term frequency divided by the stored per-word frequency value,
# computed for all rows at once. Row order follows tf_df.
tf_idf_df = pd.DataFrame({
    'article_id': tf_df['article_id'],
    'word_id': tf_df['word_id'],
    'tf_idf': tf_df['frequency'] / tf_df['word_id'].map(doc_frequency),
})

# Keep `weights` available as a list of (article_id, word_id, tf_idf) tuples
# for any later cell that reads it.
weights = list(tf_idf_df.itertuples(index=False, name=None))

# save the tf_idf values to a csv file
tf_idf_df.to_csv('./Dataset/weights.csv', index=False)