In [41]:
import pandas as pd

# File path to your CSV file
file_path = r'D:\E folder\big data lab\dataset.csv'  # Modify this to match the uploaded file name

# Load the data
data = pd.read_csv(file_path, nrows=300)  # Load only the first 500 rows

# Print the loaded data
print(data)


     ARTICLE_ID         TITLE                 SECTION_TITLE  \
0             0     Anarchism                  Introduction   
1             0     Anarchism     Etymology and terminology   
2             0     Anarchism                       History   
3             0     Anarchism  Anarchist schools of thought   
4             0     Anarchism   Internal issues and debates   
..          ...           ...                           ...   
295          28        Apollo                      See also   
296          28        Apollo                         Notes   
297          28        Apollo                    References   
298          28        Apollo                External links   
299          29  Andre Agassi                  Introduction   

                                          SECTION_TEXT  
0    \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1    \n\nThe term ''anarchism'' is a compound word ...  
2    \n\n===Origins===\nWoodcut from a Diggers docu...  
3    \nPortrait

In [34]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from textblob import TextBlob
import html.parser

In [35]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Keep only the 'ARTICLE_ID' and 'SECTION_TEXT' columns
data = data[['ARTICLE_ID', 'SECTION_TEXT']]

# Function to remove HTML tags from text
def clean_text(text):
    # Remove HTML tags
    clean_text = re.sub(r'<.*?>', '', text)
    # Convert text to lowercase
    clean_text = clean_text.lower()
    return clean_text

# Apply basic text cleaning to 'SECTION_TEXT' column
data['cleaned_text'] = data['SECTION_TEXT'].apply(clean_text)

# Tokenize each entry in the 'cleaned_text' column
data['tokens'] = data['cleaned_text'].apply(word_tokenize)

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from a list of tokens
def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

# Remove stopwords from the tokens
data['tokens_without_stopwords'] = data['tokens'].apply(remove_stopwords)

# Initialize PorterStemmer for stemming
stemmer = PorterStemmer()

# Initialize WordNetLemmatizer for lemmatization
lemmatizer = WordNetLemmatizer()

# Function to perform stemming on a list of tokens
def stem_tokens(tokens):
    return [stemmer.stem(token) for token in tokens]

# Function to perform lemmatization on a list of tokens
def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

# Apply stemming to the tokens without stopwords
data['stemmed_tokens'] = data['tokens_without_stopwords'].apply(stem_tokens)

# Apply lemmatization to the tokens without stopwords
data['lemmatized_tokens'] = data['tokens_without_stopwords'].apply(lemmatize_tokens)

# Print the preprocessed data
print(data[['ARTICLE_ID', 'lemmatized_tokens']])
print(data[['SECTION_TEXT','lemmatized_tokens']])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


     ARTICLE_ID                                  lemmatized_tokens
0             0  ['', 'anarchism, '', ', political, philosophy,...
1             0  [term, ``, anarchism, '', compound, word, comp...
2             0  [===origins===, woodcut, digger, document, wil...
3             0  [portrait, philosopher, pierre-joseph, proudho...
4             0  [consistent, anarchist, value, controversial, ...
..          ...                                                ...
995          89  [actinopterygians, divided, subclass, chondros...
996          89  [*, *, ``, actinopterygii, '', untamedscience....
997          90  ['', 'albert, einstein, '', ', (, ;, ;, 14, ma...
998          90  [===, early, life, education, ===, einstein, a...
999          90  [throughout, life, einstein, published, hundre...

[1000 rows x 2 columns]
                                          SECTION_TEXT  \
0    \n\n\n\n\n\n'''Anarchism''' is a political phi...   
1    \n\nThe term ''anarchism'' is a compound word ... 

In [36]:
# File path for the new file
output_file_path = 'preprocessed_data.csv'  # Specify the desired file path

# Save the preprocessed data to a new CSV file
data.to_csv(output_file_path, index=False)

print("Preprocessed data saved successfully to:", output_file_path)


Preprocessed data saved successfully to: preprocessed_data.csv


# MAPPER 1

In [43]:
from collections import defaultdict
import re

def custom_mapper(document):
    try:
        doc_id, text = document.strip().split('\t')  # Assuming the document format is "<docID>\t<text>"
        words = re.findall(r'\b\w+\b', text.lower())  # Tokenize the document and convert to lowercase
        word_count = defaultdict(int)  # Dictionary to store word counts for the document
        for word in words:
            word_count[word] += 1
        # Emit intermediate key-value pairs of the form (word, (docID, count))
        for word, count in word_count.items():
            yield word, (doc_id, count)
    except ValueError:
        # Handle cases where the line doesn't have the expected format
        print("Skipping line:", document)


def custom_reducer(intermediate):
    word_doc_count = defaultdict(dict)  # Dictionary to store word counts for each document
    for word, (doc_id, count) in intermediate:
        if doc_id not in word_doc_count[word]:
            word_doc_count[word][doc_id] = 0
        word_doc_count[word][doc_id] += count

    # Calculate TF and emit intermediate key-value pairs of the form (word, (docID, TF))
    for word, doc_counts in word_doc_count.items():
        total_words_in_doc = sum(doc_counts.values())
        for doc_id, count in doc_counts.items():
            tf = count / total_words_in_doc
            yield word, (doc_id, tf)


In [15]:
# Load data from a file
file_path = 'preprocessed_data.csv'  # Specify the path to your file
with open(file_path, 'r', encoding='utf-8') as file:
    documents = file.readlines()

# Apply mapper function to each document
intermediate_results = []
for document in documents:
    intermediate_results.extend(mapper(document))

# Apply reducer function to aggregate results
final_results = reducer(intermediate_results)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Skipping line: The original title was ''Animal Farm: A Fairy Story;'' U.S. publishers dropped the subtitle when it was published in 1946 and only one of the translations during Orwell's lifetime kept it. Other titular variations include subtitles like ""''A Satire''"" and ""''A Contemporary Satire''"". Orwell suggested the title ''Union des républiques socialistes animales'' for the French translation which abbreviates to URSA the Latin word for ""bear"" a symbol of Russia. It also played on the French name of the Soviet Union ''Union des républiques socialistes soviétiques''.

Skipping line: 

Skipping line: Orwell wrote the book between November 1943 and February 1944 when the UK was in its wartime alliance with the Soviet Union and the British people and intelligentsia held Stalin in high esteem a phenomenon Orwell hated. The manuscript was initially rejected by a number of British and American publishers including one of Orwell's own Victor Gollancz which delayed its publication. I

# MAPPER 2

In [47]:
from collections import defaultdict
import math

def calculate_idf(documents):
    # Count the number of documents containing each term
    document_freq = defaultdict(int)
    for doc in documents:
        unique_words = set(doc)
        for word in unique_words:
            document_freq[word] += 1

    # Calculate IDF for each term
    idf = {}
    total_docs = len(documents)
    for word, freq in document_freq.items():
        idf[word] = math.log10(total_docs / freq)
    
    return idf

def second_mapper(word, docID, tf, idf):
    # Emit intermediate key-value pairs of the form (word, (docID, TF, IDF))
    yield word, (docID, tf, idf)

def second_reducer(intermediate):
    index_entries = defaultdict(list)
    # Aggregate the TF/IDF scores for each term in each document
    for word, (docID, tf, idf) in intermediate:
        index_entries[word].append((docID, tf * idf))

    # Output final index entries for each term, containing the TF/IDF scores and document IDs
    for word, entries in index_entries.items():
        yield word, entries

# Load preprocessed data
data_path = 'preprocessed_data.csv'  # Change the file path if needed
data = pd.read_csv(data_path)

# Extract documents as list of lists of tokens
documents = data['lemmatized_tokens'].apply(lambda x: x.split()).tolist()

# Calculate IDF values
idf_values = calculate_idf(documents)

# Apply second mapper function to each document
second_intermediate_results = []
for doc_id, tokens in enumerate(documents, start=1):
    for token in set(tokens):
        tf = tokens.count(token) / len(tokens)
        idf = idf_values[token]
        second_intermediate_results.extend(second_mapper(token, doc_id, tf, idf))

# Apply second reducer function to aggregate results
final_index_entries = list(second_reducer(second_intermediate_results))
for word, entries in final_index_entries:
    print(f"Word: {word}")
    for docID, tfidf in entries:
        print(f"  DocID: {docID}, TF-IDF: {tfidf}")


Word: '.']
  DocID: 1, TF-IDF: 0.0019902015570355714
  DocID: 2, TF-IDF: 0.0011372580325917552
  DocID: 3, TF-IDF: 6.837222640832194e-05
  DocID: 4, TF-IDF: 0.0001260951356094343
  DocID: 5, TF-IDF: 0.0016937885591792096
  DocID: 6, TF-IDF: 0.00023692875678994895
  DocID: 7, TF-IDF: 0.01592161245628457
  DocID: 10, TF-IDF: 0.00064547077525478
  DocID: 12, TF-IDF: 0.0009221011075068285
  DocID: 13, TF-IDF: 0.000275778506748578
  DocID: 14, TF-IDF: 0.0006015722590535732
  DocID: 15, TF-IDF: 0.00023254545943940464
  DocID: 16, TF-IDF: 0.00029303581207885715
  DocID: 17, TF-IDF: 0.001085564485655766
  DocID: 18, TF-IDF: 0.014926511677766785
  DocID: 19, TF-IDF: 0.000338757711835842
  DocID: 20, TF-IDF: 0.0018371091295712968
  DocID: 21, TF-IDF: 0.0020588291969333497
  DocID: 22, TF-IDF: 0.00047764837368853715
  DocID: 23, TF-IDF: 0.00046645348993021204
  DocID: 27, TF-IDF: 0.0009365654386049747
  DocID: 28, TF-IDF: 0.0007416900212554924
  DocID: 29, TF-IDF: 0.001020616183095165
  DocID: 30

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Word: 'pollard',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'բ',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'tibetan',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'indic',
  DocID: 703, TF-IDF: 0.008390580324360679
  DocID: 868, TF-IDF: 0.0009663336929237447
Word: 'maštoc',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'directionality',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'mashtots',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'reintroduced',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'hindi',
  DocID: 703, TF-IDF: 0.0055937202162404534
  DocID: 971, TF-IDF: 0.002690897312398822
Word: 'simplification',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'syllabic',
  DocID: 703, TF-IDF: 0.008390580324360679
  DocID: 806, TF-IDF: 0.006974082698542684
Word: 'mashdots',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'ե+ւ',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'clue',
  DocID: 703, TF-IDF: 0.0031088082901554403
Word: 'ա',
  D

# MAPPER 3

In [48]:
import pandas as pd

# Load the CSV file containing queries
queries_file_path = 'preprocessed_data.csv'
queries_data = pd.read_csv(queries_file_path)

# Print the column names to identify the correct column containing the queries
print("Column Names:", queries_data.columns)

# Define the tokenize_query function
def tokenize_query(query):
    # Tokenize the query here (replace this with your actual tokenization logic)
    return query.split()
# Apply tokenize_query function to each query in queries_data['tokens']
tokenized_queries = queries_data['tokens'].apply(tokenize_query)
print(tokenized_queries)



Column Names: Index(['ARTICLE_ID', 'SECTION_TEXT', 'cleaned_text', 'tokens',
       'tokens_without_stopwords', 'stemmed_tokens', 'lemmatized_tokens'],
      dtype='object')
0      [["''",, "'anarchism",, "''",, "'",, 'is',, 'a...
1      [['the',, 'term',, '``',, 'anarchism',, "''",,...
2      [['===origins===',, 'woodcut',, 'from',, 'a',,...
3      [['portrait',, 'of',, 'philosopher',, 'pierre-...
4      [['consistent',, 'with',, 'anarchist',, 'value...
                             ...                        
995    [['actinopterygians',, 'are',, 'divided',, 'in...
996    [['*',, '*',, '``',, 'actinopterygii',, "''",,...
997    [["''",, "'albert",, 'einstein',, "''",, "'",,...
998    [['===',, 'early',, 'life',, 'and',, 'educatio...
999    [['throughout',, 'his',, 'life',, 'einstein',,...
Name: tokens, Length: 1000, dtype: object


# reducer 3

In [52]:
import pandas as pd
import math
from collections import Counter

# Load preprocessed data
preprocessed_data_path = 'preprocessed_data.csv'
preprocessed_data = pd.read_csv(preprocessed_data_path)

# Function to calculate Term Frequency (TF) for each document
def calculate_term_frequency(document_id):
    tokens = preprocessed_data.loc[preprocessed_data['ARTICLE_ID'] == document_id, 'lemmatized_tokens'].iloc[0]
    term_freq = Counter(tokens)
    total_terms = len(tokens)
    tf = {term: freq / total_terms for term, freq in term_freq.items()}
    return tf

# Function to calculate Inverse Document Frequency (IDF) for each term
def calculate_inverse_document_frequency():
    doc_freq = Counter()
    for tokens in preprocessed_data['lemmatized_tokens']:
        doc_freq.update(set(tokens))
    num_documents = len(preprocessed_data)
    idf = {term: math.log(num_documents / (freq + 1)) for term, freq in doc_freq.items()}
    return idf

# Calculate TF for each document
document_term_frequencies = {}
for doc_id in preprocessed_data['ARTICLE_ID'].unique():
    document_term_frequencies[doc_id] = calculate_term_frequency(doc_id)

# Calculate IDF
term_idf = calculate_inverse_document_frequency()

# Function to calculate TF-IDF for each term in each document
def calculate_tfidf(document_id, term):
    tf = document_term_frequencies[document_id].get(term, 0)
    idf = term_idf.get(term, 0)
    return tf * idf

# Calculate TF-IDF for each term in each document
document_tfidf = {}
for doc_id, tf in document_term_frequencies.items():
    document_tfidf[doc_id] = {term: calculate_tfidf(doc_id, term) for term in tf.keys()}

# Display TF-IDF values for each document and term
for doc_id, tfidf_values in document_tfidf.items():
    print(f"Document {doc_id} TF-IDF:")
    for term, tfidf in tfidf_values.items():
        print(f"  {term}: {tfidf}")

# Display IDF values for each term
print("IDF:")
for term, idf in term_idf.items():
    print(f"  {term}: {idf}")


Document 0 TF-IDF:
  [: -7.164876939666662e-07
  ": 0.002136281049281597
  ': 0.016277866654844923
  ,: 0.008798387320015838
   : 0.008798387320015838
  a: 0.007809037841309092
  n: 0.006133793948372539
  r: 0.004690349991829108
  c: 0.0055099397266406185
  h: 0.004985866405163677
  i: 0.009585167019799506
  s: 0.006757473249653589
  m: 0.0037280327913468084
  p: 0.002245604459221826
  o: 0.005636616923396214
  l: 0.005717861980476114
  t: 0.006757880453664601
  y: 0.003091274155104702
  d: 0.003872380686835885
  v: 0.0019088080094602257
  e: 0.006528751236051201
  f: 0.0032970320163403896
  -: 0.0018309162142150204
  g: 0.001810953243279175
  b: 0.0007244307674628465
  u: 0.0034444892936090618
  .: 0.001986353417866454
  x: 0.002161405681510949
  w: 0.0007457115567564376
  ]: -7.164876939666662e-07
Document 1 TF-IDF:
  [: -3.8324399274674055e-07
  ": 0.0014283512575478468
  ': 0.01884058480613459
  ,: 0.010203341948326094
   : 0.010203341948326094
  a: 0.005787405276129108
  u: 0.0033

In [55]:
import pandas as pd
import math
from collections import Counter

# Load preprocessed data
preprocessed_data_path = 'preprocessed_data.csv'
preprocessed_data = pd.read_csv(preprocessed_data_path)

# Function to calculate Inverse Document Frequency (IDF) for each term
def calculate_idf(documents):
    doc_freq = Counter()
    for tokens in documents:
        doc_freq.update(set(tokens))
    num_documents = len(documents)
    idf = {term: math.log(num_documents / (freq + 1)) for term, freq in doc_freq.items()}
    return idf

# Calculate IDF
documents = preprocessed_data['tokens'].apply(lambda tokens: tokens.split()).tolist()
term_idf = calculate_idf(documents)

# Add IDF values to the preprocessed_data DataFrame
preprocessed_data['idf'] = preprocessed_data['tokens'].apply(lambda tokens: {term: term_idf[term] for term in set(tokens.split())})

# Check the DataFrame to verify the 'idf' column
print(preprocessed_data.head())

# Now extract term frequencies
def extract_term_frequencies(document_id):
    document_row = preprocessed_data[preprocessed_data['ARTICLE_ID'] == document_id]
    term_frequencies = dict(zip(document_row['tokens'], document_row['idf']))
    return term_frequencies

# Represent each document with its term frequencies
document_term_frequencies = {}
for doc_id in preprocessed_data['ARTICLE_ID'].unique():
    document_term_frequencies[doc_id] = extract_term_frequencies(doc_id)

# Display the term frequencies for each document
for doc_id, term_freqs in document_term_frequencies.items():
    print(f"Document {doc_id} Term Frequencies:")
    print(term_freqs)

# Display only the first 5 rows
print(preprocessed_data.head())

# Display only the first 10 rows
print(preprocessed_data.head(10))


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



   ARTICLE_ID                                       SECTION_TEXT  \
0           0  \n\n\n\n\n\n'''Anarchism''' is a political phi...   
1           0  \n\nThe term ''anarchism'' is a compound word ...   
2           0  \n\n===Origins===\nWoodcut from a Diggers docu...   
3           0  \nPortrait of philosopher Pierre-Joseph Proudh...   
4           0  \nconsistent with anarchist values is a contro...   

                                        cleaned_text  \
0  \n\n\n\n\n\n'''anarchism''' is a political phi...   
1  \n\nthe term ''anarchism'' is a compound word ...   
2  \n\n===origins===\nwoodcut from a diggers docu...   
3  \nportrait of philosopher pierre-joseph proudh...   
4  \nconsistent with anarchist values is a contro...   

                                              tokens  \
0  ["''", "'anarchism", "''", "'", 'is', 'a', 'po...   
1  ['the', 'term', '``', 'anarchism', "''", 'is',...   
2  ['===origins===', 'woodcut', 'from', 'a', 'dig...   
3  ['portrait', 'of', 'philoso