In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import nltk
import re
import os
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
STOPWORDS = stopwords.words('english')
LEMMATIZER = WordNetLemmatizer()

In [5]:
def load_text_files(folder_path):
    data = []
    filenames = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            filenames.append(filename)
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data.append(file.read())
    return filenames, data

In [6]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\d+", "", text)
    tokens = word_tokenize(text)
    cleaned_tokens = [LEMMATIZER.lemmatize(word) for word in tokens if word not in STOPWORDS]
    return cleaned_tokens


In [7]:
def term_frequency(term, document):
    return document.count(term) / len(document)

In [8]:
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

In [9]:
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

In [10]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

In [11]:
def main():
    folder_path = '/content/drive/MyDrive/dataset/documents'

    filenames, documents = load_text_files(folder_path)

    tokenized_docs = [clean_text(doc) for doc in documents]

    vocab = sorted(set([word for doc in tokenized_docs for word in doc]))

    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]

    queries = ["Harris' key economic plans",
               "Trump immigration plan this time around",
               "the big moments from the Harris vs. Trump debate"
               ]
    cleaned_queries = [clean_text(query) for query in queries]

    query_tfidf_vectors = [compute_tfidf(query, tokenized_docs, vocab) for query in cleaned_queries]

    cosine_similarities = []
    for query_vector in query_tfidf_vectors:
        similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
        cosine_similarities.append(similarities)

    output_file = 'output.txt'
    with open(output_file, 'w') as file:
        for i, query in enumerate(queries):
            file.write("\n")
            file.write("---------------------------------------------------------\n")
            file.write(f"Cosine similarities for query \"{query}\":\n")
            file.write("-------------------------------------------------------------\n")
            doc_similarities = [(filenames[j], cosine_similarities[i][j]) for j in range(len(documents))]
            sorted_similarities = sorted(doc_similarities, key=lambda x: x[1], reverse=True)
            for filename, similarity in sorted_similarities:
                file.write(f"{filename}: {similarity:.4f}\n")

    with open(output_file, 'r') as file:
        print(file.read())

if __name__ == "__main__":
    main()


---------------------------------------------------------
Cosine similarities for query "Harris' key economic plans":
-------------------------------------------------------------
Trump-Harris debate sums up policy-light US election.txt: 0.0699
US election latest.txt: 0.0304
2024 US election Kamala Harris's transformation.txt: 0.0223
The Harris–Trump debate showed US foreign policy matters in this election.txt: 0.0162
Mounting North Korean threats await next US president.txt: 0.0153
Arm the public with facts Microsoft billionaire fights US election disinformation.txt: 0.0098
US election polls Who is ahead - Harris or Trump.txt: 0.0035
Fears mount that election deniers could disrupt vote count in US swing.txt: 0.0013
Pope Francis tells US Catholics to choose ‘lesser evil’ in coming election.txt: 0.0009
State of the Union Germany's border politics and US election campaign fever.txt: 0.0000

---------------------------------------------------------
Cosine similarities for query "Trump im