In [1]:
import os
import nltk
import math
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict

# Fetch the NLTK resources used below; each call is a no-op once the data is cached.
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' tables (not the pickled 'punkt' models)
# inside word_tokenize, so a fresh kernel needs this resource as well.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Reading Dataset

In [2]:

def read_documents(corpusroot):
    """Read every ``.txt`` file located directly inside ``corpusroot``.

    Files are decoded as windows-1252 and lower-cased. A file that cannot be
    read is reported with ``print`` and skipped, so one bad file does not
    abort the whole load.

    Args:
        corpusroot: Path of the directory that holds the corpus files.

    Returns:
        A ``(documents, names)`` tuple of two parallel lists: the lower-cased
        text of each file and the corresponding file name, ordered by file
        name.
    """
    documents = []
    names = []

    # os.listdir() yields entries in arbitrary order; sorting keeps document
    # indices (and therefore tie-breaking between equal scores) reproducible.
    for filename in sorted(os.listdir(corpusroot)):
        if not filename.endswith(".txt"):  # Only plain-text files are part of the corpus
            continue

        file_path = os.path.join(corpusroot, filename)
        try:
            with open(file_path, "r", encoding='windows-1252') as file:
                doc = file.read().lower()  # Convert to lowercase
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {file_path}: {e}")
            continue

        documents.append(doc)
        names.append(filename)

    return documents, names


Tokenizing Corpus

In [3]:
def tokenize(corpus):
    """Split each document of ``corpus`` into tokens with ``nltk.word_tokenize``.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A list with one token list per document, in input order.
    """
    return [nltk.word_tokenize(text) for text in corpus]

Stop-Word Removal from Corpus

In [4]:
# NLTK's English stop-word list, stored as a set for the `not in stop_words`
# membership tests in filterstopwords() and compute_query_vector().
stop_words = set(stopwords.words('english'))

def filterstopwords(tokenized_corpus):
    """Drop stop words and non-alphanumeric tokens from every document.

    Args:
        tokenized_corpus: Iterable of token lists, one per document.

    Returns:
        A list of token lists in the same order, keeping only tokens for
        which ``str.isalnum()`` is true and that are not in ``stop_words``.
    """
    def keep(token):
        # Punctuation / mixed-symbol tokens fail isalnum(); stop words fail the set test.
        return token.isalnum() and token not in stop_words

    return [[token for token in tokens if keep(token)] for tokens in tokenized_corpus]

Stemming


In [5]:
# Single Porter stemmer instance shared by stemming(), getidf(), getweight()
# and compute_query_vector(), so documents and queries are stemmed the same way.
stemmer = PorterStemmer()

def stemming(cleaned_corpus):
    """Stem every token of every document with the module-level ``stemmer``.

    Args:
        cleaned_corpus: Iterable of token lists, one per document.

    Returns:
        A list of stemmed-token lists, in the same order as the input.
    """
    return [list(map(stemmer.stem, tokens)) for tokens in cleaned_corpus]

Preprocessing the Documents

In [6]:
# Load and process all documents
# Directory holding the corpus; read_documents() loads the .txt files found in it.
files = './US_Inaugural_Addresses'
# Placeholders only: both names are rebound by the preprocessing(files) call
# at the end of this cell.
corpus = []
doc_names = []

def preprocessing(files):
    """Run the full text pipeline on the corpus directory ``files``.

    The stages are applied in order: read_documents -> tokenize ->
    filterstopwords -> stemming.

    Args:
        files: Path of the corpus directory, passed to ``read_documents``.

    Returns:
        A ``(stemmed_corpus, names)`` tuple: the processed token lists and
        the file names returned by ``read_documents``.
    """
    raw_texts, names = read_documents(files)
    stemmed_corpus = stemming(filterstopwords(tokenize(raw_texts)))
    return stemmed_corpus, names


# corpus: one list of stemmed tokens per document; doc_names: the matching
# file names, in the same order.
corpus, doc_names = preprocessing(files)


Computing TF*IDF

In [7]:

# 1. Compute Document Frequency (DF)
def df(corpus):
    """Count, for every term, the number of documents that contain it.

    Args:
        corpus: Iterable of token lists, one per document.

    Returns:
        A ``defaultdict(int)`` mapping term -> document frequency. Repeated
        occurrences inside one document count once.
    """
    doc_counts = defaultdict(int)
    for tokens in corpus:
        # set() collapses repeats so each document contributes at most 1 per term.
        for term in set(tokens):
            doc_counts[term] += 1
    return doc_counts

# Document-frequency table for the whole corpus, computed once and reused by
# the TF-IDF functions and by getidf().
df_dict = df(corpus)
# Number of documents in the corpus; numerator of the N / df ratio in idf().
N = len(corpus)

# 2. Function to calculate IDF for a term
def idf(term, df_dict, N):
    """Return the inverse document frequency ``log10(N / df)`` of ``term``.

    Args:
        term: Term to look up (already normalised by the caller).
        df_dict: Mapping term -> document frequency.
        N: Total number of documents.

    Returns:
        ``log10(N / df_dict[term])`` when the term is in ``df_dict``,
        otherwise ``0.0``.
    """
    if term not in df_dict:
        return 0.0
    return math.log10(N / df_dict[term])

# 3. Compute Term Frequency (TF)
def tf(token_list):
    """Count raw term occurrences in a single document.

    Args:
        token_list: Iterable of tokens of one document.

    Returns:
        A ``defaultdict(int)`` mapping term -> number of occurrences.
    """
    counts = defaultdict(int)
    for term in token_list:
        counts[term] = counts[term] + 1
    return counts

# 4. Compute TF-IDF for a document
def tfidf(tf_dict, df_dict, N):
    """Compute un-normalised TF-IDF weights for one document.

    Each weight is ``(1 + log10(count)) * idf(term)``.

    Args:
        tf_dict: Mapping term -> raw count in the document.
        df_dict: Mapping term -> document frequency.
        N: Total number of documents.

    Returns:
        A dict mapping term -> TF-IDF weight, in the order of ``tf_dict``.
    """
    weights = {}
    for term, count in tf_dict.items():
        # `count` (not `tf`) so the module-level tf() function is not shadowed here.
        log_tf = 1 + math.log10(count)
        weights[term] = log_tf * idf(term, df_dict, N)
    return weights

# 5. Compute and normalize TF-IDF for the entire corpus
def tfidf_corpus(corpus, df_dict, N):
    """Build an L2-normalised TF-IDF vector for every document.

    Args:
        corpus: Iterable of token lists, one per document.
        df_dict: Mapping term -> document frequency.
        N: Total number of documents.

    Returns:
        A list with one dict (term -> normalised weight) per document, in
        corpus order. A document whose weights are all zero (for example when
        every one of its terms occurs in all documents, so each IDF is 0)
        keeps its zero weights instead of triggering a division by zero.
    """
    corpus_tfidf = []
    for doc_tokens in corpus:
        tfidf_vector = tfidf(tf(doc_tokens), df_dict, N)

        # The Euclidean length is the same for every term of the document,
        # so compute it once rather than once per key.
        norm = math.sqrt(sum(weight ** 2 for weight in tfidf_vector.values()))

        if norm == 0:
            # Nothing to scale: the vector is empty or all zeros.
            norm_tfidf = dict(tfidf_vector)
        else:
            norm_tfidf = {term: weight / norm for term, weight in tfidf_vector.items()}
        corpus_tfidf.append(norm_tfidf)
    return corpus_tfidf

# One normalised TF-IDF vector (dict term -> weight) per document, in the same
# order as `corpus` / `doc_names`; read by getweight(), posting_list() and query().
corpus_tfidf = tfidf_corpus(corpus, df_dict, N)

# 6. Function to get IDF of a term
def getidf(term):
    """Return the corpus IDF of ``term``.

    The term is lower-cased and stemmed before the lookup in the
    module-level ``df_dict`` / ``N``.

    Args:
        term: Raw (un-stemmed) word.

    Returns:
        The value of ``idf`` for the stemmed term (``0.0`` when the stem is
        not in the corpus).
    """
    normalized = term.lower()
    return idf(stemmer.stem(normalized), df_dict, N)

# 7. Function to get TF-IDF weight of a term in a specific document
def getweight(doc_name, term):
    """Return the normalised TF-IDF weight of ``term`` in document ``doc_name``.

    Args:
        doc_name: File name of the document, as stored in ``doc_names``.
        term: Raw (un-stemmed) word; it is lower-cased and stemmed here.

    Returns:
        The weight stored in the document's TF-IDF vector, or ``0.0`` when
        the stemmed term does not occur in that document.

    Raises:
        ValueError: If ``doc_name`` is not in ``doc_names``.
    """
    try:
        # Position in doc_names doubles as the index into corpus_tfidf.
        position = doc_names.index(doc_name)
    except ValueError:
        raise ValueError(f"Document '{doc_name}' not found in doc_names list.")

    stem = stemmer.stem(term.lower())
    return corpus_tfidf[position].get(stem, 0.0)



Query Vector

In [8]:

def compute_query_vector(qstring):
    """Turn a free-text query into a unit-length term-weight vector.

    The query is lower-cased, tokenised, stripped of stop words and
    non-alphanumeric tokens, and stemmed. Each remaining term gets the
    weight ``1 + log10(count)`` (no IDF factor), and the vector is divided
    by its Euclidean length.

    Args:
        qstring: Query text.

    Returns:
        A dict mapping stemmed term -> normalised weight; empty when no
        token survives the filtering.
    """
    raw_tokens = nltk.word_tokenize(qstring.lower())
    kept_tokens = [tok for tok in raw_tokens if tok.isalnum() and tok not in stop_words]
    term_counts = tf([stemmer.stem(tok) for tok in kept_tokens])

    weights = {}
    for term, count in term_counts.items():
        weights[term] = 1 + math.log10(count)

    length = math.sqrt(sum(w ** 2 for w in weights.values()))
    # With no surviving terms the comprehension is empty, so `length` (0) is never used.
    return {term: w / length for term, w in weights.items()}


Cosine similarity

In [9]:

def cosine_similarity(vec1, vec2):
    """Cosine of the angle between two sparse vectors stored as dicts.

    Args:
        vec1: Mapping term -> weight.
        vec2: Mapping term -> weight.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero length.
    """
    def magnitude(vec):
        return math.sqrt(sum(component ** 2 for component in vec.values()))

    # Terms missing from vec2 contribute 0, so iterating vec1 alone is enough.
    dot_product = sum(weight * vec2.get(term, 0) for term, weight in vec1.items())
    length1 = magnitude(vec1)
    length2 = magnitude(vec2)

    if length1 == 0 or length2 == 0:
        return 0.0
    return dot_product / (length1 * length2)

Building posting list


In [10]:
def posting_list(token):
    """Collect the postings of ``token`` from the module-level ``corpus_tfidf``.

    Args:
        token: Stemmed term to look up.

    Returns:
        A list of ``(doc_id, weight)`` tuples, one for every document whose
        TF-IDF vector contains ``token``, in ascending ``doc_id`` order.
    """
    return [
        (doc_id, weights[token])
        for doc_id, weights in enumerate(corpus_tfidf)
        if token in weights
    ]


Query function

In [11]:

def query(qstring, posting_limit=10):
    """Return the best-scoring document for ``qstring`` and its score.

    For every query term, the ``posting_limit`` documents with the highest
    weight for that term are taken from its posting list. Each of those
    documents has ``cosine_similarity(query, doc) * query_term_weight`` added
    to its running score; the document with the highest total wins.

    NOTE(review): a document that appears in the top postings of several
    query terms accumulates the same cosine value once per term, so the
    returned score is not a plain cosine similarity - confirm this is the
    intended ranking formula.

    Args:
        qstring: Free-text query.
        posting_limit: Maximum number of postings examined per query term.

    Returns:
        ``(document name, score)`` for the winner, or
        ``("None", 0.0)`` when no query term occurs in the corpus, or
        ``("fetch more", 0.0)`` when none of the examined documents has a
        positive similarity to the query.
    """
    query_vector = compute_query_vector(qstring)
    scores = defaultdict(float)

    any_token_in_corpus = False
    any_positive_similarity = False

    for token, q_weight in query_vector.items():
        postings = posting_list(token)
        if not postings:
            # Term is absent from the corpus: it contributes nothing.
            continue
        any_token_in_corpus = True

        # Highest document weight first, truncated to the per-term limit.
        ranked = sorted(postings, key=lambda posting: -posting[1])
        for doc_id, _ in ranked[:posting_limit]:
            similarity = cosine_similarity(query_vector, corpus_tfidf[doc_id])
            scores[doc_id] += similarity * q_weight
            if similarity > 0:
                any_positive_similarity = True

    if not any_token_in_corpus:
        return "None", 0.0

    if not any_positive_similarity:
        return "fetch more", 0.0

    # A positive similarity implies at least one scored document, so max() is safe.
    best_doc = max(scores, key=scores.get)
    return doc_names[best_doc], scores[best_doc]

In [12]:
# IDF of a few sample terms.
for term in ('democracy', 'foreign', 'states', 'honor', 'great'):
    print("%.12f" % getidf(term))
print("--------------")

# Normalised TF-IDF weight of a term inside one specific address.
for doc_name, term in (
    ('19_lincoln_1861.txt', 'constitution'),
    ('23_hayes_1877.txt', 'public'),
    ('25_cleveland_1885.txt', 'citizen'),
    ('09_monroe_1821.txt', 'revenue'),
    ('37_roosevelt_franklin_1933.txt', 'leadership'),
):
    print("%.12f" % getweight(doc_name, term))
print("--------------")

# Best-matching document and score for each sample query.
for qstring in (
    "states laws",
    "war offenses",
    "british war",
    "texas government",
    "world civilization",
):
    print("(%s, %.12f)" % query(qstring))

0.698970004336
0.187086643357
0.057991946978
0.139661993429
0.033858267261
--------------
0.006540919067
0.008269301122
0.011832819908
0.027778670583
0.077574191456
--------------
(21_grant_1869.txt, 0.025605903046)
(20_lincoln_1865.txt, 0.180140227144)
(07_madison_1813.txt, 0.123636547202)
(15_polk_1845.txt, 0.052967825658)
(22_grant_1873.txt, 0.014834244680)
