In [1]:
# Parsed (article_id, article_text) records, one per non-empty input line
data = []

# Each line of the file is expected to look like "<article_id>,<article_text>"
with open('cleaned_data.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Skip blank lines (e.g. a trailing newline at the end of the file)
            continue
        # Split on the FIRST comma only: the id never contains a comma, and
        # limiting the split keeps the two-value unpacking valid even if the
        # article text itself contains commas.
        article_id, article_text = line.split(',', 1)
        # Store the id as an int alongside the raw article text
        data.append((int(article_id), article_text))

# Show one parsed record as a sanity check
print(data[4])

(4, 'alabama state southeastern region united state bordered tennessee north georgia florida mexico south mississippi alabama extensive populous state nearly alabama nation longest navigable inland waterway alabama nicknamed yellowhammer state state alabama known heart dixie cotton state state longleaf state flower camellia alabama capital montgomery largest population birmingham industrialized largest huntsville oldest mobile founded french colonist capital french louisiana american civil world alabama state southern suffered economic hardship continued dependence agriculture southern state alabama legislator disenfranchised african american white century despite growth major industry urban center white rural interest dominated state legislature urban interest african american markedly underrepresented following world alabama state economy changed primarily based agriculture diversified interest state economy century based management automotive finance manufacturing aerospace mineral 

In [2]:
import pandas as pd
import string
from collections import Counter

In [3]:
# Function to get term frequency of a document
def get_term_frequency(document, term_to_id):
    """Count how often each known term occurs in ``document``.

    Args:
        document: Text whose terms are separated by whitespace.
        term_to_id: Mapping from a term (str) to its id.

    Returns:
        dict mapping term id -> number of occurrences of that term in
        ``document``. Terms that are not present in ``term_to_id`` are
        ignored, so text containing out-of-vocabulary words (for example a
        user query) no longer raises ``KeyError``.
    """
    # Splitting on whitespace and counting the occurrences of each term
    term_frequencies = Counter(document.split())
    # Re-key the counts by term id, dropping terms without an id
    return {
        term_to_id[term]: frequency
        for term, frequency in term_frequencies.items()
        if term in term_to_id
    }

In [4]:
# Function to get tf/idf weights for a document
def get_tf_idf_weight(doc_tf, inverse_doc_freq):
    """Weight each term frequency by the matching ``inverse_doc_freq`` value.

    Args:
        doc_tf: Mapping of term id -> term frequency for one document.
        inverse_doc_freq: Mapping of term id -> value the frequency is
            divided by.
            NOTE(review): because the value is used as a divisor, callers
            presumably pass a document count here rather than an already
            inverted quantity -- confirm against the caller.

    Returns:
        dict of term id -> ``frequency / inverse_doc_freq[term id]`` rounded
        to two decimals. Term ids missing from ``inverse_doc_freq`` are
        left out of the result.
    """
    return {
        term_id: round(frequency / inverse_doc_freq[term_id], 2)
        for term_id, frequency in doc_tf.items()
        if term_id in inverse_doc_freq
    }

In [5]:
# Tabulate the parsed (id, text) tuples, one article per row
column_names = ["ARTICLE_ID", "ARTICLE_TEXT"]
df = pd.DataFrame(data, columns=column_names)
df

Unnamed: 0,ARTICLE_ID,ARTICLE_TEXT
0,0,anarchism political philosophy advocate selfgo...
1,1,autism neurodevelopmental disorder characteriz...
2,2,percentage diffusely reflected sunlight relati...
3,3,writing cursive form named plural first letter...
4,4,alabama state southeastern region united state...
5,5,achilles nereid cymothoe attic redfigure kanth...
6,6,abraham lincoln february april american politi...
7,7,aristotle aristotélēs ancient greek philosophe...


In [6]:
# Inspect the dtype of each column of the articles DataFrame
df.dtypes

ARTICLE_ID       int64
ARTICLE_TEXT    object
dtype: object

In [7]:
# Collect every distinct whitespace-separated token found in any article
vocabulary_set = set()

# Walk the text column directly; each article contributes its tokens in order
for article_text in df['ARTICLE_TEXT']:
    vocabulary_set.update(article_text.split())

print("Vocabulary Set:")
vocabulary_set

Vocabulary Set:


{'penguin',
 'dried',
 'inequality',
 'tendered',
 'adjusted',
 'unexpressed',
 'precedent',
 'stressed',
 'stigma',
 'naming',
 'riley',
 'metabolic',
 'intersecting',
 'capable',
 'squatting',
 'philosophie',
 'livestock',
 'servant',
 'harold',
 'sleeping',
 'injustice',
 'revolutionist',
 'troilus',
 'musicologist',
 'complaint',
 'produced',
 'engraving',
 'influential',
 'irrelevant',
 'ergon',
 'agnostic',
 'combined',
 'lully',
 'baker',
 'autumn',
 'sundry',
 'sentence',
 'enlistment',
 'salem',
 'worldwide',
 'replaces',
 'botched',
 'catch',
 'organization',
 'erroneous',
 'measuring',
 'produce',
 'screening',
 'definitive',
 'refers',
 'light',
 'judgement',
 'confiscation',
 'acted',
 'confirms',
 'freshly',
 'exceedingly',
 'juxtaposition',
 'hinduism',
 'applies',
 'appointment',
 'massachusetts',
 'purchased',
 'potomac',
 'extensive',
 'palatalized',
 'oyster',
 'cosmetic',
 'quantification',
 'success',
 'conventional',
 'change',
 'retaliated',
 'sequence',
 'glacie

In [8]:
# Number of distinct terms in the vocabulary
len(vocabulary_set)

8562

In [9]:
# Give every term an integer id equal to its position in the sorted vocabulary
sorted_terms = sorted(vocabulary_set)
term_to_id = dict(zip(sorted_terms, range(len(sorted_terms))))

# Display the term-to-ID mapping
term_to_id

{'aanother': 0,
 'aardvark': 1,
 'aaron': 2,
 'abandoned': 3,
 'abandoning': 4,
 'abandonment': 5,
 'abate': 6,
 'abbreviated': 7,
 'abbreviation': 8,
 'abdelrahim': 9,
 'aberrant': 10,
 'ability': 11,
 'abjad': 12,
 'ablackletter': 13,
 'abnormal': 14,
 'abnormality': 15,
 'aboard': 16,
 'abolish': 17,
 'abolishing': 18,
 'abolition': 19,
 'abolitionism': 20,
 'abolitionist': 21,
 'abound': 22,
 'abraham': 23,
 'absence': 24,
 'absent': 25,
 'absolute': 26,
 'absorb': 27,
 'absorbed': 28,
 'absorbs': 29,
 'absorption': 30,
 'abstention': 31,
 'abstentionism': 32,
 'abstract': 33,
 'abstraction': 34,
 'absurd': 35,
 'absurdum': 36,
 'abundance': 37,
 'abuse': 38,
 'abydus': 39,
 'academia': 40,
 'academic': 41,
 'academy': 42,
 'accent': 43,
 'accept': 44,
 'acceptance': 45,
 'accepted': 46,
 'accepting': 47,
 'access': 48,
 'accessed': 49,
 'accidental': 50,
 'accidentally': 51,
 'accompanied': 52,
 'accomplished': 53,
 'accord': 54,
 'accordance': 55,
 'according': 56,
 'accordingly'

In [10]:
# Map each article id to its {term id: count} dictionary
document_term_frequency = {
    doc_id: get_term_frequency(document, term_to_id)
    for doc_id, document in zip(df["ARTICLE_ID"], df["ARTICLE_TEXT"])
}

# Show the term frequencies of the document whose id is 0
print(f"Document {0}: {document_term_frequency[0]}")

Document 0: {388: 111, 5778: 21, 5660: 13, 178: 8, 6917: 1, 7186: 25, 847: 11, 8286: 5, 4059: 6, 5269: 7, 2203: 2, 7353: 3, 338: 8, 6994: 2, 748: 5, 2100: 1, 7255: 2, 5150: 1, 678: 10, 3738: 1, 7350: 42, 8082: 1, 8123: 2, 3600: 1, 493: 1, 1307: 7, 2760: 1, 5311: 1, 751: 8, 3701: 1, 5337: 23, 1680: 1, 3790: 6, 6433: 2, 3937: 7, 4462: 2, 7630: 12, 8169: 3, 1741: 7, 3002: 2, 4387: 4, 3834: 5, 390: 214, 2570: 7, 4388: 1, 6373: 1, 463: 3, 4108: 2, 1596: 10, 1541: 3, 7621: 5, 4993: 6, 5488: 1, 5258: 1, 3148: 1, 2453: 3, 7111: 2, 5491: 5, 8474: 6, 4055: 5, 3171: 1, 3167: 1, 8018: 1, 7889: 9, 2936: 3, 4996: 3, 2917: 3, 6819: 25, 7802: 24, 2282: 2, 3323: 1, 7547: 1, 500: 1, 3971: 3, 1622: 2, 7412: 1, 2439: 3, 1271: 1, 7179: 26, 3972: 26, 7095: 3, 1462: 1, 1638: 1, 1634: 1, 409: 14, 7505: 3, 2195: 1, 6558: 1, 3500: 1, 405: 1, 4710: 1, 8450: 14, 6730: 4, 5996: 1, 5915: 1, 575: 1, 4364: 3, 574: 1, 606: 1, 7230: 2, 6274: 1, 4591: 1, 4177: 1, 8213: 1, 4003: 1, 8533: 1, 4193: 1, 3140: 36, 4295: 15, 8

In [11]:
# For every term id, store the number of articles that contain the term.
# (Despite the variable name, the stored value is a plain document count.)

# Tokenise each article exactly once, keeping only its distinct terms, instead
# of re-splitting every article for every vocabulary term.
document_token_sets = [set(text.split()) for text in df["ARTICLE_TEXT"]]

# Tally, per term, how many articles it appears in
documents_containing = Counter()
for tokens in document_token_sets:
    documents_containing.update(tokens)

# Iterate over vocabulary_set so the resulting dict has the same keys and the
# same insertion order as a per-term scan of the vocabulary would produce.
inverse_document_frequency = {
    term_to_id[term]: documents_containing[term] for term in vocabulary_set
}

In [12]:
# Display the mapping of term id -> number of documents containing that term
inverse_document_frequency

{5568: 1,
 2505: 1,
 3987: 1,
 7706: 1,
 147: 1,
 8090: 1,
 5886: 2,
 7426: 2,
 7385: 1,
 5015: 1,
 6662: 2,
 4770: 1,
 4113: 1,
 1219: 1,
 7314: 1,
 5659: 1,
 4491: 1,
 6976: 1,
 3604: 2,
 7150: 2,
 4030: 2,
 6634: 1,
 7967: 1,
 4981: 1,
 1621: 1,
 6020: 3,
 2739: 3,
 4011: 3,
 4167: 2,
 2799: 1,
 221: 1,
 1562: 5,
 4555: 1,
 807: 3,
 770: 1,
 7526: 1,
 6958: 2,
 2746: 1,
 6762: 1,
 8475: 2,
 6494: 1,
 1060: 1,
 1268: 2,
 5344: 5,
 2808: 1,
 4717: 2,
 6019: 4,
 6839: 1,
 2106: 2,
 6362: 6,
 4445: 5,
 4244: 1,
 1696: 1,
 114: 3,
 1694: 1,
 3294: 1,
 2901: 1,
 4258: 1,
 3721: 1,
 531: 1,
 536: 1,
 4679: 1,
 6157: 2,
 5858: 1,
 2993: 1,
 5445: 1,
 5428: 1,
 1872: 1,
 6176: 2,
 7494: 3,
 1812: 1,
 1339: 7,
 6591: 1,
 6966: 2,
 3422: 1,
 8327: 1,
 612: 1,
 8233: 3,
 7820: 1,
 6889: 1,
 4930: 3,
 721: 1,
 5712: 1,
 2973: 1,
 5127: 1,
 1707: 1,
 4507: 1,
 4867: 1,
 3607: 1,
 3963: 1,
 5255: 1,
 7191: 1,
 6280: 1,
 3888: 2,
 5634: 1,
 6434: 5,
 6536: 4,
 6953: 1,
 297: 1,
 7822: 1,
 1531: 1,


In [13]:
# Weight the term frequencies of every document, keyed by document id
tf_idf_weights = {
    doc_id: get_tf_idf_weight(doc_tf, inverse_document_frequency)
    for doc_id, doc_tf in document_term_frequency.items()
}

# Print the tf/idf weights for each document
for doc_id, weights in tf_idf_weights.items():
    print(f"Document {doc_id}: {weights}")

Document 0: {388: 111.0, 5778: 5.25, 5660: 3.25, 178: 4.0, 6917: 1.0, 7186: 5.0, 847: 1.38, 8286: 5.0, 4059: 2.0, 5269: 0.88, 2203: 0.4, 7353: 3.0, 338: 1.0, 6994: 0.29, 748: 1.25, 2100: 0.2, 7255: 0.67, 5150: 1.0, 678: 1.67, 3738: 0.5, 7350: 7.0, 8082: 1.0, 8123: 2.0, 3600: 0.5, 493: 1.0, 1307: 1.0, 2760: 1.0, 5311: 0.5, 751: 2.67, 3701: 1.0, 5337: 23.0, 1680: 0.5, 3790: 1.5, 6433: 0.4, 3937: 0.88, 4462: 0.4, 7630: 1.71, 8169: 0.43, 1741: 1.17, 3002: 0.5, 4387: 4.0, 3834: 2.5, 390: 214.0, 2570: 2.33, 4388: 0.25, 6373: 0.5, 463: 3.0, 4108: 1.0, 1596: 10.0, 1541: 3.0, 7621: 5.0, 4993: 6.0, 5488: 1.0, 5258: 0.2, 3148: 0.5, 2453: 1.0, 7111: 0.33, 5491: 0.71, 8474: 1.0, 4055: 0.83, 3171: 1.0, 3167: 1.0, 8018: 0.2, 7889: 1.8, 2936: 0.75, 4996: 3.0, 2917: 3.0, 6819: 5.0, 7802: 4.0, 2282: 0.5, 3323: 1.0, 7547: 0.33, 500: 0.5, 3971: 3.0, 1622: 0.67, 7412: 1.0, 2439: 0.6, 1271: 0.33, 7179: 5.2, 3972: 26.0, 7095: 0.5, 1462: 0.25, 1638: 0.33, 1634: 0.2, 409: 7.0, 7505: 3.0, 2195: 0.14, 6558: 0.33

In [14]:
# Free-text query to score the documents against
query = "immigrant repair activity"
# Term counts of the query, keyed by the same term ids used for the documents
query_tf = get_term_frequency(query, term_to_id)
# Weight the query terms with the same function applied to the documents
query_tf_idf = get_tf_idf_weight(query_tf, inverse_document_frequency)

In [15]:
# Term id -> count for the query
query_tf

{3874: 1, 6484: 1, 122: 1}

In [16]:
# Term id -> weight for the query
query_tf_idf

{3874: 1.0, 6484: 0.5, 122: 0.2}

In [17]:
def dot_product(doc_tf_idf, query_tf_idf):
    """Return the dot product of two sparse weight vectors.

    Args:
        doc_tf_idf: Mapping of term id -> weight for a document.
        query_tf_idf: Mapping of term id -> weight for a query.

    Returns:
        Sum of ``doc_weight * query_weight`` over the term ids present in
        both mappings, accumulated in ``doc_tf_idf`` iteration order.
        Returns the integer ``0`` when the mappings share no term id.
    """
    total = 0
    for term_id, doc_weight in doc_tf_idf.items():
        if term_id not in query_tf_idf:
            # Terms absent from the query contribute nothing
            continue
        total += doc_weight * query_tf_idf[term_id]
    return total

In [18]:
# Score every document against the query: dot product of the document's
# weights with the query's weights, rounded to two decimals
query_relevance_scores = {
    doc_id: round(dot_product(tf_idf_weights[doc_id], query_tf_idf), 2)
    for doc_id in df["ARTICLE_ID"]
}

In [19]:
# Document id -> relevance score for the query
query_relevance_scores

{0: 0.04, 1: 0.24, 2: 0.08, 3: 0, 4: 4.29, 5: 0, 6: 0, 7: 0.82}

In [20]:
def print_top_n_pairs(dictionary, n):
    """Print the ``n`` entries of ``dictionary`` with the largest values.

    Args:
        dictionary: Mapping of document id -> score.
        n: Number of entries to print; also shown in the header line.

    Entries are ordered by descending value; entries with equal values keep
    their original relative order. Nothing is returned.
    """
    ranked = sorted(dictionary.items(), key=lambda item: item[1], reverse=True)

    print(f"Top {n} results:")
    for doc_id, score in ranked[:n]:
        print(f"Doc_ID: {doc_id}, Score: {score}")

# Print the three highest-scoring (document id, score) pairs for the query
print_top_n_pairs(query_relevance_scores, 3)


Top 3 results:
Doc_ID: 4, Score: 4.29
Doc_ID: 7, Score: 0.82
Doc_ID: 1, Score: 0.24
