# HW1

In [69]:
# Information taken from:
# ChatGPT
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
# https://www.tutorialspoint.com/python_text_processing/python_tokenization.htm
# https://www.geeksforgeeks.org/nlp-how-tokenizing-text-sentence-words-works/

from sklearn.datasets import fetch_20newsgroups
import string

newsgroups_data = fetch_20newsgroups(data_home=None, subset='all', categories=None, shuffle=False, remove=(), download_if_missing=True, return_X_y=False)


# Function to tokenize each document
def tokenize_document(document):
    # Remove punctuation and special characters
    translator = str.maketrans('', '', string.punctuation)
    document = document.translate(translator)
    
    # Split the text using whitespace as a delimiter
    terms = document.split()
    
    return terms

# Tokenize each document in the dataset
tokenized_documents = [tokenize_document(doc) for doc in newsgroups_data.data]

# Example: Print the tokenized words of the first document
print("Original Document:")
print(newsgroups_data.data[10047])

print("\nTokenized Words:")
print(tokenized_documents[1])





Original Document:
From: ld231782@longs.lance.colostate.edu (L. Detweiler)
Subject: Privacy & Anonymity on the Internet FAQ (2 of 3)
Supersedes: <net-privacy/part2_733153240@GZA.COM>
Organization: TMP Enterprises
Lines: 1543
Expires: 21 May 1993 04:00:06 GMT
Reply-To: ld231782@longs.lance.colostate.edu
NNTP-Posting-Host: pad-thai.aktis.com
Summary: Email and account privacy, anonymity, file encryption, 
 academic computer policies, relevant legislation and references, 
 EFF, and other privacy and rights issues associated with use of the
 Internet and global networks in general.
X-Last-Updated: 1993/03/04

Archive-name: net-privacy/part2
Last-modified: 1993/3/3
Version: 2.1


IDENTITY, PRIVACY, and ANONYMITY on the INTERNET

(c) 1993 L. Detweiler.  Not for commercial use except by permission
from author, otherwise may be freely copied.  Not to be altered. 
Please credit if quoted.

SUMMARY

Email and account privacy, anonymity, file encryption,  academic 
computer policies, relevant leg

In [70]:
# Information and code borrowed from these sources:
# https://www.geeksforgeeks.org/inverted-index/
# ChatGPT


def create_inverted_index(tokenized_documents):
    inverted_index = {}

    for doc_id, terms in enumerate(tokenized_documents):
        for term in set(terms):  # Use set to consider unique terms in a document
            if term in inverted_index:
                inverted_index[term].append(doc_id)
            else:
                inverted_index[term] = [doc_id]

    # Sort the postings in increasing order of document identifiers
    for term in inverted_index:
        inverted_index[term] = sorted(inverted_index[term])

    return inverted_index

# Create the inverted index
inverted_index = create_inverted_index(tokenized_documents)

# Example: Print the inverted index for a few terms
print("Inverted Index for 'computer':", inverted_index.get('chicago', []))
print("Inverted Index for 'science':", inverted_index.get('bears', []))


Inverted Index for 'computer': [1844, 6967, 7269, 9076]
Inverted Index for 'science': [135, 244, 706, 1923, 2642, 3285, 3412, 4207, 4989, 5615, 5924, 7866, 8641, 8870, 10624, 10766, 12057, 12308, 12945, 13453, 14234, 14511, 17180, 17442, 17585, 18345, 18671]


In [71]:
def intersect_lists(list1, list2):
    result = []
    i, j = 0, 0

    while i < len(list1) and j < len(list2):
        if list1[i] == list2[j]:
            result.append(list1[i])
            i += 1
            j += 1
        elif list1[i] < list2[j]:
            i += 1
        else:
            j += 1

    return result

def sort_words_by_postings(words, index):
    sorted_words = sorted(words, key=lambda word: len(index.get(word, [])))
    return sorted_words



# Example usage of intersect_lists function
result_intersection = intersect_lists(inverted_index.get('computer', []), inverted_index.get('science', []))
print("Intersection of 'computer' and 'science':", result_intersection)

# Example usage of sort_words_by_postings function
words_to_sort = ['computer', 'science', 'programming', 'data']
sorted_words = sort_words_by_postings(words_to_sort, inverted_index)
print("Sorted words based on postings:", sorted_words)


Intersection of 'computer' and 'science': [1332, 1502, 2044, 2826, 2891, 3524, 3631, 4146, 4270, 4889, 4938, 5225, 5806, 5857, 6070, 6798, 7106, 7132, 7141, 8376, 8776, 10047, 10057, 10303, 10323, 10607, 10908, 11298, 11307, 12963, 13046, 13135, 13407, 14041, 14433, 14603, 14664, 15118, 15403, 15687, 15738, 16226, 16425, 16473, 16895, 18418, 18717]
Sorted words based on postings: ['programming', 'science', 'computer', 'data']


In [72]:
def search(query, inverted_index):
    # Tokenize the query
    query_terms = tokenize_document(query)

    # Sort the query terms based on the length of postings in the index
    sorted_query_terms = sort_words_by_postings(query_terms, inverted_index)

    # Initialize the result with the postings of the first term
    result = inverted_index.get(sorted_query_terms[0], [])

    # Intersect the postings for each subsequent term
    for term in sorted_query_terms[1:]:
        result = intersect_lists(result, inverted_index.get(term, []))

    return result

# Example usage of the search function
query = "computer OR science"
search_results = search(query, inverted_index)
print("Search Results for query '{}':".format(query))
print(search_results)


Search Results for query 'computer OR science':
[10047]
