In [5]:
# This script contains the solution for task (a).
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# New dependencies and related stuff below:
import nltk
nltk.download('punkt_tab') # I need to do this on my computer, may not be required for everyone
from nltk.stem import PorterStemmer 
from nltk import word_tokenize
ps = PorterStemmer()

### DATA

# Tiny in-memory corpus; the __main__ guard indexes it by default.
example_documents = ["This is a silly example",
                     "A better example",
                     "Nothing to see here",
                     "This is a great and long example"]
# Paths to Wikipedia corpus files (100 and 1000 articles, judging by the file names).
# NOTE(review): nothing in the code shown here reads these paths -- they are only
# mentioned in a commented-out line of the __main__ guard.
small_wiki = "wiki_files/enwiki-20181001-corpus.100-articles.txt"
large_wiki = "wiki_files/enwiki-20181001-corpus.1000-articles.txt"

# This is the function for part 2 task a
def stemming(documents):
    """Tokenize a text and return the Porter stem of every token.

    Despite the plural parameter name, the argument is handed straight to
    ``word_tokenize``, so it is expected to be a single text string.
    ``tf_document_setup`` passes this function to ``TfidfVectorizer`` as its
    ``tokenizer``.

    Returns:
        A list with one stemmed string per token, in token order.
    """
    # Tokenize first, then stem each token with the module-level PorterStemmer.
    return list(map(ps.stem, word_tokenize(documents)))

# Document setup using TfidfVectorizer
def tf_document_setup(documents):
    """Fit a TF-IDF model on ``documents`` and build the term-document matrix.

    Args:
        documents: Iterable of raw text strings, one per document.

    Returns:
        A ``(tf_matrix, tfv)`` pair. ``tf_matrix`` is the dense, transposed
        TF-IDF matrix (rows are vocabulary terms, columns are documents) and
        ``tfv`` is the fitted ``TfidfVectorizer``, needed later to map queries
        into the same term space.
    """
    tfv = TfidfVectorizer(
        tokenizer=stemming,  # stem tokens so inflected forms share one feature
        # token_pattern is ignored once a tokenizer is supplied; passing None
        # says so explicitly and avoids scikit-learn's unused-parameter warning.
        token_pattern=None,
        lowercase=True,
        sublinear_tf=True,
        use_idf=True,
        norm="l2",
    )
    # fit_transform yields documents x terms; transpose so that a 1 x terms
    # query vector can be multiplied directly against the result.
    tf_matrix = tfv.fit_transform(documents).T.todense()
    return tf_matrix, tfv

def user_query():
    """Prompt the user for a search query and return what was typed.

    A blank line is printed before the prompt and another after the answer
    so that consecutive queries are visually separated.

    Returns:
        The raw string read from standard input (without the newline).
    """
    prompt = "Please Enter your query, type 'quit' to exit: "
    print()
    answer = input(prompt)
    print()
    return answer

def input_checker(user_input):
    """Decide whether the search loop should continue for ``user_input``.

    Args:
        user_input: The string the user typed at the prompt.

    Returns:
        ``False`` (after printing "Exit") when the input is exactly "quit"
        or the empty string; ``True`` for anything else.
    """
    if user_input not in ("quit", ""):
        return True
    print("Exit")
    return False

# Compute cosine similarity scores
def retrieve_matches(query, tf_matrix, tfv):
    """Score every indexed document against ``query``.

    Args:
        query: Raw query string.
        tf_matrix: Term-by-document matrix as built by ``tf_document_setup``.
        tfv: Fitted vectorizer whose ``transform`` maps the query into the
            same term space as ``tf_matrix``.

    Returns:
        The product of the dense query row vector with ``tf_matrix``: one
        score per document. When the vectorizer L2-normalizes its output (as
        ``tf_document_setup`` configures it), each score is a cosine similarity.
    """
    # Vectorize the single query as a one-row dense vector.
    query_vector = tfv.transform([query]).todense()
    return np.dot(query_vector, tf_matrix)

def tf_print_retrieved(scores, documents, print_limit=2):
    """Print the best-scoring documents for a query, highest score first.

    Args:
        scores: 2-D array or matrix whose first row holds one score per
            document (the shape ``retrieve_matches`` returns).
        documents: Sequence of document texts, indexed like the columns of
            ``scores``.
        print_limit: Maximum number of result lines to print; ``None`` prints
            every match. Defaults to 2, the value that used to be hard-coded.

    Prints "No matching document" when no score is positive. Otherwise prints
    the total number of matches followed by up to ``print_limit`` result
    lines.
    """
    score_table = np.asarray(scores)
    positive = []
    if score_table.size:
        positive = [
            (score, doc_id)
            for doc_id, score in enumerate(score_table[0])
            if score > 0
        ]

    if not positive:
        print("No matching document")
        return

    # Tuples compare by score first and document index second, so reverse
    # order puts the highest score first.
    ranked = sorted(positive, reverse=True)
    print(f"Found {len(ranked)} matches:")
    for score, doc_id in ranked[:print_limit]:
        print(f"The score of '{documents[doc_id]}' is {score:.4f} (Document {doc_id + 1})")
            
def main():
    """Run the interactive search loop until the user quits.

    Relies on the module-level names ``documents``, ``tf_matrix`` and ``tfv``
    that the ``__main__`` guard assigns before calling this function.
    """
    while True:
        user_input = user_query()
        if not input_checker(user_input):
            break
        scores = retrieve_matches(user_input, tf_matrix, tfv)
        # Show texts from the collection that was actually indexed; using the
        # fixed example list here would mislabel results (or raise IndexError)
        # as soon as a different corpus is selected.
        tf_print_retrieved(scores, documents)

if __name__ == "__main__":
    # Choose the corpus to index. To search the small wiki file instead, enable the
    # commented-out assignment below and comment out the example_documents one.
    # NOTE(review): extract_wiki_articles is not defined in the code shown here --
    # confirm it is available before enabling that line.
    #documents = extract_wiki_articles(small_wiki)
    documents = example_documents # default corpus: the built-in example sentences
    # Build the TF-IDF index once; main() reads tf_matrix and tfv as module-level globals.
    tf_matrix, tfv = tf_document_setup(documents)  
    main()


[nltk_data] Downloading package punkt_tab to /home/tkzang/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!





Please Enter your query, type 'quit' to exit:  examples



Found 3 matches:
The score of 'A better example' is 0.4738 (Document 2)
The score of 'This is a silly example' is 0.3650 (Document 1)



Please Enter your query, type 'quit' to exit:  quit



Exit
