In [10]:
import warnings
warnings.filterwarnings("ignore")

Feel free to alter the values of the constants (`FILE_MATCHES`, `SENTENCE_MATCHES`, and `CORPUS`) declared at the end of the next cell.

In [34]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import sys
import os
import math
import string
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance; process() uses it to reduce words to their stems.
ps = PorterStemmer()

# Number of top-ranked files main() asks relevant_documents() for.
FILE_MATCHES = 1  # only the single best-matching file
# Number of top-ranked sentences main() asks relevant_sentences() for.
SENTENCE_MATCHES = 1  # only the single best-matching sentence
# Directory that main() passes to load_files().
CORPUS= 'testCorpus' # provided test directories: one, testCorpus, corpus


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anerud001/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/anerud001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
def load_files(directory):
    """
    Read every `.txt` file that sits directly inside `directory`.

    Returns a dictionary whose keys are the filenames (not the full paths)
    and whose values are the complete file contents, decoded as UTF-8.
    Entries that are not regular files, or whose name does not end in
    `.txt`, are ignored.
    """
    contents = {}
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        # Skip anything that is not a regular `.txt` file.
        if not name.endswith(".txt") or not os.path.isfile(path):
            continue
        with open(path, "r", encoding='utf8') as handle:
            contents[name] = handle.read()
    # Debugging tip: on a small corpus it can help to print `contents` once
    # to see what is being loaded, then remove the print again.
    return contents

In [22]:
def process(document):
    """
    Given a document (represented as a string), return a list of the
    normalized words it contains, sorted alphabetically.

    Each token produced by `nltk.word_tokenize` is:
      1. converted to lowercase,
      2. stripped of punctuation characters,
      3. dropped if it is empty or an English stopword,
      4. reduced to its stem with the module-level Porter stemmer `ps`.

    Stopwords are filtered BEFORE stemming. The Porter stemmer rewrites some
    stopwords (for example "this" becomes "thi" and "was" becomes "wa"), so
    checking only the stemmed form would let those words slip through.
    The stemmed form is checked as well, so anything that only becomes a
    stopword after stemming is still removed.

    Duplicates are kept (callers count occurrences); only the order is
    changed by the final sort.
    """
    punctuation = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words("english"))
    clean_words = []
    for token in nltk.word_tokenize(document):
        lowered = token.lower()
        # Remove punctuation first so tokens such as "'s" reduce to "s" and
        # pure-punctuation tokens reduce to the empty string.
        stripped = ''.join(ch for ch in lowered if ch not in punctuation)
        if not stripped:
            continue
        # Filter on the unstemmed word; see the docstring for why.
        if lowered in stop_words or stripped in stop_words:
            continue
        stemmed = ps.stem(stripped)
        if not stemmed or stemmed in stop_words:
            continue
        clean_words.append(stemmed)
    # Debugging tip: on a small corpus it can help to print the result once
    # to see what is being produced, then remove the print again.
    return sorted(clean_words)
        

In [23]:
def calculate_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.

    Every word that appears in at least one document gets an entry. The IDF
    of a word is computed as

        log(1 + total_documents / documents_containing_word)

    using the natural logarithm.

    The work happens in two passes: first count, for every word, how many
    documents contain it (each document counts at most once per word), then
    turn those counts into IDF values.
    """
    total_documents = len(documents)

    # Pass 1: document frequency of every word.
    document_frequency = {}
    for words in documents.values():
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # a word repeated inside one document is only counted once.
        for term in dict.fromkeys(words):
            document_frequency[term] = document_frequency.get(term, 0) + 1

    # Pass 2: convert the counts into IDF values.
    idfs = {}
    for term, frequency in document_frequency.items():
        idfs[term] = math.log(1+ (total_documents/frequency))
    return idfs

In [37]:
def relevant_documents(query, files, idfs, n):
    """
    Rank files against a query by tf-idf and return the best `n` filenames.

    `query` is a set of words, `files` maps each filename to the list of
    words in that file, and `idfs` maps words to their IDF values. A file's
    score is the sum, over the query words, of

        (occurrences of the word in the file) * (IDF of the word)

    The result is a list of filenames ordered from highest to lowest score;
    files with equal scores keep the order they have in `files`.

    Raises KeyError if `files` is non-empty and a query word has no entry
    in `idfs`.
    """
    # Score every file; idfs[term] is looked up unconditionally so an unknown
    # query word surfaces as a KeyError rather than silently scoring zero.
    scores = {
        name: sum(words.count(term) * idfs[term] for term in query)
        for name, words in files.items()
    }
    # sorted() is stable, so ties stay in their original relative order.
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:n]

In [41]:
def relevant_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to idf. If there are ties, preference should
    be given to sentences that have a higher query term density.

    Scoring for each sentence:
      * matching word measure: the sum of the IDF values of every query
        word that occurs in the sentence's word list;
      * query term density: the fraction of the sentence's words that are
        query words.

    Matching is done against the processed word list stored as the
    dictionary VALUE, not against the raw sentence text used as the key.
    Query words are lowercased and stemmed, so comparing them with the raw
    text would miss real matches (e.g. "scoobydoo" vs "Scooby-Doo") and
    produce accidental substring hits (e.g. "cat" inside "concatenate").

    Sentences with equal scores keep the order they have in `sentences`.
    Raises KeyError if a query word occurs in a sentence but has no entry
    in `idfs`.
    """
    query_words = set(query)
    scored = []
    for sentence, words in sentences.items():
        present = set(words)
        # Cumulative IDF of the distinct query words found in this sentence.
        idf_total = sum(idfs[word] for word in query_words if word in present)
        # Share of the sentence's words that belong to the query; guarded so
        # an empty word list scores zero instead of dividing by zero.
        if words:
            density = sum(1 for word in words if word in query_words) / len(words)
        else:
            density = 0
        scored.append((sentence, idf_total, density))
    # Highest IDF sum first; density breaks ties. The sort is stable.
    scored.sort(key=lambda item: (item[1], item[2]), reverse=True)
    return [sentence for sentence, _, _ in scored[:n]]

In [48]:
def main():
    """
    Answer a single query typed by the user.

    Steps:
      1. Load every file in CORPUS, tokenize it with process(), and compute
         IDF values across the files.
      2. Read the query from standard input and tokenize it the same way.
      3. Pick the FILE_MATCHES best files with relevant_documents(). If that
         raises KeyError, print an apology and stop.
      4. Split the chosen files into sentences, tokenize each one, and
         compute IDF values across those sentences.
      5. Print the SENTENCE_MATCHES best sentences from relevant_sentences().
    """
    # Step 1: per-file word lists and file-level IDF values.
    files = load_files(CORPUS)
    file_words = {}
    for filename, text in files.items():
        file_words[filename] = process(text)
    # Debugging tip: print `file_words` once to see its shape, then remove it.
    file_idfs = calculate_idfs(file_words)

    # Step 2: the query goes through the same processing as the documents.
    query = set(process(input("Query: ")))

    # Step 3: best-matching files according to TF-IDF.
    try:
        filenames = relevant_documents(
            query, file_words, file_idfs, n=FILE_MATCHES
        )
    except KeyError:
        print("We cannot help for that request")
        return

    # Step 4: map every sentence of the chosen files to its word list.
    sentences = {}
    for filename in filenames:
        for passage in files[filename].split("\n"):
            for sentence in nltk.sent_tokenize(passage):
                tokens = process(sentence)
                if not tokens:
                    # Nothing left after processing; not worth ranking.
                    continue
                sentences[sentence] = tokens
    idfs = calculate_idfs(sentences)

    # Step 5: print the best-matching sentences, one per line.
    for match in relevant_sentences(query, sentences, idfs, n=SENTENCE_MATCHES):
        print(match)
    


Before you start to code, run this with CORPUS set to `one` and then to `testCorpus` so you understand the output that will be used as input. Then comment out the print statements.

In [52]:
# Runs the program: prompts for a query and prints the best-matching sentence(s).
# Comment this call out if you do not want it to run when the cell executes.
main()

Query: how is scooby-doo
Scooby-Doo is an American animated franchise comprising many animated television series produced from 1969 to the present, as well as their derivative media.
