### As before, we import the libraries and code bases we need

### This time, inverted_index and utils are my own modules... Note that source files used this way have to live alongside the Jupyter notebook, in the location it is being run from...

In [None]:
# first install the required packages
!pip install nltk
!pip install scipy
!pip install numpy
import nltk
nltk.download('stopwords')


from inverted_index import InvertedIndex
from utils import read_data
inv_ind = InvertedIndex()

### Add documents below. read_data scans the directory passed to it for any files ending in ".txt" and reads them in as a single string.

In [None]:
# Load the corpus: read_data is pointed at the ./data directory.
documents = read_data("./data")
# NOTE(review): this prints the whole collection returned by read_data,
# which may be a lot of output if the files are large.
print(documents)

### Print out number of documents and document titles

In [None]:
# How many documents were loaded?
print(len(documents))

# List each document's first element (shown in this notebook as its title).
for doc in documents:
    print(doc[0])

### Next, we will add all these documents to our Inverted Index...

In [None]:
# Feed every loaded document into the inverted index, echoing its
# first element (the title) so progress is visible while indexing.
for doc in documents:
    print(doc[0])
    inv_ind.add_document(doc)

### Print out some descriptives, total terms indexed and documents...

In [None]:
# Descriptives for the index built so far: first the total number of
# terms indexed, then the total number of documents.
print(inv_ind.get_total_terms())
print(inv_ind.get_total_docs())

### Print out the inverted index itself....

In [None]:
# Dump the index via InvertedIndex's own print() method.
inv_ind.print()

### Just for interest's sake, you can see nltk's built in stop word list

In [None]:
# nltk is already imported (and its 'stopwords' corpus downloaded) in the
# setup cell at the top of the notebook, so no second import is needed here.
print(nltk.corpus.stopwords.words('english'))

### From this we can generate a term by document matrix

In [None]:
# Build the term-by-document matrix from the current index and print it.
print(inv_ind.generate_term_by_doc_matrix())

#### We can compute the LogEntropy values for everything in the Inverted Index
#### Display new values

In [None]:
# Compute the LogEntropy values for the index first...
inv_ind.calcLogEntropy()
# ...then request the matrix with log_entropy enabled. As the last expression
# in the cell, its value is what the notebook displays.
inv_ind.generate_term_by_doc_matrix(log_entropy = True)

### Let's do a search....

In [None]:
# Run a free-text query against the index with log_entropy enabled
# and print each returned result on its own line.
results = inv_ind.search("scotland kings and thanes", log_entropy = True)
for hit in results:
    print(hit)


### We can deal with boolean queries too... let's get some data.

In [None]:
# Look up the document set for each term of interest.
obama = inv_ind.get_document_set_from_term("Obama")
trump = inv_ind.get_document_set_from_term("Trump")
# NOTE(review): "Bernie Sanders" is two words passed to a single-term lookup;
# confirm InvertedIndex handles multi-word input as intended.
bernie = inv_ind.get_document_set_from_term("Bernie Sanders")

# Show the three sets, one per line.
print(obama, trump, bernie, sep="\n")

### We have sets containing the information for Obama, Trump and Sanders
### Which documents discuss all 3?

In [None]:
# Boolean AND across all three terms: keep only what is common to the
# obama, trump and bernie sets.
print(obama & trump & bernie)

### Let's use a larger dataset, we will use the complete works 
### of Shakespeare next... First, load it in.

In [None]:
# Switch corpus: load the files under ./shakespeare, replacing the
# previously loaded documents.
documents = read_data("./shakespeare")

# Rebind inv_ind to a brand-new InvertedIndex for this corpus.
inv_ind = InvertedIndex()

# List the first element (title) of everything that was loaded.
for doc in documents:
    print(doc[0])

### Next, add them to the Inverted Index...
### Note some of these documents are 25,000 - 35,000 words...

In [None]:
# Add each loaded document to the new index, printing its first element
# (the title) as it goes so progress is visible.
for doc in documents:
    print(doc[0])
    inv_ind.add_document(doc)

### We can reproduce the boolean search given in the lecture slides...

### Also see how many terms we have...

In [None]:
# Total number of terms now held in the index.
print(inv_ind.get_total_terms())

# Document sets for the three terms used in the boolean example.
caes = inv_ind.get_document_set_from_term("Caesar")
brut = inv_ind.get_document_set_from_term("Brutus")
cap = inv_ind.get_document_set_from_term("Calpurnia")

# Show each set on its own line.
print(caes, brut, cap, sep="\n")

### Using simple set operations...
### Which are the plays that have Caesar AND Brutus but *NOT* Calpurnia

In [None]:
# (Caesar AND Brutus) NOT Calpurnia: intersect the first two sets, then
# remove anything that also appears in the Calpurnia set.
print((caes & brut) - cap)


### Let's generate a plain (non-TFIDF) term by document matrix for queries. The TFIDF calculation is left commented out for future work.

In [None]:
# The TF-IDF calculation below is currently commented out, so calcTFIDF()
# is not run in this cell.
#inv_ind.calcTFIDF()
# Generate the term-by-document matrix with tfidf switched off. As the last
# expression in the cell, its value is what the notebook displays.
inv_ind.generate_term_by_doc_matrix(tfidf = False)

### Now, let's do a query about a Shakespeare play...

In [None]:
# Same free-text query as before, now against the Shakespeare index with
# tfidf switched off; print each returned result on its own line.
results = inv_ind.search("scotland kings and thanes", tfidf = False)
for hit in results:
    print(hit)

### Is there anything weird going on here?

In [None]:
# Inspect which documents the index associates with the term "Kings".
# NOTE(review): the search query above used lowercase "kings"; whether both
# spellings map to the same index entry depends on how InvertedIndex
# normalises terms.
king_docs = inv_ind.get_document_set_from_term("Kings")
print(king_docs)

In [None]:
# Fetch and print the postings list stored for the term "Kings".
king_pl = inv_ind.get_postings_list_from_term("Kings")
print(king_pl)

In [None]:
# Fetch and print the postings list stored for the term "Scotland".
scotland_pl = inv_ind.get_postings_list_from_term("Scotland")
print(scotland_pl)

In [None]:
# Fetch and print the postings list stored for the term "Thane".
# NOTE(review): the search query used the plural "thanes" while this looks up
# the singular "Thane"; whether they resolve to the same index entry depends
# on InvertedIndex's term normalisation.
thane_pl = inv_ind.get_postings_list_from_term("Thane")
print(thane_pl)