<h1>Import Packages:</h1>

In [5]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd
import string
import re
import os
from collections import Counter
import glob

<h1>Custom Tokenizer:</h1>

In [6]:
"""
This cell downloads the built-in stop words from NLTK and provides a function to split, process, and tokenize text.
Processing includes stemming each word using NLTK's SnowballStemmer and removing punctuation using a regex.
"""

#Download NLTK's english stopwords
nltk.download('stopwords')
stopwords1 = stopwords.words("english")

#Regex to remove punctuation and NLTK SnowballStemmer
#TODO: RE_PUNCT isn't great at handling when punctuation actually should be included.  Needs to be replaced with something
#      more robust
RE_PUNCT = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
stemmer = SnowballStemmer("english")


"""
Function Name:  preprocess()

Parameters:     
    text = Some string/collection of strings
    
Example:  preprocess("This is a string")

Output: List containing all of the processed/stemmed words, Type: List
"""

#TODO: Add section to handle malformed input
def preprocess(text):
    res = []
    tokens = text.split()
    for token in tokens:
        if token not in stopwords1 and len(token) > 3 and token != "top.location.href=location.href":
            token = RE_PUNCT.sub(" ", token)
            token = stemmer.stem(token)
            res.append(token)
    return res

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
"""
WARNING: This cell requires a folder named scripts in the current directory in order to function properly

This section loops through all files in the specified directory, passes all the text in each file through the preprocess()
function and then loads it into the text corpus.  It also creates another list called "files" that contains the all of the
filenames

"""

#TODO: Add input section for filepath to handle any folder location 

#scripts = [x[2] for x in os.walk(str(os.getcwd()) + "\\scripts")]
os.chdir("scripts")
try:
    corpus = []
    files = []
    for filename in glob.glob("*.txt"):
        file = open(filename, 'r')
        tokens = preprocess(file.read())
        text = ' '.join(tokens)
        corpus.append(text)
        files.append(filename)
        file.close()
    os.chdir("..")
except:
    os.chdir("..")

In [22]:
"""
Load the processed text into a Pandas DataFrame
"""

documents_df = pd.DataFrame(files, columns = ['filename'])
documents_df["documents"] = corpus

#TODO: The documents_cleaned column may be redundant, possibly remove
documents_df["documents_cleaned"] = documents_df.documents.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ', w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ', w).lower() not in stopwords1))
documents_df.head(10)

Unnamed: 0,filename,documents,documents_cleaned
0,10-Things-I-Hate-About-You.txt,written karen mccullah lutz amp kirsten smit...,written karen mccullah lutz amp kirsten smith ...
1,12-and-Holding.txt,written anthoni cipriano twin boys rudi jacob...,written anthoni cipriano twin boys rudi jacob ...
2,12-Monkeys.txt,origin screenplay david peopl janet peopl insp...,origin screenplay david peopl janet peopl insp...
3,12-Years-a-Slave.txt,written john ridley close pair black hand open...,written john ridley close pair black hand open...
4,127-Hours.txt,written simon beaufoy amp danni boyl massiv ...,written simon beaufoy amp danni boyl massiv cr...
5,1492-Conquest-of-Paradise.txt,roselyn bosch septemb 1991 start man eleg sli...,roselyn bosch septemb start man eleg slip...
6,15-Minutes.txt,word czech airline pan across word side plane...,word czech airline pan across word side plane ...
7,17-Again.txt,written jason filardi octob 2007 car scatter p...,written jason filardi octob car scatter p...
8,187.txt,scott yagemann revis shoot draft novemb 1996 e...,scott yagemann revis shoot draft novemb e...
9,2001-A-Space-Odyssey.txt,screenplay stanley kubrick arthur clark hawk f...,screenplay stanley kubrick arthur clark hawk f...


# <h1>Doc2Vec Build and Training:</h1>

In [7]:
"""
Much of the Doc2Vec code is sourced from the article referenced at the bottom of this notebook.  
This cell downloads NLTK's built-in tokenizer punkt and applies it to every document in documents_cleaned

TODO: Gain better understanding of hyperparameters for Doc2Vec (vector_size, alpha, min_count, etc.) and 
      possibly adjust to optimize model
"""

nltk.download('punkt')

#TODO:  Again, this tokenizer section may be redundant, possibly remove
tagged_data = [TaggedDocument(words=word_tokenize(doc), tags=[i]) for i, doc in enumerate(documents_df.documents_cleaned)]


model_d2v = Doc2Vec(vector_size=100,alpha=0.025, min_count=1)
model_d2v.build_vocab(tagged_data)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akim\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
"""
WARNING:  This cell may take a significant amount of time to run!  Depends on the number of epochs you specify and the specs of
          your machine
          
This section sets how many epochs the D2V model is trained for and then trains the model for that many epochs.
"""

epochs = input("Provide number of training epochs: ")
## 1 epoch takes about 4 minutes on my machine
for epoch in range(10):
    model_d2v.train(tagged_data,
                total_examples=model_d2v.corpus_count,
                epochs=model_d2v.epochs)

In [10]:
"""
This cell stores all of the document embeddings calculated from the doc2vec model into a list

"""
document_embeddings=np.zeros((documents_df.shape[0],100))
for i in range(len(document_embeddings)):
    document_embeddings[i]=model_d2v.docvecs[i]

  


# <h1>Similarity Calculation:</h1>

In [2]:
"""
The function displays the 5 most similar documents to a provided document based on a specified metric (cosine similarity,
or euclidean distance).

Function: most_similar(doc_id,similarity_matrix,matrix)

Parameters:
    doc_id = Index of document in "documents_df" DataFrame.  See Miscellaneous section at the bottom or search function 
             in User Input for more info on how to find doc_id
             
    similarity_matrix = Either pairwise_similarities (cosine similarity) or pairwise_differences(euclidean distance)
    
    matrix = Either "Cosine Similarity" as string, or "Euclidean Distance" as string
    
   
Example: most_similar(11,pairwise_similarities,'Cosine Similarity')

Output: Prints to console the document title followed by the title and similarity score of the 5 most similar documents

"""

#TODO: Add input section to specify how many similar documents the user wants to display
#TODO: Add section to handle malformed input
def most_similar(doc_id,similarity_matrix,matrix):
    print ('\n')
    print (f'Document: {documents_df.iloc[doc_id]["filename"]}')
    print ('\n')
    print ('Similar Documents:')
    print ('\n')
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[doc_id])[::-1]
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[doc_id])
    for ix in similar_ix[:6]:
        if ix==doc_id:
            continue
        print (f'Document: {documents_df.iloc[ix]["filename"]}')
        print (f'{matrix} : {similarity_matrix[doc_id][ix]}')
        print ('\n')

In [14]:
"""
Calculate the cosine similarities and euclidean distances for the vectors in document_embeddings
"""

pairwise_similarities=cosine_similarity(document_embeddings)
pairwise_differences=euclidean_distances(document_embeddings)

# <h1>User Input:</h1>

In [15]:
"""
The function takes in a document title as a string and returns its index in documents_df

Function: search(title)


Parameters:
    title = Document title as a string
    
Example: search("The Avengers")


Output: Index of the specified document if found, Type: Int. None if not found

"""

def search(title):
    word = title
    tokens = word.split()
    res = []
    for token in tokens:
        token = RE_PUNCT.sub("", token)
        res.append(token)
    boole = False
    title = ""
    for i in range(len(res)):
        if (res[i] == "The") and i == 0:
            boole = True
            continue
        cur = res[i]
        if i == len(res) -  1:
            if boole == True:
                title += cur + ",-The.txt"
            else:
                title += cur + ".txt"
        else:
            title += cur + "-"
    try:
        print(title)
        return documents_df[documents_df["filename"] == title].index.values.astype(int)[0]
    except:
        return

In [35]:
"""
This section requests a document name as input, then prints out the 5 most similar documents based on cosine similarity
and euclidean distance.

NOTE: These may not be the same lists of documents!
"""

term = input("Provide a Movie Title: ")
index = search(term)
try:
    most_similar(index,pairwise_similarities,'Cosine Similarity')
    print("-------------------------------------------------------------------------------------------")
    most_similar(index,pairwise_differences,'Euclidean Distance')
except:
    print("Movie not found")

Next.txt


Document: Next.txt


Similar Documents:


Document: TRON-Legacy.txt
Cosine Similarity : 0.4917086951760241


Document: Friday-the-13th.txt
Cosine Similarity : 0.4734951137488042


Document: Assassins.txt
Cosine Similarity : 0.4436002238968801


Document: Broken-Arrow.txt
Cosine Similarity : 0.44222796752546045


Document: Ninja-Assassin.txt
Cosine Similarity : 0.4271594428864344


-------------------------------------------------------------------------------------------


Document: Next.txt


Similar Documents:


Document: Ninja-Assassin.txt
Euclidean Distance : 19.98005259104115


Document: Assassins.txt
Euclidean Distance : 20.349210261699078


Document: Long-Kiss-Goodnight,-The.txt
Euclidean Distance : 20.453097332835984


Document: Hellboy-2-The-Golden-Army.txt
Euclidean Distance : 20.65944038413399


Document: Broken-Arrow.txt
Euclidean Distance : 20.93041360762031




# <h1>Export  to CSV Files:</h1>

In [24]:
"""
Adds a filename column to the document_embeddings DataFrame to mainting index-document association for future use
"""
newdf = pd.DataFrame(document_embeddings)
newdf["filename"] = documents_df["filename"]
newdf = newdf[[c for c in newdf if c in ["filename"]] + [d for d in newdf if d not in ["filename"]]]
newdf

Unnamed: 0,filename,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,10-Things-I-Hate-About-You.txt,-1.695494,-5.226159,-0.107940,-2.501784,2.781953,-0.499324,-2.424419,0.260161,-0.041970,...,-1.985297,0.162471,-3.304616,-0.091254,0.272466,0.801572,0.166671,-0.190785,2.598757,2.382200
1,12-and-Holding.txt,-3.933124,2.495834,1.707793,-0.178698,1.002381,-1.182322,-1.213156,-2.730105,0.773482,...,0.126593,3.623724,-3.633179,-2.412675,0.629609,-1.278768,1.284717,-0.511213,1.908127,-2.933471
2,12-Monkeys.txt,0.979229,-2.909092,1.554276,-1.672176,2.043409,-0.228946,1.858837,1.399809,0.539318,...,-0.941755,-0.401359,-5.139955,-1.320575,3.104685,1.147276,0.799477,-0.791513,3.614176,-4.387134
3,12-Years-a-Slave.txt,-1.986064,-0.311113,-0.578370,-0.629918,0.316444,-2.647151,-0.693074,-3.035693,-0.448140,...,-1.240939,-4.537358,-1.122413,4.196093,-0.300379,1.185855,-3.441469,3.037497,3.211800,-4.413497
4,127-Hours.txt,-0.021674,-3.155433,-1.103246,1.474490,-0.407338,0.853136,0.367200,-1.332900,-2.341174,...,-1.763430,0.074342,-3.135787,1.149290,-0.963436,1.913585,-0.457561,3.311870,0.910334,-1.389566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
949,You-Can-Count-On-Me.txt,-2.743547,1.553251,-0.321049,1.076263,-2.052466,-2.686030,-1.510546,-0.810123,-2.441458,...,0.009750,0.637446,-2.392711,-1.840004,1.098260,1.445133,-0.826735,1.655933,-2.039146,-0.266891
950,Youth-in-Revolt.txt,-1.558215,4.043688,0.004355,-0.479457,0.214844,1.738303,0.040430,-0.205494,-0.611698,...,-1.535653,4.131962,-1.593631,-2.437151,1.769728,1.533881,-3.058193,2.303478,1.352717,0.353871
951,Zero-Dark-Thirty.txt,-0.604094,-5.703767,-0.686788,-2.622568,1.095446,1.297923,0.445318,-0.818631,0.537166,...,0.929634,3.025182,-3.500828,1.804808,1.331443,3.945112,-0.667277,1.800655,0.676404,0.368759
952,Zerophilia.txt,-2.455432,-0.265861,3.578417,1.691035,1.314801,-1.981007,-4.344603,0.935932,-2.484688,...,-0.021298,1.165887,-1.294171,-1.052540,0.794281,3.936910,-0.500925,-0.956939,2.944508,0.212196


In [24]:
"""
Exports the documents_df and modified document_embeddings DataFrames to CSV files for use in other projects
"""
documents_df.to_csv("document_df.csv",index = False)
newdf.to_csv("document_embeddings.csv",index = False)

# <h1>Miscellaneous:</h1>

In [34]:
documents_df[documents_df["filename"] == "10-Things-I-Hate-About-You.txt"]

Unnamed: 0,filename,documents,documents_cleaned
0,10-Things-I-Hate-About-You.txt,written karen mccullah lutz amp kirsten smit...,written karen mccullah lutz amp kirsten smith ...


# <h1>Sources:</h1>

### Doc2Vec:

https://towardsdatascience.com/calculating-document-similarities-using-bert-and-other-models-b2c1a29c9630