In [1]:
# Import Necessary Libraries and Packages
import sys  
import csv
import spacy
from time import process_time

# Load the spaCy pipeline once; every plot is parsed with it further down and
# the resulting documents are compared with each other via .similarity().
nlp = spacy.load("en_core_web_lg")

# Set global variables
# debug: when True, the cells and helper functions below print timing information.
debug = False

In [2]:
# Switch timing output on for the cells below (overrides the default assigned
# in the first cell).
debug = True

In [23]:

# Time the whole import when debugging is enabled
if debug:
    start_time = process_time()

# Set the appropriate input file
movie_plots_import_file = "wiki_movie_plots_sample.csv"

# Positions of the two CSV columns that are kept for each movie
TITLE_COLUMN = 1
PLOT_COLUMN = 7

# Index of the entry printed as a sanity check once loading has finished
EXAMPLE_INDEX = 100

# Setup the plots array: one (title, parsed plot) tuple per movie
plots = []
count = 0

# Open import file and export all plots and titles into plots[].
# newline='' is what the csv module expects, so that line breaks embedded in
# quoted plot text are handed to the parser untouched.
with open(movie_plots_import_file, 'r', encoding='utf-8', newline='') as movie_plot_csv_file:
    reader = csv.reader(movie_plot_csv_file)
    next(reader, None)  # Skip the csv header row

    # Stream (plot text, title) pairs through nlp.pipe so spaCy can batch the
    # parsing instead of being invoked once per row; the title rides along as
    # context and comes back paired with its parsed document.
    text_title_pairs = ((row[PLOT_COLUMN], row[TITLE_COLUMN]) for row in reader)
    for doc, title in nlp.pipe(text_title_pairs, as_tuples=True):
        plots.append((title, doc))
        count += 1
        if count % 500 == 0:
            print(count, " movies processed")

print("Processing Done")

if debug:
    end_time = process_time()
    elapsed = end_time - start_time
    print("Time to process ", count, " records: ", elapsed)
    # An empty input file would otherwise end the cell with a ZeroDivisionError
    if count:
        print("Average processing timer per record: ", elapsed / count)

# Print example when finished (only if the file held enough movies)
if len(plots) > EXAMPLE_INDEX:
    print(plots[EXAMPLE_INDEX])
else:
    print("Only", len(plots), "plots loaded; no example at index", EXAMPLE_INDEX)


500  movies processed
1000  movies processed
1500  movies processed
2000  movies processed
2500  movies processed
3000  movies processed
Processing Done
Time to process  3248  records:  195.734375
Average processing timer per record:  0.060263046490147784
('Should a Woman Divorce?', Grace Roberts (played by Lea Leland), marries rancher Edward Smith, who is revealed to be a neglectful, vice-ridden spouse. They have a daughter, Vivian. Dr. Franklin (Leonid Samoloff) whisks Grace away from this unhappy life, and they move to New York under aliases, pretending to be married (since surely Smith would not agree to a divorce). Grace and Franklin have a son, Walter (Milton S. Gould). Vivian gets sick, however, and Grace and Franklin return to save her. Somehow this reunion, as Smith had assumed Grace to be dead, causes the death of Franklin. This plot device frees Grace to return to her father's farm with both children.[1])


In [19]:
# Define generic similarity function
def simularity_printer(firstId, secondId):
    """Print the similarity score of two entries of the global ``plots`` list.

    Args:
        firstId: Index into ``plots`` of the first movie.
        secondId: Index into ``plots`` of the second movie.

    When the global ``debug`` flag is set, the CPU time spent on the lookup,
    the comparison and the print is reported as well.
    """
    started_at = process_time() if debug else None

    first_title, first_doc = plots[firstId]
    second_title, second_doc = plots[secondId]
    score = first_doc.similarity(second_doc)
    print("Similarity of ", first_title, " and ", second_title, ":", score)

    if debug:
        print("Simularity calculation runtime ", process_time() - started_at)

# Define generic 'best match' function
def best_match(id):
    """Print the title of the plot most similar to ``plots[id]``.

    Every other entry of the global ``plots`` list is scored against the
    chosen plot with ``.similarity()`` and the highest-scoring title is
    printed together with its score. When the global ``debug`` flag is set,
    the total and per-film search times are printed too.

    Args:
        id: Index into ``plots`` of the movie to find a match for. (The name
            shadows the ``id`` builtin; it is kept so existing calls work.)

    Raises:
        IndexError: If ``id`` is not a valid index into ``plots``.
    """
    if debug:
        start_time = process_time()

    target = plots[id]
    target_doc = target[1]

    # Start without a score rather than at 0, so a best candidate is still
    # reported when every similarity is zero or negative.
    best_score = None
    best_title = "None"
    for candidate in plots:
        # Skip the query plot itself; an identity check avoids comparing
        # title strings on every iteration.
        if candidate is target:
            continue
        score = target_doc.similarity(candidate[1])
        if best_score is None or score > best_score:
            best_score = score
            best_title = candidate[0]

    if best_score is None:
        # No other plot to compare against: report the same "None" / 0 pair
        # this function has always printed in that case.
        best_score = 0

    print("Best match for ",target[0]," was: ",best_title," with score of ", best_score)
    if debug:
        end_time = process_time()
        print("Time to find best match from ", len(plots), " films: ", end_time - start_time)
        print("Average search time per film: ", (end_time-start_time)/len(plots))

In [21]:
# Print some tests/examples
# Pairwise comparisons to print, as (first index, second index) into plots
example_pairs = [(10, 250), (10, 300), (27, 72), (27, 207), (27, 270)]
for first_index, second_index in example_pairs:
    simularity_printer(first_index, second_index)

# Movies to look up the closest plot for
for plot_index in (2000, 2500, 3000):
    best_match(plot_index)

Similarity of  Dream of a Rarebit Fiend  and  Which Woman? : 0.9371971157790536
Simularity calculation runtime  0.0
Similarity of  Dream of a Rarebit Fiend  and  The Turn in the Road : 0.9343359550730539
Simularity calculation runtime  0.0
Similarity of  A Christmas Carol  and  Dough and Dynamite : 0.9311458919462225
Simularity calculation runtime  0.0
Similarity of  A Christmas Carol  and  Face Value : 0.9541526327456455
Simularity calculation runtime  0.0
Similarity of  A Christmas Carol  and  The Girl Who Stayed at Home : 0.9412193928939658
Simularity calculation runtime  0.0
Best match for  Top Hat  was:  The Awful Truth  with score of  0.9830821187041965
Time to find best match from  34886  films:  0.28125
Average search time per film:  8.061973284412086e-06
Best match for  At the Circus  was:  Gold Diggers in Paris  with score of  0.9813152325737858
Time to find best match from  34886  films:  0.234375
Average search time per film:  6.718311070343404e-06
Best match for  The Big S

In [22]:
# Look up the closest plot for the movie stored at index 2915 of plots
best_match(2915)

Best match for  Rangers of Fortune  was:  Lonesome Dove  with score of  0.9851557247821526
Time to find best match from  34886  films:  0.25
Average search time per film:  7.166198475032965e-06


In [6]:

# Take plots[10] and plots[100], remove the stopwords, and compare the
# similarity of the filtered texts with the similarity of the original plots.
# TODO: Remove stopwords from ALL plots

def remove_stopwords(doc):
    """Return the text of ``doc`` with its stopword tokens dropped.

    Args:
        doc: A parsed plot, as stored in the second slot of each ``plots`` entry.

    Returns:
        str: The texts of the remaining tokens joined by single spaces.
    """
    # token.is_stop reads the flag straight from the token, so no separate
    # vocabulary lookup per word is needed.
    return ' '.join(token.text for token in doc if not token.is_stop)

# Raw token texts of both plots (stopwords still included)
first_token_list = [token.text for token in plots[10][1]]
second_token_list = [token.text for token in plots[100][1]]

# New strings combining all the tokens (words) that are not stopwords
first_filtered_plot_str = remove_stopwords(plots[10][1])
second_filtered_plot_str = remove_stopwords(plots[100][1])

# Print standard simularity with stopwords
simularity_printer(10, 100)

# Print Filtered Stopword similarity to test stopword removal impact
print("Filtered Stopwords: ", nlp(first_filtered_plot_str).similarity(nlp(second_filtered_plot_str)))



Similarity of  Dream of a Rarebit Fiend  and  Should a Woman Divorce? : 0.8760381240648998
Filtered Stopwords:  0.7202336502162601


In [49]:
# Experiments with bag of words for keyword group mining
from bitarray import bitarray
from apyori import apriori


# Build the corpus lexicon: lemma -> number of occurrences over all plots.
# Stopwords, punctuation and bare newline tokens are left out. Keys keep the
# order in which each lemma is first seen.
lexicon = {}
for _title, parsed_plot in plots:
    for tok in parsed_plot:
        lexeme = nlp.vocab[tok.text]
        if lexeme.is_stop or lexeme.is_punct or tok.text == "\n":
            continue
        lexicon[tok.lemma_] = lexicon.get(tok.lemma_, 0) + 1

print(list(lexicon)[0:25])
print(len(lexicon))

# For every plot build two parallel views over the lexicon:
#   plot_word_bags  - one bit per lexicon entry (1 = lemma occurs in the plot)
#   plot_word_lists - the lexicon lemmas that occur in the plot, in lexicon order
plot_word_bags = []
plot_word_lists = []

for _title, parsed_plot in plots:
    lemmas_in_plot = {
        tok.lemma_
        for tok in parsed_plot
        if not nlp.vocab[tok.text].is_stop and not nlp.vocab[tok.text].is_punct
    }

    bit_vector = bitarray()
    present_lemmas = []
    for lemma in lexicon:
        if lemma in lemmas_in_plot:
            bit_vector.append(1)
            present_lemmas.append(lemma)
        else:
            bit_vector.append(0)

    plot_word_bags.append(bit_vector)
    plot_word_lists.append(present_lemmas)

# Optional manual inspection of a single plot's bag of words
plot_num = 400
#print(plots[plot_num][0])
#print(plot_word_bags[plot_num])
#for i in range(0, len(lexicon)):
#    if plot_word_bags[plot_num][i]:
#        print(list(lexicon.keys())[i])

# Ignore compact bit vectors for now, see what happens with an out of the box apriori algorithm
results = list(apriori(plot_word_lists))
print("Apriori Test")
print("Total results: ", len(results))

# Only item sets made up of at least three lemmas are shown
for itemset in (record.items for record in results if len(record.items) >= 3):
    print(itemset)

['bartender', 'work', 'saloon', 'serve', 'drink', 'customer', 'fill', 'stereotypically', 'irish', 'man', 'bucket', 'beer', 'Carrie', 'Nation', 'follower', 'burst', 'inside', 'assault', 'pull', 'hat', 'eye', 'dump', 'head', 'group', 'begin']
28380
Apriori Test
Total results:  487


TypeError: '>=' not supported between instances of 'frozenset' and 'int'