In [1]:
# Import Necessary Libraries and Packages
import sys  
import csv
import spacy
from time import process_time

nlp = spacy.load("en_core_web_lg")

# Set global variables
debug = False

In [2]:
debug = True

In [3]:

if debug:
    start_time = process_time()

# Set the appropriate input file
# movie_plots_import_file = "wiki_movie_plots_sample.csv" # 3248 Movies
movie_plots_import_file = "wiki_movie_plots_deduped.csv" #34,892 Movies


# Setup the plots array
plots = []

# Open import file and export all plots and titles into plots[]
with open(movie_plots_import_file, 'r', encoding='utf-8') as movie_plot_csv_file:
    reader = csv.reader(movie_plot_csv_file)
    next(reader, None)  # Skip the csv header row
    count=0

    for row in reader:
        plots.append((row[1],nlp(row[7])))
        count = count + 1
        if(count % 500 == 0):
            print(count, " movies processed")
            
print("Processing Done")

if debug:
    end_time = process_time()
    print("Time to process ", count, " records: ", end_time - start_time)
    print("Average processing timer per record: ", (end_time- start_time)/count)

# Print example when finished
print(plots[100])


500  movies processed
1000  movies processed
1500  movies processed
2000  movies processed
2500  movies processed
3000  movies processed
3500  movies processed
4000  movies processed
4500  movies processed
5000  movies processed
5500  movies processed
6000  movies processed
6500  movies processed
7000  movies processed
7500  movies processed
8000  movies processed
8500  movies processed
9000  movies processed
9500  movies processed
10000  movies processed
10500  movies processed
11000  movies processed
11500  movies processed
12000  movies processed
12500  movies processed
13000  movies processed
13500  movies processed
14000  movies processed
14500  movies processed
15000  movies processed
15500  movies processed
16000  movies processed
16500  movies processed
17000  movies processed
17500  movies processed
18000  movies processed
18500  movies processed
19000  movies processed
19500  movies processed
20000  movies processed
20500  movies processed
21000  movies processed
21500  movie

In [4]:
# Define generic similarity function
def simularity_printer(firstId, secondId):
    if debug:
        start_time = process_time()
    print("Similarity of ", plots[firstId][0], " and ", plots[secondId][0], ":", plots[firstId][1].similarity(plots[secondId][1]))
    if debug:
        end_time = process_time()
        print("Simularity calculation runtime ", end_time - start_time)

# Define generic 'best match' function
def best_match(id):
    if debug:
        start_time = process_time()
    best_score = 0
    best_match = "None"
    for plot in plots:
        if plot == plots[id]:
            continue
        score = plots[id][1].similarity(plot[1])
        if score > best_score:
            best_score = score
            best_match = plot[0]
    print("Best match for ",plots[id][0]," was: ",best_match," with score of ", best_score)
    if debug:
        end_time = process_time()
        print("Time to find best match from ", len(plots), " films: ", end_time - start_time)
        print("Average search time per film: ", (end_time-start_time)/len(plots))

In [5]:
# Print some tests/examples
simularity_printer(10, 250)
simularity_printer(10, 300)
simularity_printer(27, 72)
simularity_printer(27, 207)
simularity_printer(27, 270)

best_match(2000)
best_match(2500)
best_match(3000)

Similarity of  Dream of a Rarebit Fiend  and  Which Woman? : 0.9371970590072015
Simularity calculation runtime  0.0010950000000775617
Similarity of  Dream of a Rarebit Fiend  and  The Turn in the Road : 0.9343358516138016
Simularity calculation runtime  0.0003790000000662985
Similarity of  A Christmas Carol  and  Dough and Dynamite : 0.9311459490912333
Simularity calculation runtime  0.00033600000006117625
Similarity of  A Christmas Carol  and  Face Value : 0.9541526327456455
Simularity calculation runtime  0.0002090000002681336
Similarity of  A Christmas Carol  and  The Girl Who Stayed at Home : 0.9412195067581498
Simularity calculation runtime  0.000315000000227883
Best match for  Top Hat  was:  The Awful Truth  with score of  0.9830819742761848
Time to find best match from  34886  films:  13.953007999999954
Average search time per film:  0.00039996009860688973
Best match for  At the Circus  was:  Gold Diggers in Paris  with score of  0.9813153116025161
Time to find best match from  

In [6]:
best_match(2915)

Best match for  Rangers of Fortune  was:  Lonesome Dove  with score of  0.9851555351836239
Time to find best match from  34886  films:  0.08033199999999852
Average search time per film:  2.30270022358535e-06


In [86]:
def top10_match(id):
    if debug:
        start_time = process_time()
    matches = [['BASE VALUE', 0]] # Schema: [ [title, score], [title, score]... ]
    
    for plot in plots:
        if plot == plots[id]:
            continue

        score = plots[id][1].similarity(plot[1]) # Get Similarity between current movie and match movie

        matches.append([plot[0], score]) # Add movie title and score to end of list
        matches = sorted(matches, key=lambda x: x[1], reverse=True) # Sort list by scores
        if len(matches) > 10: # Remove lowest score if length > 10
            matches.pop()
    
    end_time = process_time()
    # Print Results
    #print("The best matches for", plots[id][0], "are:")
    # i = 1
    # for match in matches:
    #     print(str(i) + ".", match[0], "\t", match[1])
    #     i+=1

    # print("\nTime to find best match from ", len(plots), " films: ", end_time - start_time)
    # print("Average search time per film: ", (end_time-start_time)/len(plots))

    return (matches)

top10_match(3331)

[['Diary of a Mad Black Woman', 0.9897136333215154],
 ['Night World', 0.9886594391290583],
 ['The Young Philadelphians', 0.9881840178885334],
 ['Peyton Place', 0.9879365684518179],
 ['Phone Call from a Stranger', 0.9878915190776927],
 ['Ten North Frederick', 0.9875224759181812],
 ['Devotion', 0.9872186071767584],
 ["St. Elmo's Fire", 0.9871606921712692],
 ['Red-Headed Woman', 0.9871487847600823],
 ['Chained', 0.9871140171446381]]

In [8]:
# Add column to Plots of plot without stopwords and punctuation - plots[id][2]
cleaned_plots = []
counter = 0

for plot in plots:
    if debug:
        counter+=1
        if (counter % 100 == 0):
            print(counter, "movie plots cleaned")
    token_list = []
    for token in plot[1]:
        if not nlp.vocab[token.text].is_stop and not nlp.vocab[token.text].is_punct and not token.text == "\n":
            token_list.append(token.text)

    cleaned_plots.append([plot[0], nlp(' '.join(str(token) for token in token_list))])

print(cleaned_plots[100][1])

100 movie plots cleaned
200 movie plots cleaned
300 movie plots cleaned
400 movie plots cleaned
500 movie plots cleaned
600 movie plots cleaned
700 movie plots cleaned
800 movie plots cleaned
900 movie plots cleaned
1000 movie plots cleaned
1100 movie plots cleaned
1200 movie plots cleaned
1300 movie plots cleaned
1400 movie plots cleaned
1500 movie plots cleaned
1600 movie plots cleaned
1700 movie plots cleaned
1800 movie plots cleaned
1900 movie plots cleaned
2000 movie plots cleaned
2100 movie plots cleaned
2200 movie plots cleaned
2300 movie plots cleaned
2400 movie plots cleaned
2500 movie plots cleaned
2600 movie plots cleaned
2700 movie plots cleaned
2800 movie plots cleaned
2900 movie plots cleaned
3000 movie plots cleaned
3100 movie plots cleaned
3200 movie plots cleaned
3300 movie plots cleaned
3400 movie plots cleaned
3500 movie plots cleaned
3600 movie plots cleaned
3700 movie plots cleaned
3800 movie plots cleaned
3900 movie plots cleaned
4000 movie plots cleaned
4100 movi

In [64]:
def top10_match_cleaned_plots(id):
    if debug:
        start_time = process_time()
    matches = [['BASE VALUE', 0]] # Schema: [ [title, score], [title, score]... ]
    
    for plot in cleaned_plots:
        if plot == cleaned_plots[id]:
            continue

        score = cleaned_plots[id][1].similarity(plot[1]) # Get Similarity between current movie and match movie

        matches.append([plot[0], score]) # Add movie title and score to end of list
        matches = sorted(matches, key=lambda x: x[1], reverse=True) # Sort list by scores
        if len(matches) > 10: # Remove lowest score if length > 10
            matches.pop()
    
    end_time = process_time()
    # Print Results
    # print("The best matches for", cleaned_plots[id][0], "are:")
    # i = 1
    # for match in matches:
        # print(str(i) + ".", match[0], "\t", match[1])
        # i+=1

    # print("\nTime to find best match from ", len(cleaned_plots), " films: ", end_time - start_time)
    # print("Average search time per film: ", (end_time-start_time)/len(cleaned_plots))

    return (matches)

top10_match_cleaned_plots(3331)

[['Diary of a Mad Black Woman', 0.9636018318161498],
 ['The Young Philadelphians', 0.9598892029942427],
 ['Devotion', 0.9592136819588972],
 ['Phone Call from a Stranger', 0.9578264969879424],
 ['The Man in the Net', 0.9569263445181613],
 ['Matchstick Men', 0.9566456674555075],
 ['The Last Time I Saw Paris', 0.9563847880225985],
 ['The Ref', 0.955756353615178],
 [' The Family That Preys', 0.9556927875827887],
 ['Tell It to the Judge', 0.9555039502925088]]

In [10]:

# Take Plots[10] and Plots[100] and remove the stopwords
# TODO: Find more efficient way to remove stopwords
# TODO: Remove stopwords from ALL plots

# Create blank token list
first_token_list = []
# Tokenize plots - add each word to list
for token in plots[10][1]:
    first_token_list.append(token.text)

# Create a new string combining all the tokens (words) that are not stopwords
first_filtered_plot_str = ' '.join([str(token) for token in first_token_list if not nlp.vocab[token].is_stop])

# Repeat stopword removal for second plot
second_token_list = []
for token in plots[100][1]:
    second_token_list.append(token.text)      

second_filtered_plot_str = ' '.join([str(token) for token in second_token_list if not nlp.vocab[token].is_stop])

# Print standard simularity with stopwords
simularity_printer(10, 100)

# Print Filtered Stopword similarity to test stopword removal impact
print("Filtered Stopwords: ", nlp(first_filtered_plot_str).similarity(nlp(second_filtered_plot_str)))



Similarity of  Dream of a Rarebit Fiend  and  Should a Woman Divorce? : 0.876038181411398
Simularity calculation runtime  0.00032900000041991007
Filtered Stopwords:  0.7202337177407835


In [11]:
# Experiments with bag of words for keyword group mining
from bitarray import bitarray
from apyori import apriori


lexicon = {}
for plot in plots:
    for token in plot[1]:
        if not nlp.vocab[token.text].is_stop and not nlp.vocab[token.text].is_punct and not token.text == "\n":
            if token.lemma_ not in lexicon.keys():
                lexicon[token.lemma_] = 1
            else:
                lexicon[token.lemma_] += 1

print(list(lexicon)[0:25])
print(len(lexicon))

plot_word_bags = []
plot_word_lists = []

for plot in plots:
    plot_words = set()
    plot_words_list = []
    for token in plot[1]:
        if not nlp.vocab[token.text].is_stop and not nlp.vocab[token.text].is_punct:
            plot_words.add(token.lemma_)
    word_bag = bitarray()
    for word in lexicon:
        if word in plot_words:
            word_bag.append(1)
            plot_words_list.append(word)
        else:
            word_bag.append(0)
    plot_word_bags.append(word_bag)
    plot_word_lists.append(plot_words_list)

plot_num = 400
#print(plots[plot_num][0])
#print(plot_word_bags[plot_num])
#for i in range(0, len(lexicon)):
#    if plot_word_bags[plot_num][i]:
#        print(list(lexicon.keys())[i])

#Ignore compact bit vectors for now, see what happens with an out of the box apriori algorithm
results = list(apriori(plot_word_lists, min_support = .1))
print("Apriori Test")
print("Total results: ", len(results))
for result in results:
    print(result.items)

['bartender', 'work', 'saloon', 'serve', 'drink', 'customer', 'fill', 'stereotypically', 'irish', 'man', 'bucket', 'beer', 'Carrie', 'Nation', 'follower', 'burst', 'inside', 'assault', 'pull', 'hat', 'eye', 'dump', 'head', 'group', 'begin']
145663
Apriori Test
Total results:  1791
frozenset({'accept'})
frozenset({'agree'})
frozenset({'allow'})
frozenset({'appear'})
frozenset({'arrest'})
frozenset({'arrive'})
frozenset({'ask'})
frozenset({'attack'})
frozenset({'attempt'})
frozenset({'away'})
frozenset({'begin'})
frozenset({'believe'})
frozenset({'body'})
frozenset({'boy'})
frozenset({'break'})
frozenset({'bring'})
frozenset({'brother'})
frozenset({'call'})
frozenset({'car'})
frozenset({'catch'})
frozenset({'cause'})
frozenset({'change'})
frozenset({'child'})
frozenset({'claim'})
frozenset({'close'})
frozenset({'come'})
frozenset({'confront'})
frozenset({'continue'})
frozenset({'convince'})
frozenset({'daughter'})
frozenset({'day'})
frozenset({'dead'})
frozenset({'death'})
frozenset({'de

In [12]:
def supportSorter(e):
  return e.support

results.sort( key=supportSorter)

for result in results:
  print(result.items, result.support)

frozenset({'discover', 'friend'}) 0.10001146591756005
frozenset({'family', 'find', 'leave'}) 0.10004013071146019
frozenset({'watch'}) 0.10006879550536031
frozenset({'escape', 'come'}) 0.10006879550536031
frozenset({'order', 'man'}) 0.10006879550536031
frozenset({'send', 'try'}) 0.10006879550536031
frozenset({'kill', 'go', 'try'}) 0.10006879550536031
frozenset({'shoot', 'man'}) 0.10009746029926045
frozenset({'kill', 'find', 'see'}) 0.10009746029926045
frozenset({'meet', 'wife'}) 0.10012612509316059
frozenset({'year', 'fall'}) 0.10015478988706071
frozenset({'tell', 'go', 'find', 'return'}) 0.10015478988706071
frozenset({'reveal', 'escape'}) 0.10018345468096085
frozenset({'turn', 'day'}) 0.10021211947486097
frozenset({'find', 'night', 'take'}) 0.10021211947486097
frozenset({'find', 'want', 'take'}) 0.10021211947486097
frozenset({'tell', 'know', 'take'}) 0.10021211947486097
frozenset({'plan', 'find', 'leave'}) 0.10024078426876111
frozenset({'tell', 'soon'}) 0.10026944906266123
frozenset({'

In [14]:
from collections import defaultdict
import csv

rated_movies = set()
rated_movies_dict = {}
#movie_ratings = set()
movie_viewers = defaultdict(lambda: list())
viewer_movies = defaultdict(lambda: list())


with open("movies_metadata.csv", 'r', encoding='utf-8') as movie_meta_data:
    movie_reader = csv.reader(movie_meta_data)
    next(movie_reader, None)  # Skip the csv header row

    for row in movie_reader:
        #print(row[5])
        #print(row[8])
        rated_movies.add((row[5],row[8]))
        rated_movies_dict[row[8]]=row[5]

#userId,movieId,rating,timestamp
with open("ratings.csv", 'r', encoding='utf-8') as movie_ratings_data:
    ratings_reader = csv.reader(movie_ratings_data)
    next(ratings_reader, None)
    count = 0

    for row in ratings_reader:
        count = count + 1
        if(count % 1000000 == 0):
            print(count, " ratings processed")
       # movie_ratings.add((row[0],row[1],row[2]))
        movie_viewers[row[1]].append((row[0],row[1],float(row[2])))
       # viewer_movies[row[0]].add((row[0],row[1],row[2]))
    print("Finished processing all ", count, " ratings")



1000000  ratings processed
2000000  ratings processed
3000000  ratings processed
4000000  ratings processed
5000000  ratings processed
6000000  ratings processed
7000000  ratings processed
8000000  ratings processed
9000000  ratings processed
10000000  ratings processed
11000000  ratings processed
12000000  ratings processed
13000000  ratings processed
14000000  ratings processed
15000000  ratings processed
16000000  ratings processed
17000000  ratings processed
18000000  ratings processed
19000000  ratings processed
20000000  ratings processed
21000000  ratings processed
22000000  ratings processed
23000000  ratings processed
24000000  ratings processed
25000000  ratings processed
26000000  ratings processed
27000000  ratings processed
Finished processing all  27753444  ratings


In [56]:
#Given a movie return the count of both how many people who liked the first movie liked the second movie
# and how many people who liked the first movie didn't like the second movie
def rating_comparison(base_movie_name, comparison_movie_name):
    try: 
        base_movie_id = rated_movies_dict[base_movie_name]
    except:
        #print("Base Movie Not Found in Ratings Dataset")
        return None
    
    try: 
        comparison_movie_id=rated_movies_dict[comparison_movie_name]
    except: 
        #print("Comparison Movie Not Found in Ratings Dataset")
        return None

    # print(base_movie_id)
    # print(comparison_movie_id)
    good_match = 0
    bad_match = 0
    no_opinion = 0 # number of viewers that liked the first movie, but have no rating on the second
    for base_viewer in movie_viewers[base_movie_id]:
        if(base_viewer[2] >= 4):
            for comparison_viewer in movie_viewers[comparison_movie_id]:
                if(comparison_viewer[0] == base_viewer[0]):
                    if(comparison_viewer[2] >= 4):
                        good_match = good_match + 1
                    elif(comparison_viewer[2] <= 3):
                        bad_match = bad_match +1
                    break
                else:
                    no_opinion += 1

    #print("Good Matches: ", good_match)
    #print("Bad Matches: ", bad_match)
    #print("No Opinions:", no_opinion)
    return(good_match, bad_match, no_opinion)


In [38]:
rating_comparison("The Matrix", "The Fifth Element")

Good Matches:  9
Bad Matches:  22
No Opinions: 951601


(9, 22, 951601)

In [70]:
movie_id = 17376

matches = top10_match_cleaned_plots(movie_id)

for match in matches:
    rating_comparison(plots[movie_id][0], match[0])

matches = top10_match(movie_id)

for match in matches:
    rating_comparison(plots[movie_id][0], match[0])


The best matches for Phantom Thread are:
1. The Barrets of Wimpole Street 	 0.9930038355474538
2. Morocco 	 0.9927843011450455
3. Trishna 	 0.9921982354638577
4. Quality Street 	 0.9920731727896657
5. A Streetcar Named Desire 	 0.9920014917844893
6. The Gilded Lily 	 0.9918777015393213
7. Far from the Madding Crowd 	 0.991824801511902
8. Far from the Madding Crowd 	 0.991824801511902
9. The Temptress 	 0.9917836113581286
10. Some Girls 	 0.991732340732002

Time to find best match from  34886  films:  0.10312900000008085
Average search time per film:  2.956171530129016e-06


In [88]:
# Search through all American movies, find top 10 matches, and try to find any matches that show up in the ratings comparison
# American movies stop at plots id 17376

movie_count = 17376

top_10_cleaned_review_results = []

for num in range(15001, movie_count):
    matches = top10_match_cleaned_plots(num)
    for match in matches:
        comps = rating_comparison(plots[num][0], match[0])
        if (comps != None):
            if (comps[0] + comps[1] + comps[2] > 0):
                # print(plots[num][0], "-", match[0], ":", comps[0], comps[1], comps[2])
                top_10_cleaned_review_results.append([plots[num][0], match[0], comps[0], comps[1], comps[2]])
    if (num % 100 == 0): print("--", num, "completed.")

-- 15100 completed.
-- 15200 completed.
-- 15300 completed.
-- 15400 completed.
-- 15500 completed.
-- 15600 completed.
-- 15700 completed.
-- 15800 completed.
-- 15900 completed.
-- 16000 completed.
-- 16100 completed.
-- 16200 completed.
-- 16300 completed.
-- 16400 completed.
-- 16500 completed.
-- 16600 completed.
-- 16700 completed.
-- 16800 completed.
-- 16900 completed.
-- 17000 completed.
-- 17100 completed.
-- 17200 completed.
-- 17300 completed.


In [89]:
good_count = 0
bad_count = 0
neutral_count = 0

for result in top_10_cleaned_review_results:
    if (result[2] > result[3]): good_count += 1
    elif (result[2] < result[3]): bad_count += 1
    else: neutral_count += 1

print("Good Matches:", good_count)
print("Bad Matches:", bad_count)
print("Neutral Matches:", neutral_count)

Good Matches: 188
Bad Matches: 141
Neutral Matches: 324


In [90]:
top_10_review_results = []

for num in range(15001, movie_count):
    matches = top10_match(num)
    for match in matches:
        comps = rating_comparison(plots[num][0], match[0])
        if (comps != None):
            if (comps[0] + comps[1] + comps[2] > 0):
                # print(plots[num][0], "-", match[0], ":", comps[0], comps[1], comps[2])
                top_10_review_results.append([plots[num][0], match[0], comps[0], comps[1], comps[2]])
    if (num % 100 == 0): print("--", num, "completed.")

-- 15100 completed.
-- 15200 completed.
-- 15300 completed.
-- 15400 completed.
-- 15500 completed.
-- 15600 completed.
-- 15700 completed.
-- 15800 completed.
-- 15900 completed.
-- 16000 completed.
-- 16100 completed.
-- 16200 completed.
-- 16300 completed.
-- 16400 completed.
-- 16500 completed.
-- 16600 completed.
-- 16700 completed.
-- 16800 completed.
-- 16900 completed.
-- 17000 completed.
-- 17100 completed.
-- 17200 completed.
-- 17300 completed.


In [92]:
len(top_10_review_results)

good_count = 0
bad_count = 0
neutral_count = 0

for result in top_10_review_results:
    if (result[2] > result[3]): good_count += 1
    elif (result[2] < result[3]): bad_count += 1
    else: neutral_count += 1

print("Good Matches:", good_count)
print("Bad Matches:", bad_count)
print("Neutral Matches:", neutral_count)

Good Matches: 190
Bad Matches: 140
Neutral Matches: 347
