In [None]:
# Import Necessary Libraries and Packages
import sys  
import csv
import spacy
from time import process_time

# Load the spaCy pipeline once; every cell below parses text through this shared `nlp` object.
nlp = spacy.load("en_core_web_lg")

# Set global variables
# `debug` gates the timing and progress output in the cells and functions below.
debug = False

In [None]:
# Switch the timing/progress output on; skip this cell to keep the default of False.
debug = True

In [None]:

# Parse every movie plot in the CSV with spaCy and collect (title, Doc) tuples in `plots`.
# NOTE(review): each plot is parsed with a separate nlp(...) call; batching via nlp.pipe
# may be faster - confirm before changing, since this is the slowest cell in the notebook.
if debug:
    start_time = process_time()

# Set the appropriate input file
# movie_plots_import_file = "wiki_movie_plots_sample.csv" # 3248 Movies
movie_plots_import_file = "wiki_movie_plots_deduped.csv" #34,892 Movies

# Positions of the fields read from each CSV row.
TITLE_COLUMN = 1
PLOT_COLUMN = 7

# Setup the plots array: one (title, parsed plot Doc) tuple per movie
plots = []

# Initialised up front so the summary below still works when the file has no data rows.
count = 0

# Open import file and export all plots and titles into plots[]
with open(movie_plots_import_file, 'r', encoding='utf-8') as movie_plot_csv_file:
    reader = csv.reader(movie_plot_csv_file)
    next(reader, None)  # Skip the csv header row

    for count, row in enumerate(reader, start=1):
        plots.append((row[TITLE_COLUMN], nlp(row[PLOT_COLUMN])))
        if count % 500 == 0:
            print(count, " movies processed")

print("Processing Done")

if debug:
    end_time = process_time()
    print("Time to process ", count, " records: ", end_time - start_time)
    # Guard the average: an empty input file would otherwise divide by zero.
    if count:
        print("Average processing timer per record: ", (end_time - start_time) / count)

# Print example when finished (only if that many plots were actually loaded)
if len(plots) > 100:
    print(plots[100])
else:
    print("Only", len(plots), "plots loaded; no example at index 100")


In [None]:
# Pairwise similarity helper
def simularity_printer(firstId, secondId):
    """Print the similarity score between two entries of the global ``plots`` list.

    Args:
        firstId: Index into ``plots`` of the first movie.
        secondId: Index into ``plots`` of the second movie.

    Each ``plots`` entry is read as (title, parsed plot); the score comes from
    calling ``similarity`` on the first parsed plot with the second. When the
    global ``debug`` flag is set, the CPU time spent is printed as well.
    """
    timer_start = process_time() if debug else None

    first_entry = plots[firstId]
    second_entry = plots[secondId]
    print(
        "Similarity of ", first_entry[0], " and ", second_entry[0], ":",
        first_entry[1].similarity(second_entry[1]),
    )

    if debug:
        print("Simularity calculation runtime ", process_time() - timer_start)

# Single best match helper
def best_match(id):
    """Print the title in ``plots`` whose plot is most similar to ``plots[id]``.

    Args:
        id: Index into the global ``plots`` list of the reference movie.

    Entries equal to the reference entry are skipped. Only scores strictly
    greater than 0 can win; if none does, the reported match is the string
    "None" with a score of 0. When the global ``debug`` flag is set, total and
    per-film CPU time are printed too.
    """
    timer_start = process_time() if debug else None

    target = plots[id]
    top_score, top_title = 0, "None"
    for candidate in plots:
        if candidate == target:
            continue
        candidate_score = target[1].similarity(candidate[1])
        # Strict comparison: on ties the earliest candidate is kept.
        if candidate_score > top_score:
            top_score, top_title = candidate_score, candidate[0]

    print("Best match for ",target[0]," was: ",top_title," with score of ", top_score)

    if debug:
        elapsed = process_time() - timer_start
        film_count = len(plots)
        print("Time to find best match from ", film_count, " films: ", elapsed)
        print("Average search time per film: ", elapsed/film_count)

In [None]:
# Smoke-test the helpers on a handful of hand-picked plot indices
for first_id, second_id in [(10, 250), (10, 300), (27, 72), (27, 207), (27, 270)]:
    simularity_printer(first_id, second_id)

for film_id in (2000, 2500, 3000):
    best_match(film_id)

In [None]:
# Extra spot check: best match for the plot stored at index 2915
best_match(2915)

In [None]:
def top10_match(id, top_n=10):
    """Print the entries of ``plots`` most similar to ``plots[id]``.

    Args:
        id: Index into the global ``plots`` list of the reference movie.
        top_n: Maximum number of matches to print. Defaults to 10.

    Prints a ranked list of titles and similarity scores, highest score first;
    equal scores keep their order of appearance in ``plots``. When the global
    ``debug`` flag is set, total and per-film CPU time are printed as well.

    Raises:
        IndexError: If ``id`` is not a valid index into ``plots``.
    """
    if debug:
        start_time = process_time()

    target = plots[id]

    # Schema: [ [title, score], [title, score]... ]
    matches = []
    for plot in plots:
        if plot == target:
            continue  # skip the reference entry itself (and anything equal to it)
        matches.append([plot[0], target[1].similarity(plot[1])])

    # Sort once instead of after every append; only real movies are ranked,
    # so no placeholder row can show up in the output.
    matches = sorted(matches, key=lambda match: match[1], reverse=True)[:max(top_n, 0)]

    if debug:
        end_time = process_time()

    # Print Results
    print("The best matches for", target[0], "are:")
    for rank, match in enumerate(matches, start=1):
        print(str(rank) + ".", match[0], "\t", match[1])

    # Timing is only available when it was started above.
    if debug:
        print("\nTime to find best match from ", len(plots), " films: ", end_time - start_time)
        print("Average search time per film: ", (end_time-start_time)/len(plots))

top10_match(3331)

In [None]:
# Build `cleaned_plots`: one [title, Doc] entry per movie in `plots`, where the Doc is
# the plot re-parsed after dropping stop words, punctuation and bare "\n" tokens.
cleaned_plots = []
counter = 0

for entry in plots:
    if debug:
        counter += 1
        if counter % 100 == 0:
            print(counter, "movie plots cleaned")

    kept_words = []
    for tok in entry[1]:
        lexeme = nlp.vocab[tok.text]
        if lexeme.is_stop or lexeme.is_punct or tok.text == "\n":
            continue
        kept_words.append(tok.text)

    # Re-join the surviving words and parse them again so similarity can be called on the result.
    cleaned_plots.append([entry[0], nlp(' '.join(kept_words))])

print(cleaned_plots[100][1])

In [None]:
def top10_match_cleaned_plots(id, top_n=10):
    """Print the entries of ``cleaned_plots`` most similar to ``cleaned_plots[id]``.

    Args:
        id: Index into the global ``cleaned_plots`` list of the reference movie.
        top_n: Maximum number of matches to print. Defaults to 10.

    Prints a ranked list of titles and similarity scores, highest score first;
    equal scores keep their order of appearance in ``cleaned_plots``. When the
    global ``debug`` flag is set, total and per-film CPU time are printed too.

    Raises:
        IndexError: If ``id`` is not a valid index into ``cleaned_plots``.
    """
    if debug:
        start_time = process_time()

    target = cleaned_plots[id]

    # Schema: [ [title, score], [title, score]... ]
    matches = []
    for plot in cleaned_plots:
        if plot == target:
            continue  # skip the reference entry itself (and anything equal to it)
        matches.append([plot[0], target[1].similarity(plot[1])])

    # Sort once instead of after every append; only real movies are ranked,
    # so no placeholder row can show up in the output.
    matches = sorted(matches, key=lambda match: match[1], reverse=True)[:max(top_n, 0)]

    if debug:
        end_time = process_time()

    # Print Results
    print("The best matches for", target[0], "are:")
    for rank, match in enumerate(matches, start=1):
        print(str(rank) + ".", match[0], "\t", match[1])

    # Timing is only available when it was started above.
    if debug:
        print("\nTime to find best match from ", len(cleaned_plots), " films: ", end_time - start_time)
        print("Average search time per film: ", (end_time-start_time)/len(cleaned_plots))

top10_match_cleaned_plots(3331)

In [None]:

# Compare plots[10] and plots[100] with and without stop words
# TODO: Find more efficient way to remove stopwords
# TODO: Remove stopwords from ALL plots

# Token texts of the first plot, then the same words with stop words dropped
first_token_list = [tok.text for tok in plots[10][1]]
first_filtered_plot_str = ' '.join(
    word for word in first_token_list if not nlp.vocab[word].is_stop
)

# Same treatment for the second plot
second_token_list = [tok.text for tok in plots[100][1]]
second_filtered_plot_str = ' '.join(
    word for word in second_token_list if not nlp.vocab[word].is_stop
)

# Baseline: similarity of the two plots as originally parsed (stop words included)
simularity_printer(10, 100)

# Similarity after stop-word removal, to see how much the filtering changes the score
first_filtered_doc = nlp(first_filtered_plot_str)
second_filtered_doc = nlp(second_filtered_plot_str)
print("Filtered Stopwords: ", first_filtered_doc.similarity(second_filtered_doc))



In [None]:
# Experiments with bag of words for keyword group mining
from bitarray import bitarray
from apyori import apriori


# Lexicon: lemma -> number of occurrences across all plots, skipping stop words,
# punctuation and bare "\n" tokens. Insertion order fixes the bit positions used below.
lexicon = {}
for plot in plots:
    for token in plot[1]:
        lexeme = nlp.vocab[token.text]
        if lexeme.is_stop or lexeme.is_punct or token.text == "\n":
            continue
        lexicon[token.lemma_] = lexicon.get(token.lemma_, 0) + 1

print(list(lexicon)[0:25])
print(len(lexicon))

# Per plot: a bit vector over the lexicon plus the list of lexicon words it contains.
plot_word_bags = []
plot_word_lists = []

for plot in plots:
    plot_words = {
        token.lemma_
        for token in plot[1]
        if not (nlp.vocab[token.text].is_stop or nlp.vocab[token.text].is_punct)
    }

    word_bag = bitarray()
    plot_words_list = []
    for word in lexicon:
        present = word in plot_words
        word_bag.append(1 if present else 0)
        if present:
            plot_words_list.append(word)

    plot_word_bags.append(word_bag)
    plot_word_lists.append(plot_words_list)

plot_num = 400
# Optional inspection of a single plot's word bag (left disabled):
#print(plots[plot_num][0])
#print(plot_word_bags[plot_num])
#for i in range(0, len(lexicon)):
#    if plot_word_bags[plot_num][i]:
#        print(list(lexicon.keys())[i])

# The compact bit vectors are not used yet; run apyori's apriori on the plain word lists
results = list(apriori(plot_word_lists, min_support = .1))
print("Apriori Test")
print("Total results: ", len(results))
for itemset_record in results:
    print(itemset_record.items)

In [None]:
def supportSorter(e):
    """Sort key: return the ``support`` attribute of ``e``."""
    support_value = e.support
    return support_value

# Sort the apriori results in place by support, lowest first.
results.sort( key=supportSorter)

# Print each result's items together with its support value.
for result in results:
  print(result.items, result.support)

In [None]:
from collections import defaultdict
import csv

# Containers filled from the metadata and ratings files below
rated_movies = set()                 # (movie id, title) pairs
rated_movies_dict = {}               # title -> movie id
#movie_ratings = set()
movie_viewers = defaultdict(list)    # movie id -> [(user id, movie id, rating), ...]
viewer_movies = defaultdict(list)    # declared but not filled in this cell


# Movie metadata: column 5 is stored as the id and column 8 as the title
with open("movies_metadata.csv", 'r', encoding='utf-8') as movie_meta_data:
    movie_reader = csv.reader(movie_meta_data)
    next(movie_reader, None)  # Skip the csv header row

    for row in movie_reader:
        #print(row[5])
        #print(row[8])
        movie_id, movie_title = row[5], row[8]
        rated_movies.add((movie_id, movie_title))
        # A later row with the same title replaces the earlier id.
        rated_movies_dict[movie_title] = movie_id

# Ratings file columns: userId,movieId,rating,timestamp
with open("ratings.csv", 'r', encoding='utf-8') as movie_ratings_data:
    ratings_reader = csv.reader(movie_ratings_data)
    next(ratings_reader, None)  # Skip the csv header row
    count = 0

    for count, row in enumerate(ratings_reader, start=1):
        if count % 1000000 == 0:
            print(count, " ratings processed")
        # movie_ratings.add((row[0],row[1],row[2]))
        movie_viewers[row[1]].append((row[0], row[1], float(row[2])))
        # viewer_movies[row[0]].add((row[0],row[1],row[2]))
    print("Finished processing all ", count, " ratings")



In [None]:
# Given a movie, count how many people who liked it also liked a second movie,
# and how many people who liked it did not like the second movie.
def rating_comparison(base_movie_name, comparison_movie_name, like_threshold=4, dislike_threshold=3):
    """Compare how fans of one movie rated another movie.

    Args:
        base_movie_name: Title of the base movie, a key of ``rated_movies_dict``.
        comparison_movie_name: Title of the movie to compare against.
        like_threshold: Minimum rating that counts as "liked". Defaults to 4.
        dislike_threshold: Maximum rating that counts as "did not like".
            Defaults to 3. Ratings strictly between the two thresholds are
            counted in neither bucket.

    Returns:
        Tuple ``(good_match, bad_match)``: among viewers who liked the base
        movie and also rated the comparison movie, how many liked it and how
        many did not.

    Raises:
        KeyError: If either title is missing from ``rated_movies_dict``.
    """
    base_movie_id = rated_movies_dict[base_movie_name]
    comparison_movie_id = rated_movies_dict[comparison_movie_name]
    print(base_movie_id)
    print(comparison_movie_id)

    # Index the comparison movie's ratings by user once, instead of rescanning the
    # whole list for every base viewer. setdefault keeps the first rating per user,
    # matching a first-match-wins scan. .get() avoids inserting empty entries into
    # the movie_viewers defaultdict for movies that have no ratings.
    comparison_rating_by_user = {}
    for comparison_viewer in movie_viewers.get(comparison_movie_id, ()):
        comparison_rating_by_user.setdefault(comparison_viewer[0], comparison_viewer[2])

    good_match = 0
    bad_match = 0
    for base_viewer in movie_viewers.get(base_movie_id, ()):
        if base_viewer[2] >= like_threshold:
            # Every rating counts: a user who rated the base movie more than once
            # is tallied once per qualifying rating.
            if base_viewer[0] in comparison_rating_by_user:
                comparison_rating = comparison_rating_by_user[base_viewer[0]]
                if comparison_rating >= like_threshold:
                    good_match = good_match + 1
                elif comparison_rating <= dislike_threshold:
                    bad_match = bad_match + 1

    print("Good Matches: ", good_match)
    print("Bad Matches: ", bad_match)
    return (good_match, bad_match)


In [None]:
# Example: how did viewers who liked "The Matrix" rate "The Fifth Element"?
rating_comparison("The Matrix", "The Fifth Element")