In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher

In [176]:
#This function uses regular expressions to do three things:
#    1. replace URLs starting with https:// (like the links posted in many suggestions) with a space
#    2. replace non-necessary charcaters with spaces. These are more often used for post formatting than titles, but can get in the way of detecting titles
#    3. remove apostrohpes - do not replace with space. People are lazy with apostrophes and often forget them, or put them in the wrong places.

def simplify_text(s):
    return re.sub('[\'\\u2019\\u0027]', '', re.sub('[^a-zA-Z0-9:;.,&$!?\'\\u2019]', ' ', re.sub('https://[=?\-_./\w]+', ' ', s)))

In [273]:
#This function takes a list of strings and makes them one string separated by '%' character
#This is used to apply() to the suggested title ids column for easy splitting in other notebooks

def list_to_string(s_list):
    s_str = ''

    for text in s_list:
        s_str += text + "%"
        
    if len(s_str) == 0:
        return np.nan
    
    return s_str[:-1]

In [175]:
re.sub('https://[=?\-_./\w]+', ' ', 'i like this SITE: https://google.com/s?v=7812349-name-you/.html it is a good site')

'i like this SITE:   it is a good site'

In [103]:
def get_matches(doc, debug = False):
    m_list = []
    to_remove = []
    
    matches = matcher(doc)
    
    if debug:
        print(doc.text)

    for i, match in enumerate(matches):
        m_start = match[1]
        m_end = match[2]
        m_list.append((m_start, m_end))
        
        if debug:
            print(match)
            print(doc[m_start : m_end], end = ' ')
            
        #This code block removes false positives of movie titles that appear within other movie titles
        #For example, "The Mask" would appear in "Son of The Mask" but the latter would be the true recommendation

        if(len(m_list) > 1):
            if m_list[-1][0] >= m_list[-2][0] and m_list[-1][0] <= m_list[-2][1]:
                if abs(m_list[-1][1] - m_list[-1][0]) > abs(m_list[-2][1]-m_list[-2][0]):
                    if debug:
                        print("REMOVE ABOVE", end ='')
                    to_remove.append(i-1)
                else:
                    if debug:
                        print("REMOVE THIS", end = '')
                    to_remove.append(i)

        if debug:
            print('')

    for i in to_remove[::-1]:
        matches.pop(i)

    return matches

---

In [86]:
nlp = spacy.load('en_core_web_lg')
nlp.remove_pipe('ner')

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x26388aff6a8>)

In [239]:
movies = pd.read_csv('./movies_data/movies.csv').drop(columns = ['Unnamed: 0'])
requests = pd.read_csv('./data/moviesuggestions_data.csv')
comments = pd.read_csv('./movies_data/comments_data.csv')

In [178]:
titles_list = []
titles_lower = []
ids_list = []

for i, titles in enumerate(list(movies['titles'])):
    split_titles = titles.split('%')
    titles_list += split_titles
    
    for _ in range(len(split_titles)):
        ids_list.append(movies['id'][i])
    
for title in titles_list:
    titles_lower.append(simplify_text(title).lower())

For now, the system will return only the most recent movie when multiple movies share the same name. Further down the road, one goal may be to identify different movies with the same name (when possible) such as '101 Dalmations' 1961 or 1996. When comments include the year this will be possible, though it may not be easy. However, I will prepare the dictionaries with title years now for use later.

In [179]:
# https://www.geeksforgeeks.org/python-convert-two-lists-into-a-dictionary/
name_dict = {titles_lower[i] : ids_list[i] for i in range(len(titles_lower))}
name_dict_with_dups = {}

for i in range(len(titles_lower)-1, -1, -1):
    # https://stackoverflow.com/a/18552025
    if titles_lower[i] in list(name_dict_with_dups):
        name_dict_with_dups[titles_lower[i] + ' ' + str(movies[movies['id'] == ids_list[i]]['year'].values[0])] = ids_list[i]
    else:
        name_dict_with_dups[titles_lower[i]] = ids_list[i]
        

name_dict_dups_only = {i : name_dict_with_dups[i] for i in list(name_dict_with_dups) if i not in list(name_dict)}

In [180]:
patterns = [nlp.make_doc(text) for text in titles_lower]

matcher = PhraseMatcher(nlp.vocab)
matcher.add("MovieTitles", None, *patterns)

In [181]:
#Only get top-level comments: those comments whose "parent" is the submission, not another comment
comments = comments[comments['parent_id'] == comments['link_id']]
comments.reset_index(inplace=True)

In [182]:
comments_simple = comments.copy()
comments_simple['body'] = comments_simple['body'].apply(simplify_text)

---

In [183]:
#MOVIES TO REMOVE:
#It, Them, Fun, Romance

In [184]:
#Comment 558 can't match "howl's moving castle"

In [185]:
# There is a bot that compiles reccomendations. That could be very useful for me! But for now, I need to ignore it. Later, I can compare what movies I found to what the bot found.
# author = 'cinephilebot'
flag = '   Replying after taking permission from mod'

ignore_list = []

for i, comment in enumerate(comments_simple['body']):
    if flag == comment[:len(flag)]:
        ignore_list.append(i)

matched_title = com1[matches[0][1] : matches[0][2]]

movies[movies.id == name_dict[matched_title.text]]

In [240]:
# A little more cleaning to get ready to make the new dataframe: requests + suggestions

comments_simple = comments_simple[comments_simple['is_submitter'] == False]

comments_simple['link_id'] = comments_simple['link_id'].str[3:]

comments_simple = comments_simple[['author', 'score', 'body', 'link_id', 'is_submitter']]

requests = requests[['id', 'created_utc', 'title', 'selftext']]

In [187]:
suggestions = pd.DataFrame(columns = ['title', 'selftext', 'movie_ids'])
suggestions

Unnamed: 0,title,selftext,movie_ids


In [211]:
#Create a dictionary with keys = request posts, and values = list of suggestions by commenters

#POSSIBLE CHANGES:
#   1. make a list of tuples instead of strings, each tuple containing a suggested title_ID and post score - to find quality suggestions
#   2. narrow down the comments by only taking the top X, or using a higher value on the score filter

sugg_dict = {link_id : [] for link_id in requests['id']}

for i in comments_simple.index:
    row = comments_simple[comments_simple.index == i]
    if (row['score'].values[0] <= 0 ) or (i in ignore_list):
        continue
    
    body = nlp(row['body'].values[0].lower())
    link_id = row['link_id'].values[0]
    
    matches = get_matches(body)
    
    for match in matches:
        matched_title = body[match[1] : match[2]]
        try:
            sugg_dict[link_id].append(name_dict[matched_title.text])
        except:
            #print(f"Failed match: {matched_title.text}, comment index {i}")
            pass

In [215]:
# Examining the results:

count = 0
zero_count = 0
sugg_sum = 0
uniq_sum = 0
max_sugg = 0

for i in sugg_dict:
    count += 1
    sugg_list = sugg_dict[i]
    list_len = len(sugg_list)
    uniq_sum += len(set(sugg_list))
    sugg_sum += list_len
    if list_len == 0:
        zero_count += 1
    if list_len > max_sugg:
        max_sugg = list_len
    
print(f"There are an average of {round(sugg_sum/count, 1)} suggestions per request")
print(f"There are an average of {round(uniq_sum/count, 1)} unique suggestions per request")
print(f"Out of {count} requests, {zero_count} ({np.round(100*zero_count/count,1)}%) had 0 suggestions (that were found).")
print(f"The request with the most suggestions had {max_sugg} suggestions.")

There are an average of 27.9 suggestions per request
There are an average of 22.3 unique suggestions per request
Out of 310 requests, 35 (11.3%) had 0 suggestions (that were found).
The request with the most suggestions had 354 suggestions.


In [221]:
# cast the list to a set, and back to a list, to remove duplicate suggestions

idx_list = []
sugg_list_list = []

for i in sugg_dict:
    sugg_dict[i] = list(set(sugg_dict[i]))
    idx_list.append(i)
    sugg_list_list.append(sugg_dict[i])

In [155]:
name_dict['the good, the bad and the ugly']

'tt0060196'

In [205]:
post_id = requests[55:56]['id'].values[0]
comments_simple[comments_simple['link_id'] == post_id]

Unnamed: 0,author,score,body,link_id,is_submitter
340,cinephile_bot,1,Replying after taking permission from mod ...,iw5yrd,False
423,whattheheckisdecaf,1,Dread 2009,iw5yrd,False
479,Randytheadventurer,1,"Dark &amp; Messed up; Split, Feast Cr...",iw5yrd,False
497,visibly_hangry,1,Dear Zachary,iw5yrd,False
508,mohantharani,1,The killing of a sacred deer.,iw5yrd,False
509,Bruhm0ment084,4,Se7en and Prisoners are two good choices.,iw5yrd,False
524,steelflexjones,2,Antichrist?,iw5yrd,False
525,StephenKeen,2,Annihilation,iw5yrd,False
527,R3dn3kH1ppy,2,Have you watched requiem for a dream,iw5yrd,False


In [271]:
sugg_df = pd.DataFrame(columns= ['id', 'suggestions'])
sugg_df['id'] = idx_list
sugg_df['suggestions'] = sugg_list_list

requests_with_suggestions = requests.merge(right = sugg_df, left_on = 'id', right_on = 'id')

In [272]:
requests_with_suggestions

Unnamed: 0,id,created_utc,title,selftext,suggestions
0,ixcr5j,2020-09-21 19:17:15,Movies from the '70s.,I'm in search of movies from the '70s. Non-En...,"[tt0120655, tt0069467, tt0069704, tt0450345, t..."
1,ixbv9u,2020-09-21 18:24:28,Style similar to Cinemania or Grey Gardens,I really liked how both of these documentaries...,[]
2,ixbsb4,2020-09-21 18:19:43,Movie about people that suffer from avoidant p...,\nI dont know if theres any but basically Im ...,[]
3,ixbj02,2020-09-21 18:04:06,Looking for movies with deep conversations in ...,Like the beginning of Predestination for insta...,[]
4,ixamkm,2020-09-21 17:12:36,Looking For Time Loop Movies. Any Suggestions?,"Saw Happy Death Day 2 U, Blood Punch, Before I...",[]
...,...,...,...,...,...
305,iroswy,2020-09-12 20:05:26,Futuristic Sci-Fi movies,I’m really into movies that take place in the ...,"[tt0120201, tt0238380, tt0083658, tt0816692, t..."
306,iro0jk,2020-09-12 19:16:51,Movies about suicide; and/or a deep hallowing ...,I’m looking for a movie in which the main char...,"[tt1571249, tt5247022, tt0282698, tt0363589, t..."
307,irnvy8,2020-09-12 19:09:27,Cyberpunk/Synthwave/Thriller movies that are n...,"Well I really liked Upgrade/Blade Runner 2049,...","[tt1343727, tt1386703, tt3672742, tt0800369, t..."
308,irnlz8,2020-09-12 18:53:07,"Hi, I'm looking for hard sci fi movies",Anything that has to do with space and first e...,"[tt0094737, tt0075860, tt0436364, tt0183523, t..."


In [274]:
requests_with_suggestions['suggestions'] = requests_with_suggestions['suggestions'].apply(list_to_string)

In [275]:
requests_with_suggestions['suggestions']

0      tt0120655%tt0069467%tt0069704%tt0450345%tt0066...
1                                                    NaN
2                                                    NaN
3                                                    NaN
4                                                    NaN
                             ...                        
305    tt0120201%tt0238380%tt0083658%tt0816692%tt3659...
306    tt1571249%tt5247022%tt0282698%tt0363589%tt4034...
307    tt1343727%tt1386703%tt3672742%tt0800369%tt6998...
308    tt0094737%tt0075860%tt0436364%tt0183523%tt2798...
309                                            tt1007028
Name: suggestions, Length: 310, dtype: object

In [285]:
requests_with_suggestions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 279 entries, 0 to 309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           279 non-null    object
 1   created_utc  279 non-null    object
 2   title        279 non-null    object
 3   selftext     279 non-null    object
 4   suggestions  279 non-null    object
dtypes: object(5)
memory usage: 13.1+ KB


In [280]:
#Some posts don't have self-text. This is OK, but need to not have nan
requests_with_suggestions.where(requests_with_suggestions['selftext'].notna(), ' ', inplace = True)

In [284]:
#Drop all rows without suggestions
requests_with_suggestions.dropna(inplace=True)

In [286]:
requests_with_suggestions.to_csv('./movies_data/requests_with_suggestions.csv', index = False)

In [292]:
movies[movies['id'] == 'tt0071853']

Unnamed: 0,id,titles,year,rating
1446,tt0071853,Monty Python and the Holy Grail,1975,8.2
