In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher

In [2]:
def simplify_text(s):
    return re.sub('[\'\\u2019\\u0027]', '', re.sub('[^a-zA-Z0-9:;,&$!?\'\\u2019]', ' ', s))

In [3]:
def get_matches(doc, debug = False):
    m_list = []
    to_remove = []
    
    matches = matcher(doc)
    
    if debug:
        print(doc.text)

    for i, match in enumerate(matches):
        m_start = match[1]
        m_end = match[2]
        m_list.append((m_start, m_end))
        
        if debug:
            print(match)
            print(doc[m_start : m_end], end = ' ')

        if(len(m_list) > 1):
            if m_list[-1][0] >= m_list[-2][0] and m_list[-1][0] <= m_list[-2][1]:
                if abs(m_list[-1][1] - m_list[-1][0]) > abs(m_list[-2][1]-m_list[-2][0]):
                    if debug:
                        print("REMOVE ABOVE", end ='')
                    to_remove.append(i-1)
                else:
                    if debug:
                        print("REMOVE THIS", end = '')
                    to_remove.append(i)

        if debug:
            print('')

    for i in to_remove[::-1]:
        matches.pop(i)

    return matches

---

In [4]:
nlp = spacy.load('en_core_web_lg')
nlp.remove_pipe('ner')

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x20080d01fa8>)

In [5]:
movies = pd.read_csv('./movies_data/movies.csv').drop(columns = ['Unnamed: 0'])
requests = pd.read_csv('./data/moviesuggestions_data.csv')
comments = pd.read_csv('./movies_data/comments_data.csv')

In [6]:
titles_list = []
titles_lower = []
ids_list = []

for i, titles in enumerate(list(movies['titles'])):
    split_titles = titles.split('%')
    titles_list += split_titles
    
    for _ in range(len(split_titles)):
        ids_list.append(movies['id'][i])
    
for title in titles_list:
    titles_lower.append(simplify_text(title).lower())

For now, the system will return only the most recent movie when multiple movies share the same name. Further down the road, one goal may be to identify different movies with the same name (when possible) such as '101 Dalmations' 1961 or 1996. When comments include the year this will be possible, though it may not be easy. However, I will prepare the dictionaries with title years now for use later.

In [7]:
# https://www.geeksforgeeks.org/python-convert-two-lists-into-a-dictionary/
name_dict = {titles_lower[i] : ids_list[i] for i in range(len(titles_lower))}
name_dict_with_dups = {}

for i in range(len(titles_lower)-1, -1, -1):
    # https://stackoverflow.com/a/18552025
    if titles_lower[i] in list(name_dict_with_dups):
        name_dict_with_dups[titles_lower[i] + ' ' + str(movies[movies['id'] == ids_list[i]]['year'].values[0])] = ids_list[i]
    else:
        name_dict_with_dups[titles_lower[i]] = ids_list[i]
        

name_dict_dups_only = {i : name_dict_with_dups[i] for i in list(name_dict_with_dups) if i not in list(name_dict)}

In [8]:
patterns = [nlp.make_doc(text) for text in titles_lower]

matcher = PhraseMatcher(nlp.vocab)
matcher.add("MovieTitles", None, *patterns)

In [9]:
#Only get top-level comments: those comments whose "parent" is the submission, not another comment
comments = comments[comments['parent_id'] == comments['link_id']]
comments.reset_index(inplace=True)

In [10]:
comments_simple = comments.copy()
comments_simple['body'] = comments_simple['body'].apply(simplify_text)

---

In [11]:
com1 = nlp(comments_simple['body'][558].lower())

In [12]:
com1.text

' blue is the warmest color  https:  letterboxd com film blue is the warmest color     our little sister  https:  letterboxd com film our little sister     blindspotting  https:  letterboxd com film blindspotting     lord of the rings  https:  letterboxd com film the lord of the rings the fellowship of the ring     whiplash  https:  letterboxd com film whiplash 2014     burning  https:  letterboxd com film burning 2018     oldboy  https:  letterboxd com film oldboy     perfect blue  https:  letterboxd com film perfect blue     your name  https:  letterboxd com film your name     my neighbor totoro  https:  letterboxd com film my neighbor totoro     the lure  https:  letterboxd com film the lure     spider man: into the spider verse  https:  letterboxd com film spider man into the spider verse  '

In [None]:
#MOVIES TO REMOVE:
#It, Them, Fun, Romance

In [None]:
#Comment 558 can't match "howl's moving castle"

In [13]:
# There is a bot that compiles reccomendations. That could be very useful for me! But for now, I need to ignore it. Later, I can compare what movies I found to what the bot found.
# author = 'cinephilebot'
flag = '   Replying after taking permission from mod'

ignore_list = []

for i, comment in enumerate(comments_simple['body']):
    if flag == comment[:len(flag)]:
        ignore_list.append(i)

In [14]:
matched_title = com1[matches[0][1] : matches[0][2]]

movies[movies.id == name_dict[matched_title.text]]

NameError: name 'matches' is not defined

In [15]:
# A little more cleaning to get ready to make the new dataframe: requests + suggestions

comments_simple = comments_simple[comments_simple['is_submitter'] == False]

comments_simple['link_id'] = comments_simple['link_id'].str[3:]

comments_simple = comments_simple[['author', 'score', 'body', 'link_id', 'is_submitter']]

requests = requests[['id', 'created_utc', 'title', 'selftext']]

In [16]:
suggestions = pd.DataFrame(columns = ['title', 'selftext', 'movie_ids'])
suggestions

Unnamed: 0,title,selftext,movie_ids


In [65]:
sugg_dict = {link_id : [] for link_id in requests['id']}

for i in comments_simple.index:
    row = comments_simple[comments_simple.index == i]
    if (row['score'].values[0] <= 0 )or (i in ignore_list):
        continue
    
    body = nlp(row['body'].values[0].lower())
    link_id = row['link_id'].values[0]
    
    matches = get_matches(body)
    
    for match in matches:
        matched_title = body[match[1] : match[2]]
        try:
            sugg_dict[link_id].append(name_dict[matched_title.text])
        except:
            print(f"Failed match: {matched_title.text}, comment index {i}")

Failed match: you re next, comment index 67
Failed match: id, comment index 220
Failed match: 8, comment index 284
Failed match: id, comment index 382
Failed match: id, comment index 412
Failed match: id, comment index 430
Failed match: 8, comment index 458
Failed match: id, comment index 546
Failed match: your name, comment index 558
Failed match: your name, comment index 558
Failed match: 8, comment index 598
Failed match: your name, comment index 699
Failed match: w, comment index 774
Failed match: w, comment index 774
Failed match: id, comment index 782
Failed match: mulholland dr, comment index 847
Failed match: the good,the bad and the ugly, comment index 863
Failed match: your name, comment index 869
Failed match: your name, comment index 902
Failed match: 8, comment index 926
Failed match: e t, comment index 931
Failed match: id, comment index 1071
Failed match: adaptation, comment index 1131
Failed match: adaptation, comment index 1131
Failed match: adaptation, comment index 1

In [66]:
sugg_dict

{'ixcr5j': ['tt0069467',
  'tt0068646',
  'tt0078875',
  'tt0139239',
  'tt0067805',
  'tt0075675',
  'tt0071411',
  'tt0074102',
  'tt0071360',
  'tt0073486',
  'tt0450345',
  'tt0075314',
  'tt0071315',
  'tt0066921',
  'tt0077711',
  'tt0069704',
  'tt0120655'],
 'ixbv9u': [],
 'ixbsb4': [],
 'ixbj02': [],
 'ixamkm': [],
 'ix8z2h': ['tt0901507'],
 'ix7ua1': [],
 'ix7pg9': [],
 'ix4sim': [],
 'ix4kni': [],
 'ix3boe': [],
 'ix38d4': [],
 'ix1dl7': [],
 'ix13j8': ['tt0048491', 'tt0060176', 'tt0090315', 'tt0808417'],
 'iwz4bu': ['tt0436364', 'tt3464902'],
 'iwygdm': ['tt0082010',
  'tt0449061',
  'tt1181614',
  'tt0069050',
  'tt0063049',
  'tt0048452',
  'tt0437179',
  'tt1596365'],
 'iwyb6y': [],
 'iwxuhm': ['tt0317705',
  'tt6139732',
  'tt0034583',
  'tt0118715',
  'tt0155975',
  'tt0118771',
  'tt0365748',
  'tt0338013',
  'tt0259711',
  'tt0137523',
  'tt0086856',
  'tt0119654',
  'tt0093773',
  'tt0181689',
  'tt0120601',
  'tt5027774',
  'tt4550098',
  'tt0105151',
  'tt0109830'

In [20]:
post_id = requests[55:56]['id'].values[0]
comments_simple[comments_simple['link_id'] == post_id]

Unnamed: 0,author,score,body,link_id,is_submitter
340,cinephile_bot,1,Replying after taking permission from mod ...,iw5yrd,False
423,whattheheckisdecaf,1,Dread 2009,iw5yrd,False
479,Randytheadventurer,1,"Dark &amp; Messed up; Split, Feast Cr...",iw5yrd,False
497,visibly_hangry,1,Dear Zachary,iw5yrd,False
508,mohantharani,1,The killing of a sacred deer,iw5yrd,False
509,Bruhm0ment084,4,Se7en and Prisoners are two good choices,iw5yrd,False
524,steelflexjones,2,Antichrist?,iw5yrd,False
525,StephenKeen,2,Annihilation,iw5yrd,False
527,R3dn3kH1ppy,2,Have you watched requiem for a dream,iw5yrd,False


In [38]:
requests

Unnamed: 0,id,created_utc,title,selftext
0,ixcr5j,2020-09-21 19:17:15,Movies from the '70s.,I'm in search of movies from the '70s. Non-En...
1,ixbv9u,2020-09-21 18:24:28,Style similar to Cinemania or Grey Gardens,I really liked how both of these documentaries...
2,ixbsb4,2020-09-21 18:19:43,Movie about people that suffer from avoidant p...,\nI dont know if theres any but basically Im ...
3,ixbj02,2020-09-21 18:04:06,Looking for movies with deep conversations in ...,Like the beginning of Predestination for insta...
4,ixamkm,2020-09-21 17:12:36,Looking For Time Loop Movies. Any Suggestions?,"Saw Happy Death Day 2 U, Blood Punch, Before I..."
...,...,...,...,...
305,iroswy,2020-09-12 20:05:26,Futuristic Sci-Fi movies,I’m really into movies that take place in the ...
306,iro0jk,2020-09-12 19:16:51,Movies about suicide; and/or a deep hallowing ...,I’m looking for a movie in which the main char...
307,irnvy8,2020-09-12 19:09:27,Cyberpunk/Synthwave/Thriller movies that are n...,"Well I really liked Upgrade/Blade Runner 2049,..."
308,irnlz8,2020-09-12 18:53:07,"Hi, I'm looking for hard sci fi movies",Anything that has to do with space and first e...
