In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

import spacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher

In [137]:
def simplify_text(s):
    return re.sub('[\'\\u2019\\u0027]', '', re.sub('[^a-zA-Z0-9:;,&$!?\'\\u2019]', ' ', s))

In [138]:
nlp = spacy.load('en_core_web_lg')

In [139]:
nlp.remove_pipe('ner')

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1c950a60708>)

In [140]:
movies = pd.read_csv('./movies_data/movies.csv').drop(columns = ['Unnamed: 0'])

In [141]:
requests = pd.read_csv('./data/moviesuggestions_data.csv')

In [142]:
comments = pd.read_csv('./movies_data/comments_data.csv')

In [174]:
titles_list = []
titles_lower = []

for titles in list(movies['titles']):
    titles_list += (titles.split('%'))
    
for title in titles_list:
    titles_lower.append(simplify_text(title).lower())

In [176]:
patterns = [nlp.make_doc(text) for text in titles_lower]

matcher = PhraseMatcher(nlp.vocab)
matcher.add("MovieTitles", None, *patterns)

In [177]:
#Only get top-level comments: those comments whose "parent" is the submission, not another comments
comments = comments[comments['parent_id'] == comments['link_id']]
comments.reset_index(inplace=True)

In [178]:
comments_wc = comments.copy()
comments_wc['body'] = comments_wc['body'].apply(simplify_text)

In [183]:
com1 = nlp(comments_wc['body'][558].lower())

In [184]:
com1.text

'howls moving castle'

In [185]:
matches = matcher(com1)

In [186]:
print(com1.text)
m_list = []
to_remove = []
for i, match in enumerate(matches):
    m_start = match[1]
    m_end = match[2]
    m_list.append((m_start, m_end))
    print(match)
    print(com1[m_start : m_end], end = ' ')
    
    if(len(m_list) > 1):
        if m_list[-1][0] >= m_list[-2][0] and m_list[-1][0] <= m_list[-2][1]:
            if abs(m_list[-1][1] - m_list[-1][0]) > abs(m_list[-2][1]-m_list[-2][0]):
                print("REMOVE ABOVE", end ='')
                to_remove.append(i-1)
            else:
                print("REMOVE THIS", end = '')
                to_remove.append(i)
    
    print('')
    
for i in to_remove[::-1]:
    matches.pop(i)
    
matches

howls moving castle
(3459089755582445566, 0, 3)
howls moving castle 
(3459089755582445566, 1, 2)
moving REMOVE THIS


[(3459089755582445566, 0, 3)]

In [100]:
#MOVIES TO REMOVE:
#It, Them, Fun, Romance

In [101]:
#Comment 558 can't match "howl's moving castle"

In [102]:
matches[1]

IndexError: list index out of range

In [103]:
for ent in com1.ents:
    print(ent.text, ent.label_)

In [104]:
titles_lower.sort()
titles_lower[6973]

'Husband Material'

In [105]:
comments['body'].apply(simplify_text)

0       3 Idiots  https:  www imdb com title tt118704...
1       Hobsons Choice  https:  www imdb com title tt...
2       Romancing The Stone  https:  www imdb com tit...
3       Secret Superstar  https:  www imdb com title ...
4       3 Idiots  https:  www imdb com title tt118704...
                             ...                        
566    Not a movie but the new Korean drama Record of...
567    You looking for those teen dystopian films ?  ...
568                             The Silence of the Lambs
569                                             removed 
570     Coming of Age  https:  www reddit com r Movie...
Name: body, Length: 571, dtype: object

In [None]:
temp_matcher = PhraseMatcher(nlp.vocab)
temp_matcher.add()

In [150]:
simplify_text(movies[movies.index == 5698]['titles'].values[0].split('%')[0])

'Howls Moving Castle'

In [106]:
movies[movies['year'] == 2004].sort_values(by = 'rating', ascending = False)[:10]

Unnamed: 0,id,titles,year,rating
6265,tt0416960,Marmoulak%The Lizard,2004,8.5
6133,tt0400234,Black Friday,2004,8.5
5641,tt0338013,Eternal Sunshine of the Spotless Mind%Eterno r...,2004,8.3
5850,tt0367110,"Swades%Swades: We, the People",2004,8.2
6390,tt0428870,Nae meorisokui jiwoogae%A Moment to Remember,2004,8.2
5698,tt0347149,Howl's Moving Castle%Hauru no ugoku shiro,2004,8.2
5802,tt0363163,Downfall%Der Untergang,2004,8.2
6094,tt0395169,Hotel Rwanda,2004,8.1
6173,tt0405159,Rope Burns%Million Dollar Baby,2004,8.1
5987,tt0381681,Before Sunset%Untitled Before Sunrise Sequel%B...,2004,8.1


In [151]:
s = 'Howl’s Moving Castle'

In [152]:
simplify_text(s)

'Howls Moving Castle'

In [154]:
comments['body'][558]

'Howl’s Moving Castle'

In [110]:
'\'`'

"'`"

In [155]:
comments_wc['body'][558]

'Howls Moving Castle'

In [156]:
simplify_text(comments['body'][558])

'Howls Moving Castle'

In [158]:
titles_lower[6937]

'Space Jam'

In [157]:
titles_lower[6937] == simplify_text(comments['body'][558]).lower()

False

In [163]:
matches = matcher(nlp("i like howls moving castle to space jam se7en watch"))

In [164]:
matches

[]

In [125]:
simplify_text('l\'âge d\'or')

'l ge dor'

In [116]:
patterns

[the birth of a nation,
 in the clutches of the ku klux klan,
 the birth of the nation; or the clansman,
 the clansman,
 intolerance: a sun-play of the ages,
 intolerance,
 the mother and the law,
 intolerance: love's struggle throughout the ages,
 shoulder arms,
 broken blossoms,
 broken blossoms or the yellow man and the girl,
 the cabinet of dr. caligari,
 das cabinet des dr. caligari,
 the golem: how he came into the world,
 the golem,
 der golem, wie er in die welt kam,
 the kid,
 the waif,
 the phantom carriage,
 körkarlen,
 destiny,
 the weary death,
 der müde tod,
 between worlds,
 between two worlds,
 beyond the wall,
 dr. mabuse, der spieler,
 dr. mabuse, the gambler part one: the great gamble,
 the fatal passion,
 dr. mabuse the gambler,
 dr. mabuse, the gambler. part two: inferno,
 the witches,
 häxan: witchcraft through the ages,
 häxan,
 nanook of the north,
 nosferatu, eine symphonie des grauens,
 nosferatu,
 nosferatu, a symphony of horror,
 our hospitality,
 safety las