In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# Load the spaCy 'en_core_web_sm' pipeline once; later cells call `nlp(...)` to get
# lemmas plus the stop-word and punctuation flags for each token.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Relative path of the movie-plot CSV that is read in the next cell.
file_one = "Resources/wiki_movie_plots.csv"

In [3]:
# Read the CSV at `file_one` into a DataFrame.
df = pd.read_csv(file_one)

In [4]:
# Keep only the first 20,000 rows; every later cell works on this truncated frame.
df = df.head(20000)

In [5]:
# Sanity check: show the row count after truncation.
len(df)

20000

In [6]:
# Lemmatise every plot and drop stop words and punctuation.
#
# * Iterate over the Plot column itself rather than `range(0, 20000)` with
#   label lookups, so the cell works for any number of rows and any index.
# * `nlp.pipe` processes the texts as a batched stream instead of invoking the
#   pipeline once per row.
# * Missing plots are replaced by "" so spaCy receives a string; such rows
#   simply produce an empty token list.
plot_texts = df["Plot"].fillna("")
nostopwords = [
    [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    for doc in nlp.pipe(plot_texts)
]
df['NoStopwords'] = nostopwords
df

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Unnamed: 8,NoStopwords
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",,"[bartender, work, saloon, serve, drink, custom..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",,"[moon, paint, smile, face, hang, park, night, ..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...",,"[film, minute, long, compose, shot, girl, sit,..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,,"[last, 61, second, consist, shot, shot, set, w..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,,"[early, know, adaptation, classic, fairytale, ..."
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab...",,"[Alice, follow, large, white, rabbit, rabbit, ..."
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...,,"[film, open, bandit, break, railroad, telegrap..."
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...,,"[film, family, suburb, hope, quiet, life, thin..."
8,1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train...,The opening scene shows the interior of the ro...,,"[opening, scene, show, interior, robber, den, ..."
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before...,Scenes are introduced using lines of the poem....,,"[scene, introduce, line, poem.[2, Santa, Claus..."


In [7]:
# Collapse each token list into one space-separated string.
#
# This is a single column-level assignment: the previous element-wise
# `df['NoStopwords'][i] = ...` was chained indexing, which pandas flags with
# SettingWithCopyWarning and which does not update `df` under Copy-on-Write.
# Values that are already strings are left alone so that re-running the cell
# does not split them into individual characters.
df['NoStopwords'] = df['NoStopwords'].map(
    lambda toks: toks if isinstance(toks, str) else " ".join(toks)
)

# Persist the pre-processed frame so the spaCy pass need not be repeated.
df.to_csv("NoStopwords.csv", index=False)

df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Unnamed: 8,NoStopwords
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",,bartender work saloon serve drink customer fil...
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",,moon paint smile face hang park night young co...
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...",,film minute long compose shot girl sit base al...
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,,last 61 second consist shot shot set wood wint...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,,early know adaptation classic fairytale film s...


In [8]:
# Enter the movie plot description to compare against the database here.
# The text is pre-processed in the next cells the same way as the dataset plots
# (lemmatised, stop words and punctuation removed).

userplot_raw = "Teenager Miles Morales struggles to live up to the expectations of his father, police officer Jefferson Davis, who sees Spider-Man as a menace. Miles transfers to a boarding school, but later sneaks out and goes to his uncle Aaron Davis's house. When he takes Miles to an abandoned subway station to paint graffiti, Miles is bitten by a radioactive spider and gains spider-like abilities."

In [9]:
# Run the user's plot through the same spaCy pipeline as the dataset and keep
# the lemma of every token that is neither a stop word nor punctuation.
doc = nlp(userplot_raw)
tokens = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
# Wrap the token list in an outer list; the next cell reads it via userplot[0].
userplot = [tokens]

In [10]:
# Join the query tokens into one space-separated string.
# The former `for` loop rebound `userplot` while ranging over its own length;
# a single join is what it effectively did. The isinstance guard keeps the
# cell idempotent: once `userplot` is a string it is left untouched instead of
# being reduced to its first character on a re-run.
if not isinstance(userplot, str):
    userplot = " ".join(userplot[0])

userplot

'Teenager Miles Morales struggle live expectation father police officer Jefferson Davis see Spider Man menace Miles transfer board school later sneak go uncle Aaron Davis house take Miles abandon subway station paint graffiti Miles bite radioactive spider gain spider like ability'

In [11]:
# Keep only rows that have a plot. The explicit .copy() makes `newdf` an
# independent frame, so the column assignments in later cells write to it
# directly instead of to a slice of `df` (which raises SettingWithCopyWarning).
newdf = df[df.Plot.notna()].copy()

In [12]:
# Fit a TF-IDF model (default settings) on the pre-processed plot strings and
# keep both the fitted vectorizer and the resulting document-term matrix.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(newdf['NoStopwords'])
# Show the matrix repr (shape / number of stored elements).
tfidf

<20000x82438 sparse matrix of type '<class 'numpy.float64'>'
	with 2741587 stored elements in Compressed Sparse Row format>

In [13]:
# Attach the TF-IDF result to the frame as a 'tfidf' column.
# NOTE(review): `tfidf` is one sparse matrix object, not a per-row sequence;
# verify what pandas actually stores in each cell (one row vector vs. the
# whole matrix) before relying on this column - TODO confirm.
# NOTE(review): if `newdf` is a filtered slice of `df` rather than an explicit
# copy, this assignment may emit SettingWithCopyWarning.
newdf['tfidf'] = tfidf

In [14]:
# Inspect the 'tfidf' value stored for the first row, indexed at [0].
newdf.iloc[0]['tfidf'][0]

<1x82438 sparse matrix of type '<class 'numpy.float64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [15]:
# Trial run for a single document: multiply the first TF-IDF row by the
# transposed TF-IDF vector of the user's plot. The next cell reads the score
# out of such a product with [0, 0].
similarity = tfidf[0]*vectorizer.transform([userplot]).T

In [16]:
# Score every plot against the user's plot.
#
# The query is vectorised once (it was previously re-transformed on every
# iteration) and all scores come from a single sparse matrix-vector product
# instead of one product per row. With TfidfVectorizer's default L2
# normalisation these dot products are cosine similarities.
query_vec = vectorizer.transform([userplot])
# Column of scores, one row per document in `tfidf`.
similarity = tfidf @ query_vec.T
sims = similarity.toarray().ravel().tolist()
# assign() returns a new frame, so nothing is written into a possible slice of `df`.
newdf = newdf.assign(Similarities=sims)

In [17]:
# Rank all plots by similarity, best first, and keep the five highest-scoring rows.
topfive_full = (
    newdf
    .sort_values(by='Similarities', ascending=False)
    .head(5)
)

In [18]:
# Narrow the top matches down to the columns worth showing.
topfive = topfive_full[
    [
        'Release Year',
        'Title',
        'Director',
        'Genre',
        'Plot',
        'Similarities',
    ]
]

In [19]:
# Display the five most similar movies.
topfive

Unnamed: 0,Release Year,Title,Director,Genre,Plot,Similarities
13429,1999,Blue Streak,Les Mayfield,"comedy, crime",Jewel thief Miles Logan participates in a $17 ...,0.493438
11741,1991,True Identity,Charles Lane,unknown,A struggling black actor named Miles Pope is o...,0.409468
10202,1984,Electric Dreams,Steve Barron,comedy,Miles Harding is an architect who envisions a ...,0.382537
14544,2004,Sideways,Alexander Payne,comedy,"Miles Raymond is an unsuccessful writer, a win...",0.369727
9538,1978,Silent Partner,Daryl Duke,crime drama,"Miles Cullen (Elliott Gould), a bored teller a...",0.358788
