In [3]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
pd.options.mode.chained_assignment = None 



In [4]:
movies = pd.read_csv('archive/tmdb_movies_data.csv')

In [5]:
movies.head(2)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0


In [6]:
movies = movies[['id', 'original_title', 'cast', 'director', 'overview', 'genres']]

In [7]:
movies.head(2)

Unnamed: 0,id,original_title,cast,director,overview,genres
0,135397,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,Twenty-two years after the events of Jurassic ...,Action|Adventure|Science Fiction|Thriller
1,76341,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,An apocalyptic story set in the furthest reach...,Action|Adventure|Science Fiction|Thriller


In [8]:
print(movies.isnull().sum())
movies.dropna(inplace=True)
movies.reset_index(drop=True, inplace=True)

id                 0
original_title     0
cast              76
director          44
overview           4
genres            23
dtype: int64


In [9]:
def str_to_list(col):
    for i in range(len(movies[col])):
        if("|" in movies[col].iloc[i]):
            changed_val = movies[col].iloc[i].split('|') #chained assignment warning, try to remove but later not now
            movies[col].iloc[i] = changed_val
        elif(col == 'director'):
            movies[col].iloc[i] = [movies[col].iloc[i]]
        else:
            changed_val = movies[col].iloc[i].split(' ') #chained assignment warning, try to remove but later not now
            movies[col].iloc[i] = changed_val
        


In [10]:
str_to_list('cast')
str_to_list('director')
str_to_list('overview')
str_to_list('genres')

In [11]:
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['director'] = movies['director'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['name'] = movies['original_title'].apply(lambda x: x.lower())
movies['name'] = movies['name'].apply(lambda x: re.sub(r'\W+', '', x))

In [12]:
movies.head(2)

Unnamed: 0,id,original_title,cast,director,overview,genres,name
0,135397,Jurassic World,"[ChrisPratt, BryceDallasHoward, IrrfanKhan, Vi...",[ColinTrevorrow],"[Twenty-two, years, after, the, events, of, Ju...","[Action, Adventure, ScienceFiction, Thriller]",jurassicworld
1,76341,Mad Max: Fury Road,"[TomHardy, CharlizeTheron, HughKeays-Byrne, Ni...",[GeorgeMiller],"[An, apocalyptic, story, set, in, the, furthes...","[Action, Adventure, ScienceFiction, Thriller]",madmaxfuryroad


In [13]:
movies['Details'] = movies['cast'] + movies['genres'] + movies['overview'] + movies['director'] 

In [14]:
movies = movies[['id', 'original_title', 'name', 'Details']]
movies.head(5)


Unnamed: 0,id,original_title,name,Details
0,135397,Jurassic World,jurassicworld,"[ChrisPratt, BryceDallasHoward, IrrfanKhan, Vi..."
1,76341,Mad Max: Fury Road,madmaxfuryroad,"[TomHardy, CharlizeTheron, HughKeays-Byrne, Ni..."
2,262500,Insurgent,insurgent,"[ShaileneWoodley, TheoJames, KateWinslet, Anse..."
3,140607,Star Wars: The Force Awakens,starwarstheforceawakens,"[HarrisonFord, MarkHamill, CarrieFisher, AdamD..."
4,168259,Furious 7,furious7,"[VinDiesel, PaulWalker, JasonStatham, Michelle..."


In [16]:
# porterStemmer = PorterStemmer()
word_lemmatizer = WordNetLemmatizer()
for index, row in movies.iterrows():
    val = row['Details']
    # print(val)
    stemmed = []
    for i in val:
        stemmed.append(word_lemmatizer.lemmatize(i))
    movies.at[index, 'Details'] = stemmed

In [17]:
movies['Details'] = movies['Details'].apply(lambda x: ' '.join(x))
movies['Details'][0]

"ChrisPratt BryceDallasHoward IrrfanKhan VincentD'Onofrio NickRobinson Action Adventure ScienceFiction Thriller Twenty-two year after the event of Jurassic Park, Isla Nublar now feature a fully functioning dinosaur theme park, Jurassic World, a originally envisioned by John Hammond. ColinTrevorrow"

In [18]:
len(movies)

10730

In [19]:
tf_idf = TfidfVectorizer(stop_words='english', max_features=10730)
vectors = tf_idf.fit_transform(movies['Details'])

In [20]:
print(type(vectors))
vectors = vectors.toarray()
vectors.shape

<class 'scipy.sparse.csr.csr_matrix'>


(10730, 10730)

In [21]:
tf_idf.get_feature_names()[100:120]



['22',
 '23',
 '24',
 '25',
 '25th',
 '26',
 '27',
 '28',
 '30',
 '300',
 '30th',
 '35',
 '3d',
 '40',
 '400',
 '47',
 '48',
 '4th',
 '50',
 '500']

In [22]:
similarity = cosine_similarity(vectors)


In [23]:
print(similarity.shape)
print(similarity[0])
print(list(enumerate(similarity[0])))
print(sorted(list(enumerate(similarity[0])), key= lambda x: x[1], reverse=True))

(10730, 10730)
[1.         0.03639989 0.02641172 ... 0.         0.01224062 0.        ]
[(0, 1.0000000000000002), (1, 0.036399893611597775), (2, 0.026411718115667834), (3, 0.036847517924578244), (4, 0.012920012003941852), (5, 0.016213886092334), (6, 0.07752175812702009), (7, 0.015762693661107), (8, 0.017524510719773515), (9, 0.0), (10, 0.013120993316243085), (11, 0.02444395469011581), (12, 0.02841427510626108), (13, 0.01589674616884965), (14, 0.030241371308102998), (15, 0.0), (16, 0.00869782920854362), (17, 0.026048505554448414), (18, 0.0), (19, 0.010572443990977103), (20, 0.02143567441773466), (21, 0.00417094246793237), (22, 0.012534589979490269), (23, 0.0), (24, 0.0), (25, 0.006930292216818695), (26, 0.0), (27, 0.013602546878282237), (28, 0.005030973605658291), (29, 0.017839572629022235), (30, 0.008426991083590213), (31, 0.0362104770341054), (32, 0.0), (33, 0.00918304689797824), (34, 0.010875269760096786), (35, 0.027311289617910916), (36, 0.01778588148436708), (37, 0.10765071695544237

In [24]:
def movie_recommender(movie_name):
    index = movies[movies['name'] == movie_name].index[0]
    distance = similarity[index]
    recommended_list = sorted(list(enumerate(distance)), key= lambda x: x[1], reverse=True)[1:11]

    for i in recommended_list:
        print(movies['original_title'].iloc[i[0]])

movie = 'X-Men'.lower()
movie = re.sub(r'\W+', '', movie)
movie_recommender(movie)

X2
X-Men: Days of Future Past
X-Men: The Last Stand
The Wolverine
X-Men Origins: Wolverine
Hulk vs. Wolverine
Swordfish
Mission: Impossible - Rogue Nation
Men in Black
Teenage Mutant Ninja Turtles II: The Secret of the Ooze


In [None]:
import pickle
pickle.dump(movies.to_dict(), open('recommender/frontend/pickles/movies.pkl', 'wb')) #move this to the frontend directory
pickle.dump(similarity, open('recommender/frontend/pickles/similarity.pkl', 'wb')) #move this to the frontend directory


In [25]:
print(movies['name'].iloc[1882])

aliceinwonderland
