Recommendation techniques:
    1) Content-based filtering - using an item's description/plot to recommend similar items
    2) Collaborative filtering - using users' past behavior to make recommendations

# Importing packages and dataset

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
pd.set_option('display.max_columns', None)

In [2]:
# Load the OMDB-enriched IMDB Top 250 dataset (one row per movie) and preview it.
data = pd.read_csv('IMDB_Top250movies2_OMDB_Detailed.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings.Source,Ratings.Value,Metascore,imdbRating,imdbVotes,imdbID,Type,tomatoMeter,tomatoImage,tomatoRating,tomatoReviews,tomatoFresh,tomatoRotten,tomatoConsensus,tomatoUserMeter,tomatoUserRating,tomatoUserReviews,tomatoURL,DVD,BoxOffice,Production,Website,Response
0,1,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,"Crime, Drama",Frank Darabont,"Stephen King (short story ""Rita Hayworth and S...","Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...,English,USA,Nominated for 7 Oscars. Another 19 wins & 30 n...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.3/10,80.0,9.3,1825626,tt0111161,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/shawshank_rede...,27 Jan 1998,,Columbia Pictures,,True
1,2,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo (screenplay), Francis Ford Coppola ...","Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...,"English, Italian, Latin",USA,Won 3 Oscars. Another 23 wins & 27 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.2/10,100.0,9.2,1243444,tt0068646,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather/,09 Oct 2001,,Paramount Pictures,http://www.thegodfather.com,True
2,3,The Godfather: Part II,1974,R,20 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola (screenplay), Mario Puzo ...","Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...,"English, Italian, Spanish, Latin, Sicilian",USA,Won 6 Oscars. Another 10 wins & 20 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,85.0,9.0,856870,tt0071562,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/godfather_part...,24 May 2005,,Paramount Pictures,http://www.thegodfather.com/,True
3,4,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan (screenplay), Christopher Nolan...","Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...,"English, Mandarin","USA, UK",Won 2 Oscars. Another 151 wins & 153 nominations.,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,9.0/10,82.0,9.0,1802351,tt0468569,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/the_dark_knight/,09 Dec 2008,"$533,316,061",Warner Bros. Pictures/Legendary,http://thedarkknight.warnerbros.com/,True
4,5,12 Angry Men,1957,APPROVED,01 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,"Reginald Rose (story), Reginald Rose (screenplay)","Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...,English,USA,Nominated for 3 Oscars. Another 16 wins & 8 no...,https://images-na.ssl-images-amazon.com/images...,Internet Movie Database,8.9/10,96.0,8.9,494215,tt0050083,movie,,,,,,,,,,,http://www.rottentomatoes.com/m/1000013-12_ang...,06 Mar 2001,,Criterion Collection,http://www.criterion.com/films/27871-12-angry-men,True


In [3]:
# number of rows (movies) loaded
len(data)

250

In [4]:
# sample plot summary: the free text that the cleaning steps below operate on
data.Plot[0]

'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.'

# Data Preprocessing

In [5]:
# Normalise the plot text: lowercase it, then collapse every run of
# non-letter characters (digits, punctuation, whitespace) into one space.
# One vectorised pass with a raw-string pattern does the work of both
# substitutions and avoids the invalid '\s' escape that a non-raw string
# literal triggers. Missing plots become '' instead of raising.
data['clean_plot'] = (
    data['Plot']
    .fillna('')
    .str.lower()
    .str.replace(r'[^a-z]+', ' ', regex=True)
)
data['clean_plot']

0      two imprisoned men bond over a number of years...
1      the aging patriarch of an organized crime dyna...
2      the early life and career of vito corleone in ...
3      when the menace known as the joker emerges fro...
4      a jury holdout attempts to prevent a miscarria...
                             ...                        
245    blacksmith will turner teams up with eccentric...
246    a former child star torments her paraplegic si...
247    travis henderson an aimless drifter who has be...
248    desperate measures are taken by a man who trie...
249    a stranger in the city asks questions no one h...
Name: clean_plot, Length: 250, dtype: object

In [6]:
# split every cleaned plot string into a list of word tokens
data['clean_plot'] = data['clean_plot'].map(nltk.word_tokenize)
data['clean_plot']

0      [two, imprisoned, men, bond, over, a, number, ...
1      [the, aging, patriarch, of, an, organized, cri...
2      [the, early, life, and, career, of, vito, corl...
3      [when, the, menace, known, as, the, joker, eme...
4      [a, jury, holdout, attempts, to, prevent, a, m...
                             ...                        
245    [blacksmith, will, turner, teams, up, with, ec...
246    [a, former, child, star, torments, her, parapl...
247    [travis, henderson, an, aimless, drifter, who,...
248    [desperate, measures, are, taken, by, a, man, ...
249    [a, stranger, in, the, city, asks, questions, ...
Name: clean_plot, Length: 250, dtype: object

In [7]:
# Remove English stopwords and very short tokens from every tokenized plot.
# NOTE: needs the NLTK 'stopwords' corpus (nltk.download('stopwords')).
MIN_TOKEN_LENGTH = 3  # tokens shorter than this are discarded

# A set gives constant-time membership tests inside the nested loop.
stop_words = set(nltk.corpus.stopwords.words('english'))
plot = [
    [word for word in sentence
     if word not in stop_words and len(word) >= MIN_TOKEN_LENGTH]
    for sentence in data['clean_plot']
]
plot

[['two',
  'imprisoned',
  'men',
  'bond',
  'number',
  'years',
  'finding',
  'solace',
  'eventual',
  'redemption',
  'acts',
  'common',
  'decency'],
 ['aging',
  'patriarch',
  'organized',
  'crime',
  'dynasty',
  'transfers',
  'control',
  'clandestine',
  'empire',
  'reluctant',
  'son'],
 ['early',
  'life',
  'career',
  'vito',
  'corleone',
  'new',
  'york',
  'portrayed',
  'son',
  'michael',
  'expands',
  'tightens',
  'grip',
  'family',
  'crime',
  'syndicate'],
 ['menace',
  'known',
  'joker',
  'emerges',
  'mysterious',
  'past',
  'wreaks',
  'havoc',
  'chaos',
  'people',
  'gotham',
  'dark',
  'knight',
  'must',
  'accept',
  'one',
  'greatest',
  'psychological',
  'physical',
  'tests',
  'ability',
  'fight',
  'injustice'],
 ['jury',
  'holdout',
  'attempts',
  'prevent',
  'miscarriage',
  'justice',
  'forcing',
  'colleagues',
  'reconsider',
  'evidence'],
 ['german',
  'occupied',
  'poland',
  'world',
  'war',
  'oskar',
  'schindler',


In [8]:
# replace the tokenized plots with their stopword-filtered versions
data['clean_plot'] = plot


In [9]:
# display the column after the assignment above
data['clean_plot']

0      [two, imprisoned, men, bond, number, years, fi...
1      [aging, patriarch, organized, crime, dynasty, ...
2      [early, life, career, vito, corleone, new, yor...
3      [menace, known, joker, emerges, mysterious, pa...
4      [jury, holdout, attempts, prevent, miscarriage...
                             ...                        
245    [blacksmith, turner, teams, eccentric, pirate,...
246    [former, child, star, torments, paraplegic, si...
247    [travis, henderson, aimless, drifter, missing,...
248    [desperate, measures, taken, man, tries, save,...
249    [stranger, city, asks, questions, one, asked, ...
Name: clean_plot, Length: 250, dtype: object

In [10]:
# Turn the comma-separated metadata strings into lists. A limit of None keeps
# every entry; only the first four listed actors are kept.
for column, limit in (('Genre', None), ('Actors', 4), ('Director', None)):
    data[column] = [entry.split(',')[:limit] for entry in data[column]]

In [11]:
# inspect one row: entries after the first keep a leading space from the split
data['Actors'][0]

['Tim Robbins', ' Morgan Freeman', ' Bob Gunton', ' William Sadler']

In [12]:
# removing spaces and converting to lowercase
def clean(sentence):
    """Return a new list with each item lowercased and stripped of all spaces.

    Collapsing a multi-word name into one token ('Tim Robbins' ->
    'timrobbins') keeps it together as a single term downstream.
    """
    return [item.lower().replace(' ', '') for item in sentence]

In [13]:
# normalise every entry of the list-valued metadata columns
for column in ('Genre', 'Actors', 'Director'):
    data[column] = data[column].map(clean)

In [14]:
# same row after cleaning: lowercase names with the spaces removed
data['Actors'][0]

['timrobbins', 'morganfreeman', 'bobgunton', 'williamsadler']

In [15]:
# Merge the token lists of every feature column into one space-separated
# string per movie (each string keeps a single trailing space).
columns = ['clean_plot', 'Genre', 'Actors', 'Director']
combined = [
    ' '.join(' '.join(tokens) for tokens in row) + ' '
    for row in zip(*(data[col] for col in columns))
]

data['combined_input'] = combined

In [16]:
# keep only the title and the combined text used for feature extraction
data1 = data.loc[:, ['Title', 'combined_input']]
data1.head()

Unnamed: 0,Title,combined_input
0,The Shawshank Redemption,two imprisoned men bond number years finding s...
1,The Godfather,aging patriarch organized crime dynasty transf...
2,The Godfather: Part II,early life career vito corleone new york portr...
3,The Dark Knight,menace known joker emerges mysterious past wre...
4,12 Angry Men,jury holdout attempts prevent miscarriage just...


# Feature Extraction

Applying TFIDF Vectorization to combined input

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Fit a TF-IDF vocabulary on the combined text and transform each movie into
# one row of the resulting document-term matrix.
tfidf = TfidfVectorizer()
features = tfidf.fit_transform(data1['combined_input'])

In [19]:
# create cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity of every movie against every other movie; entry [i, j]
# compares rows i and j of `features`, so the matrix is square and symmetric.
cosine_sim = cosine_similarity(features, features)
print(cosine_sim)

[[1.         0.02678083 0.02577716 ... 0.03951546 0.02753552 0.00336651]
 [0.02678083 1.         0.17542882 ... 0.00345231 0.05241316 0.0034233 ]
 [0.02577716 0.17542882 1.         ... 0.04315335 0.07138058 0.00329501]
 ...
 [0.03951546 0.00345231 0.04315335 ... 1.         0.02507045 0.00338774]
 [0.02753552 0.05241316 0.07138058 ... 0.02507045 1.         0.00351977]
 [0.00336651 0.0034233  0.00329501 ... 0.00338774 0.00351977 1.        ]]


# Movie Recommendation

In [20]:
# Series of movie titles, used by recommend_movies to look up a title's row
index = pd.Series(data1['Title'])
index.head()

0    The Shawshank Redemption
1               The Godfather
2      The Godfather: Part II
3             The Dark Knight
4                12 Angry Men
Name: Title, dtype: object

In [21]:

def recommend_movies(title, top_n=10):
    """Return the titles most similar to ``title``, best match first.

    Similarity is read from the precomputed ``cosine_sim`` matrix, whose rows
    are assumed to be in the same order as the ``index`` title Series.

    Args:
        title: Exact movie title as it appears in ``index``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A list of up to ``top_n`` movie titles ordered by descending
        similarity. The queried movie itself is never included.

    Raises:
        IndexError: If ``title`` is not present in ``index``.
    """
    matches = np.flatnonzero((index == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie title not found: {title!r}")
    idx = matches[0]

    # Drop the queried movie by position instead of assuming it sorts first:
    # ties or floating-point rounding could otherwise let it show up in its
    # own recommendations.
    score = pd.Series(cosine_sim[idx]).drop(idx).sort_values(ascending=False)
    top = score.head(top_n).index

    return [data1['Title'].iloc[i] for i in top]

In [22]:
# example: movies most similar to 'The Dark Knight Rises'
recommend_movies('The Dark Knight Rises')

['The Dark Knight',
 'Inception',
 'Batman Begins',
 'The Lord of the Rings: The Fellowship of the Ring',
 'Die Hard',
 'Star Wars: Episode IV - A New Hope',
 'The Prestige',
 'Drishyam',
 'Metropolis',
 'Mad Max: Fury Road']

In [24]:
# sanity check: index label of 'The Dark Knight Rises' in the title Series
index[index == 'The Dark Knight Rises'].index[0]

62

In [25]:
# similarity of the movie at row 3 ('The Dark Knight') to every movie, highest first
pd.Series(cosine_sim[3]).sort_values(ascending=False)

3      1.000000
113    0.193484
62     0.192231
47     0.127716
243    0.073168
         ...   
28     0.000000
115    0.000000
114    0.000000
223    0.000000
78     0.000000
Length: 250, dtype: float64

In [28]:
# example: movies most similar to 'The Godfather'
recommend_movies('The Godfather')

['The Godfather: Part II',
 'Apocalypse Now',
 'On the Waterfront',
 'Scarface',
 'Casino',
 'Wild Strawberries',
 'Rashomon',
 'Heat',
 'All About Eve',
 'The 400 Blows']