In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Load the IMDB top-1000 CSV (path relative to the working directory) into the main frame.
df = pd.read_csv(filepath_or_buffer='imdb_top_1000.csv')

In [3]:
df.shape

(1000, 16)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [5]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [8]:
df['Released_Year'].value_counts()

Released_Year
2014    32
2004    31
2009    29
2013    28
2016    28
        ..
1920     1
1930     1
1922     1
1943     1
PG       1
Name: count, Length: 100, dtype: int64

In [9]:
# One row holds the non-year value "PG" in this column (see the value counts
# above); swap it for a concrete year so the column can later be cast to int.
# NOTE(review): 1995 is presumably that title's real release year — confirm.
df['Released_Year'] = df['Released_Year'].replace({"PG": 1995})

In [10]:
def era_class(y):
    """Map a release year to the name of its era.

    Parameters
    ----------
    y : int or float
        Release year.

    Returns
    -------
    str
        'Classic' for years below 1970, 'Golden Age' for 1970-1989,
        'Modern' for 1990-2009 and 'Contemporary' for everything else.
    """
    # Exclusive upper bounds in ascending order; the first bound above `y` wins.
    era_bounds = (
        (1970, 'Classic'),
        (1990, 'Golden Age'),
        (2010, 'Modern'),
    )
    for upper_bound, era_name in era_bounds:
        if y < upper_bound:
            return era_name
    return 'Contemporary'

In [11]:
# Cast the year column to integers, then label every film with its era.
release_years = df['Released_Year'].astype(int)
df['era_class'] = release_years.map(era_class)

In [12]:
# Impute missing certificates with the most frequent value (first entry of mode()).
most_common_certificate = df['Certificate'].mode().iloc[0]
df['Certificate'] = df['Certificate'].fillna(most_common_certificate)

Concatenate the relevant columns into a single text column

In [13]:
# Columns whose values are joined into one text description per film.
relevant_features = [
    'Series_Title',
    'era_class',
    'Certificate',
    'Genre',
    'IMDB_Rating',
    'Director',
    'Star1',
    'Star2',
    'Star3',
    'Star4',
]

In [14]:
# Render every selected column as text, then join each row's values with single
# spaces to obtain one descriptive string per film.
feature_text = df[relevant_features].astype(str)
X = feature_text.apply(' '.join, axis=1)

In [15]:
# Disable column-width truncation so the long concatenated strings print in full.
pd.options.display.max_colwidth = None

In [16]:
X

0                          The Shawshank Redemption Modern A Drama 9.3 Frank Darabont Tim Robbins Morgan Freeman Bob Gunton William Sadler
1                         The Godfather Golden Age A Crime, Drama 9.2 Francis Ford Coppola Marlon Brando Al Pacino James Caan Diane Keaton
2             The Dark Knight Modern UA Action, Crime, Drama 9.0 Christopher Nolan Christian Bale Heath Ledger Aaron Eckhart Michael Caine
3            The Godfather: Part II Golden Age A Crime, Drama 9.0 Francis Ford Coppola Al Pacino Robert De Niro Robert Duvall Diane Keaton
4                                  12 Angry Men Classic U Crime, Drama 9.0 Sidney Lumet Henry Fonda Lee J. Cobb Martin Balsam John Fiedler
                                                                      ...                                                                 
995      Breakfast at Tiffany's Classic A Comedy, Drama, Romance 7.6 Blake Edwards Audrey Hepburn George Peppard Patricia Neal Buddy Ebsen
996                        

Text Preprocessing

In [17]:
from string import punctuation

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')

# Built once so preprocess() does not repeat this work on every call:
# a translation table that deletes every character in string.punctuation, and
# a hashed copy of the stop-word list for constant-time membership tests.
_PUNCT_TABLE = str.maketrans('', '', punctuation)
_STOP_SET = frozenset(stop)


def preprocess(text):
    """Normalise one feature string for vectorisation.

    Steps, in order: lower-case the text, delete all punctuation characters,
    split on whitespace, drop English stop words, lemmatise each remaining
    token with ``pos='v'``, and re-join the tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, space-separated tokens.

    Notes
    -----
    NOTE(review): the recorded outputs show some lossy side effects that may
    be unintended — decimal ratings lose their point ("9.3" -> "93"), the
    certificate "A" disappears because "a" is a stop word, and names can be
    altered by lemmatisation ("James" -> "jam"). Confirm these are acceptable.
    """
    # Punctuation is removed before tokenising, so "Tiffany's" becomes "tiffanys".
    stripped = text.lower().translate(_PUNCT_TABLE)
    kept_tokens = [
        lemmatizer.lemmatize(token, pos='v')
        for token in stripped.split()
        if token not in _STOP_SET
    ]
    return " ".join(kept_tokens)

In [18]:
# Clean every concatenated feature string element-wise.
X_cleaned = X.map(preprocess)

In [19]:
X_cleaned

0                       shawshank redemption modern drama 93 frank darabont tim robbins morgan freeman bob gunton william sadler
1                         godfather golden age crime drama 92 francis ford coppola marlon brando al pacino jam caan diane keaton
2          dark knight modern ua action crime drama 90 christopher nolan christian bale heath ledger aaron eckhart michael caine
3           godfather part ii golden age crime drama 90 francis ford coppola al pacino robert de niro robert duvall diane keaton
4                           12 angry men classic u crime drama 90 sidney lumet henry fonda lee j cobb martin balsam john fiedler
                                                                 ...                                                            
995     breakfast tiffanys classic comedy drama romance 76 blake edwards audrey hepburn george peppard patricia neal buddy ebsen
996                          giant classic g drama western 76 george stevens elizabeth taylor roc

Vectorization

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Learn the vocabulary and IDF weights from the cleaned strings, then encode
# those same strings as a TF-IDF matrix with one row per film.
vectorizer = TfidfVectorizer().fit(X_cleaned)
X_vector = vectorizer.transform(X_cleaned)
X_vector.shape

(1000, 5764)

In [21]:
# Title -> dense TF-IDF row. Because this is a dict, a title that occurs more
# than once keeps only its last row.
data = dict(zip(df['Series_Title'], X_vector.toarray()))

In [22]:
data

{'The Shawshank Redemption': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'The Godfather': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'The Dark Knight': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'The Godfather: Part II': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 '12 Angry Men': array([0.29243535, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ], shape=(5764,)),
 'The Lord of the Rings: The Return of the King': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'Pulp Fiction': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 "Schindler's List": array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'Inception': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'Fight Club': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'The Lord of the Rings: The Fellowship of the Ring': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'Forrest Gump': array([0., 0., 0., ..., 0., 0., 0.], shape=(5764,)),
 'Il buon

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Trial run: similarity between two related titles, each passed as a 1 x n_features row.
godfather_row = data['The Godfather'].reshape(1, -1)
godfather_2_row = data['The Godfather: Part II'].reshape(1, -1)
cosine_similarity(godfather_row, godfather_2_row)[0][0]

np.float64(0.5560965326222664)

In [24]:
def recommend_movie(film, movie_dictionary, n=5):
    """Return the titles most similar to ``film`` by cosine similarity.

    Parameters
    ----------
    film : str
        Key of the reference movie in ``movie_dictionary``.
    movie_dictionary : dict
        Maps each title to its 1-D feature vector. All vectors must have the
        same length so they can be stacked into one matrix.
    n : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    list of str or str
        Up to ``n`` other titles, ordered from most to least similar; an empty
        list when the dictionary holds no other title. When ``film`` is not a
        key, the string ``'No movie found'`` is returned instead of a list
        (kept as-is so existing callers continue to work).
    """
    if film not in movie_dictionary:
        return 'No movie found'

    # The reference film itself is never recommended.
    candidates = [title for title in movie_dictionary if title != film]
    if not candidates:
        return []

    # Score all candidates with one batched call rather than one library call
    # per catalogue entry; row i of the matrix belongs to candidates[i].
    reference = np.asarray(movie_dictionary[film]).reshape(1, -1)
    candidate_matrix = np.vstack([movie_dictionary[title] for title in candidates])
    scores = cosine_similarity(reference, candidate_matrix)[0]

    # sorted() is stable, so equally scored titles keep dictionary order.
    ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
    return [title for title, _ in ranked[:n]]

In [25]:
recommend_movie('The Godfather: Part II', data)

['The Godfather',
 'The Godfather: Part III',
 'Apocalypse Now',
 'A Bronx Tale',
 'The Irishman']

In [26]:
import pickle

# Bundle the title -> vector lookup with the processed frame and write both to
# a single pickle file in the working directory.
file = {'movie_dictionary': data, 'df': df}
with open('movie_reco.pkl', mode='wb') as pickle_out:
    pickle.dump(file, pickle_out)