In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credit = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.info()
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
# merge the both dataset to movies dataset
movies=movies.merge(credit, on= 'title')
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [5]:
# select the relevent columns only in dataset
movies=movies[['id','title','overview','genres','keywords','cast','crew']]
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.6+ KB


In [6]:
# check the null columns
movies.isnull().sum()

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [7]:
# drop the null columns
movies.dropna(inplace=True)

In [8]:
# check duplicate values
movies.duplicated().sum()

0

In [9]:
movies.iloc[0]['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [10]:
# function to get list from the dictionary field
import ast

def fetch_list(text):
    l=[]
    for i in ast.literal_eval(text):
        l.append(i['name'])
        
    return l

In [11]:
movies['genres']=movies['genres'].apply(fetch_list)
movies['keywords']=movies['keywords'].apply(fetch_list)

In [12]:
# get original name of charecters in the movies
import ast

def fetch_cast(text):
    l=[]
    counter=0
    for i in ast.literal_eval(text):
        if counter < 3:
            l.append(i['name'])
        counter +=1
        
    return l

In [13]:
movies['cast']=movies['cast'].apply(fetch_cast)

In [14]:
# get director name from the crew column
import ast

def fetch_dir(text):
    l=[]
    for i in ast.literal_eval(text):
        if i['job']=='Director':
            l.append(i['name'])
            break
        
    return l

In [15]:
movies['crew']=movies['crew'].apply(fetch_dir)

In [16]:
movies.iloc[0]

id                                                      19995
title                                                  Avatar
overview    In the 22nd century, a paraplegic Marine is di...
genres          [Action, Adventure, Fantasy, Science Fiction]
keywords    [culture clash, future, space war, space colon...
cast         [Sam Worthington, Zoe Saldana, Sigourney Weaver]
crew                                          [James Cameron]
Name: 0, dtype: object

In [17]:
# overview
movies.iloc[0]['overview']

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [18]:
# make overview column as key or tag so we can compare esily(vector form)
movies['overview']=movies['overview'].apply(lambda x : x.split())

In [19]:
# updated overview
movies.iloc[0]['overview']

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.']

In [20]:
# remove space between names(there may be multiple name in cast or crew)
# Sam Worthington == SamWorthington
def remove_space(word):
    l=[]
    for i in word:
        l.append(i.replace(" ",""))
    
    return l


In [21]:
movies['crew']=movies['crew'].apply(remove_space)
movies['cast']=movies['cast'].apply(remove_space)
movies['genres']=movies['genres'].apply(remove_space)
movies['keywords']=movies['keywords'].apply(remove_space)

In [22]:
movies.iloc[0]

id                                                      19995
title                                                  Avatar
overview    [In, the, 22nd, century,, a, paraplegic, Marin...
genres           [Action, Adventure, Fantasy, ScienceFiction]
keywords    [cultureclash, future, spacewar, spacecolony, ...
cast            [SamWorthington, ZoeSaldana, SigourneyWeaver]
crew                                           [JamesCameron]
Name: 0, dtype: object

In [23]:
# concatinate all the columns to one column : tag
movies['tag']=movies['overview']+movies['genres']+movies['cast']+movies['crew']+movies['keywords']

In [24]:
movies.iloc[0]['tag']

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.',
 'Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'JamesCameron',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d']

In [25]:
# get three columns from movies to up_movies
up_movies=movies[['id','title','tag']]

In [26]:
# convert tag column(list) to string formate(paragraph)
up_movies['tag']=up_movies['tag'].apply(lambda x : " ".join(x))
up_movies.iloc[0]['tag']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  up_movies['tag']=up_movies['tag'].apply(lambda x : " ".join(x))


'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction SamWorthington ZoeSaldana SigourneyWeaver JamesCameron cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d'

In [27]:
# replace all the upper case letter to lower  case letters
up_movies['tag']=up_movies['tag'].apply(lambda x : x.lower()) 
up_movies.iloc[0]['tag']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  up_movies['tag']=up_movies['tag'].apply(lambda x : x.lower())


'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction samworthington zoesaldana sigourneyweaver jamescameron cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d'

In [28]:
import nltk
from nltk.stem import PorterStemmer

In [29]:
ps =PorterStemmer()

In [30]:
def stems(text):
    l=[]
    for i in text.split():
        l.append(ps.stem(i))
    return " ".join(l)

In [31]:
up_movies['tag']=up_movies['tag'].apply(stems)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  up_movies['tag']=up_movies['tag'].apply(stems)


In [32]:
up_movies.iloc[0]['tag']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict samworthington zoesaldana sigourneyweav jamescameron cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d'

In [33]:
# eliminate  extra english words
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words= 'english')

In [34]:
vector = cv.fit_transform(up_movies['tag']).toarray()

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(vector)
sim

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [38]:
def recommend(movie):
    index =up_movies[up_movies['title']==movie].index[0]
    distance=sorted(list(enumerate(sim[index])),reverse=True , key=lambda x : x[1])
    for i in distance[1:6]:
        print(up_movies.iloc[i[0]].title)
    

In [42]:
x = input("Enter any movie name to get recommondation\n\n\n\n\n")
recommend(x)

Enter any movie name to get recommondation




Avatar
Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [45]:
import pickle

pickle.dump(up_movies,open('artificats/movies_list.pkl', 'wb'))
pickle.dump(sim,open('artificats/similarity.pkl', 'wb'))