In [None]:
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os

In [2]:
# Importing Data
movies = pd.read_csv('./data/tmdb_5000_movies.csv')
credits = pd.read_csv('./data/tmdb_5000_credits.csv')

In [81]:
# We have too much data with same type so we Merge(combine together) the data which is Movies & Credits into one data in movies.
movies = movies.merge(credits, on='title')
# movies.head(0) # To View 1 st Entry

In [82]:
# Deleted the Attritubes which is not Used
# genres id keywords title Overview cast crew

movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]
# movies.info() # information about new movie frame 
# movies.head() # return 1 st 5 entryies

In [83]:
# Returns if any attribute have any NULL value
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [84]:
# It drops all rows from the movies DataFrame that contain any missing (NaN) values, in-place
movies.dropna(inplace=True)

In [None]:
# Now checking duplicates in data or duplicate data
movies.duplicated().sum()

np.int64(0)

In [87]:
#[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'
#['Action','Adventure','FFantasy','SciFi']

def convert(obj):
    '''takes a stringified list of dictionaries (common in JSON-like columns in .csv files), 
    and extracts only the name field from each dictionary.'''
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

# Run this to see how it will look in string format
# ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

In [88]:
# Cast column :-
def convert3(obj):
    """In cast there is huge names (data) so we just consider first 
    3 actor names which is main actors to recommend movies by actors name"""
    L=[]
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
           L.append(i['name'])
           counter+=1
        else:
             break
    return L

In [89]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
           L.append(i['name'])
           break
    return L

In [90]:
# kept only the name of the genres removed id
movies['keywords'] = movies['keywords'].apply(convert)

# kept only the name of the genres removed id
movies['genres'] = movies['genres'].apply(convert)

# Only Top 3 Cast kept
movies['cast'] = movies['cast'].apply(convert3)

# movies['crew'][0] # Crew has many Values we extact only Director
movies['crew'] = movies['crew'].apply(fetch_director)

# movies['overview'][0].split() # Splits A Long Review into Words
movies['overview'] = movies['overview'].apply(lambda x:x.split())
# movies.head()['overview'] # Result

In [91]:
# Removing Spaces 
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","")for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","")for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","")for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","")for i in x])

In [92]:
# Now creat another column which is Tag
movies['tags'] = movies['overview']+ movies['genres']+ movies['keywords']+ movies['cast']+ movies['crew']
# movies.head() # To View new added Attribute

In [93]:
# now we don't need other columns so make new data frame
# All Data from Older Attributes Transfred to tags
filter_data = movies[['movie_id','title','tags']]

In [94]:
# Convert the Tags to String Again
filter_data['tags'] = filter_data['tags'].apply(lambda x:" ".join(x))
# filter_data.head() # View Output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_data['tags'] = filter_data['tags'].apply(lambda x:" ".join(x))


In [95]:
ps = PorterStemmer()

In [96]:
def stem(text):
    """['loved','loving','love'] it will give ['love','love','love']"""
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return" ".join(y)

In [97]:
filter_data['tags'] = filter_data['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_data['tags'] = filter_data['tags'].apply(stem)


In [98]:
filter_data['tags'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [99]:
# convert this into lowercase
filter_data['tags'] = filter_data['tags'].apply(lambda x:x.lower())
# filter_data.head() # View Output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_data['tags'] = filter_data['tags'].apply(lambda x:x.lower())


In [100]:
# Turning words/sentences into numbers — so that machine learning models can understand and process them.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [101]:
vectors = cv.fit_transform(filter_data['tags']).toarray()

In [104]:
# Calculating Similarity Vector using Cosine Simiarity techniqe
similarity = cosine_similarity(vectors)

In [105]:
# this how we calculated each movie's distance with each other with index
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:6]

[(1214, np.float64(0.28676966733820225)),
 (2405, np.float64(0.26901379342448517)),
 (3728, np.float64(0.2605130246476754)),
 (507, np.float64(0.255608593705383)),
 (539, np.float64(0.25038669783359574))]

In [106]:
# Movie Reccomender
def recommend(movie):
    movie_index = filter_data[filter_data['title'] == movie].index[0] 
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    
    for i in movies_list:
       # print(i[0]) it shows only index numbers so we use another print()
        print(filter_data.iloc[i[0]].title) # this is use for names
        
    return 

# Try Reccomding Movie
# recommend('Avatar') # Uncomment this

In [None]:
os.makedirs("dumped_data", exist_ok=True)

# Dump Calulated Files
pickle.dump(filter_data,open('./dumpted_data/movies.pkl','wb'))
pickle.dump(similarity,open('./dumpted_data/similarity.pkl','wb'))
pickle.dump(filter_data.to_dict(),open('./dumpted_data/movie_dict.pkl','wb'))