In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the movie table from its pickle, and the crew / cast / keyword tables
# from CSV with the movie `id` column used as the index.
# Alternative (disabled) movie source: pd.read_csv('./data/my data/movies_dataset.csv')
movies_df = pd.read_pickle('./data/my data/movies_dataset.pk')
crew_df = pd.read_csv(
    './data/my data/crew_dataset.csv',
    dtype={'id': np.int32, 'crew_id': np.int32},
    index_col='id',
)
cast_df = pd.read_csv(
    './data/my data/cast_dataset.csv',
    dtype={'id': np.int32, 'cast_id': np.int32},
    index_col='id',
)
keywords_df = pd.read_csv(
    './data/my data/keywords_dataset.csv',
    dtype={'id': np.int32},
    index_col=['id'],
)

In [4]:
from ast import literal_eval

In [5]:
# Join crew, cast and keywords onto the movie table, one after another, on `id`
# (default inner join, so only ids present in every table survive).
movies_df = (
    movies_df
    .merge(crew_df, on='id')
    .merge(cast_df, on='id')
    .merge(keywords_df, on='id')
)

In [6]:
def genre2sentence(item):
    """Encode a list of genre ids as the string form of a list of tokens.

    Every id becomes the token ``'genre_<id>'`` and the resulting list is
    returned as its ``str()`` representation (e.g. ``"['genre_53']"``), which
    ``ast.literal_eval`` can parse back into a list.

    Parameters
    ----------
    item : list
        Genre ids of one movie. Any non-list value is treated as "no genres".

    Returns
    -------
    str
        Always a string; ``'[]'`` when ``item`` is empty or not a list.
    """
    if not isinstance(item, list):
        # Return the *string* '[]' rather than a real list: the list branch
        # returns a string, and literal_eval (applied to this column later in
        # the notebook) raises ValueError when handed an actual list object.
        return '[]'
    return str(['genre_' + str(genre) for genre in item])
# Build the 'genres' column by running genre2sentence on every 'genre_ids' value.
movies_df['genres'] = movies_df['genre_ids'].apply(genre2sentence)

In [7]:
# Replace each value of the listed columns by the Python object that
# ast.literal_eval parses from it.
# NOTE(review): presumably every value in these columns is a string holding a
# Python literal at this point; literal_eval raises otherwise — confirm.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    movies_df[feature] = movies_df[feature].apply(literal_eval)

In [8]:
# Pick the director out of a movie's crew list; NaN when nobody qualifies.
def get_director(x):
    """Return the name of the first crew entry in the 'Directing' department.

    Parameters
    ----------
    x : iterable of dict
        Crew entries; each is indexed by the 'department' and 'name' keys.

    Returns
    -------
    The 'name' of the first entry whose 'department' equals 'Directing',
    or ``np.nan`` when no entry matches.
    """
    directing_names = (
        member['name'] for member in x if member['department'] == 'Directing'
    )
    return next(directing_names, np.nan)

In [9]:
# Returns the names of the first three entries, or of all entries when there are fewer.
def get_list(x):
    """Return at most the first three 'name' values of a list of dicts.

    Parameters
    ----------
    x : list of dict
        Entries indexed by the 'name' key. Anything that is not a list is
        treated as missing/malformed data.

    Returns
    -------
    list
        The 'name' of the first three entries (fewer if the list is shorter);
        an empty list for an empty or non-list input.
    """
    # Missing/malformed data yields an empty list.
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep the leading three.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

In [10]:
# Derive the 'director' column from each movie's crew list, then replace the
# 'cast' and 'keywords' columns by the output of get_list.
movies_df['director'] = movies_df['crew'].apply(get_director)

features = ['cast', 'keywords',]
for feature in features:
    movies_df[feature] = movies_df[feature].apply(get_list)
    # Progress marker: print the column name just processed, space-separated on one line.
    print(feature,end=' ')

cast keywords 

In [11]:
# Normalise names into single lower-case tokens without spaces.
def clean_data(x):
    """Lower-case text and remove its spaces.

    Parameters
    ----------
    x : list of str, str, or anything else

    Returns
    -------
    list of str
        For a list: every element with spaces removed and lower-cased.
    str
        For a string: the string with spaces removed and lower-cased.
        For any other value (e.g. a missing director): the empty string.
    """
    if isinstance(x, list):
        return [entry.replace(" ", "").lower() for entry in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: treat as missing.
    return ''

# Run clean_data over every value of the four text feature columns,
# overwriting each column in place.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movies_df[feature] = movies_df[feature].apply(clean_data)

In [12]:
def create_soup(x):
    """Concatenate one row's text features into a single space-separated string.

    Parameters
    ----------
    x : mapping
        Indexed by 'keywords', 'cast' and 'genres' (iterables of str) and
        'director' (str).

    Returns
    -------
    str
        Keywords, cast, director and genres, in that order, separated by
        single spaces. An empty group still contributes its separator.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)
# Build the 'soup' column row by row (axis=1 passes each row to create_soup).
movies_df['soup'] = movies_df.apply(create_soup, axis=1)

In [13]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Fit a token-count vectorizer (English stop words removed) on the 'soup'
# strings and encode every movie as one row of token counts.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies_df['soup'])

In [14]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of every row of count_matrix against every other row.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
# Give the frame a fresh default index (the old index becomes a column), then
# build a Series mapping each title to its row position.
# NOTE(review): reset_index() without drop=True is not idempotent — re-running
# this cell keeps adding index columns; confirm the extra column is wanted.
movies_df = movies_df.reset_index()
indices = pd.Series(movies_df.index, index=movies_df['title'])

In [20]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim):
    """Return the titles of up to 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` Series.
        An unknown title raises ``KeyError`` from that lookup.
    cosine_sim : 2-D array-like
        Either a full movie-by-movie similarity matrix (the row belonging to
        ``title`` is used), or a single-row matrix that already holds the
        scores of ``title`` against every movie.

    Returns
    -------
    pandas.Series
        Titles taken from ``movies_df['title']``, most similar first. The
        queried movie itself is never part of the result.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # Duplicate titles make the lookup return a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of all movies with that movie.
    # A one-row matrix is the query movie scored against the corpus; in a full
    # matrix the scores live in the row of the requested movie, not in row 0.
    scores_row = cosine_sim[0] if len(cosine_sim) == 1 else cosine_sim[idx]

    # Drop the movie itself explicitly instead of assuming it sorts first
    # (another movie with identical features would tie with it).
    sim_scores = [
        (position, score)
        for position, score in enumerate(scores_row)
        if position != idx
    ]

    # Sort the movies based on the similarity scores (stable, highest first)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the indices of the 10 most similar movies
    movie_indices = [position for position, _ in sim_scores[:10]]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [16]:
# Show every row whose title contains the substring 'Royal Bengal'.
movies_df[ movies_df['title'].str.contains('Royal Bengal') ]

Unnamed: 0,index,genre_ids,id,original_language,overview,popularity,release_date,title,vote_average,vote_count,crew,cast,keywords,genres,director,soup
8594,8594,[],262918,bn,Abhi is a typical Bengali guy who has a happy ...,1.282,2014-01-31,The Royal Bengal Tiger,6.7,3,[],[],[],[],,
8638,8638,[53],206793,bn,"Feluda alias Pradosh Mitra, a private detectiv...",1.533,2011-12-15,Royal Bengal Rahasya,7.2,12,"[{'crew_id': 12160, 'name': 'Satyajit Ray', 'd...","[sabyasachichakraborty, sahebbhattacharjee, bi...",[],[genre_53],sandipray,sabyasachichakraborty sahebbhattacharjee bibh...


In [18]:
# Inspect the 'soup' string of the row at position 8638.
movies_df.iloc[8638]['soup']

' sabyasachichakraborty sahebbhattacharjee bibhubhattacharya sandipray genre_53'

In [21]:
# Put the soup of the row at position 8638 into a one-row frame, vectorize it
# with the already-fitted `count`, and score it against every row of count_matrix.
# NOTE(review): 8638 is a hard-coded row position; it has to be the row of the
# title passed to get_recommendations below — re-check whenever the data changes.
sub_df = pd.DataFrame([[movies_df.iloc[8638]['soup']]], columns=['soup'])
# sub_df
# Bare expression that is not the cell's last line, so it displays nothing.
sub_df['soup']
# transform (not fit_transform) reuses the vocabulary already learned by `count`.
sub_mat = count.transform( sub_df['soup'] )
sub_cosine_sim2 = cosine_similarity(sub_mat, count_matrix)
get_recommendations('Royal Bengal Rahasya', sub_cosine_sim2)

8454      Gorosthane Sabdhan
8553     Kailashey Kelenkari
8552       Tintorettor Jishu
8676       Bombaiyer Bombete
8773           Double Feluda
8691    Golapi Mukta Rahasya
8777          Baksha Rahasya
8545      Gosainpur Sargaram
8758       Mayurkanthi Jelly
8705          Badshahi Angti
Name: title, dtype: object

In [24]:
# Print the shape of the full count matrix next to that of the single-movie matrix.
print(count_matrix.shape, sub_mat.shape)

(9407, 18551) (1, 18551)


In [31]:
import pickle

In [None]:
def pickle_read(fileName):
    """Load and return the object pickled in the file at ``fileName``.

    The file is opened in binary mode and closed again before returning.
    Errors from opening or unpickling propagate to the caller.

    NOTE: unpickling can execute arbitrary code; only use this on files from
    a trusted source.
    """
    with open(fileName, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

def pickle_write(fileName, obj):
    """Pickle ``obj`` into the file at ``fileName`` on a best-effort basis.

    Parameters
    ----------
    fileName : path-like
        Destination file; opened in binary write mode, so it is created or
        truncated before pickling starts. Errors from opening it propagate.
    obj : object
        Object to serialise.

    A failure while pickling is reported on stdout instead of being raised;
    the (possibly incomplete) file is left in place in that case.
    """
    with open(fileName, 'wb') as file_opened:
        try:
            pickle.dump(obj, file_opened)
        except Exception as exc:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop the run, and include
            # the cause so the failure can actually be diagnosed.
            print('unable to write', fileName, repr(exc))

In [None]:
def save_as_pickle():
    """Write the notebook's main artefacts to './processed/obj_v2/' via pickle_write.

    Saved, in this order: the count matrix, the full cosine-similarity matrix,
    the movie DataFrame and the title lookup Series. All four are read from
    module-level variables; nothing is returned.
    """
    pickle_write('./processed/obj_v2/description_matrix.pk', count_matrix)
    pickle_write('./processed/obj_v2/cosine_sim.pk', cosine_sim2)
    pickle_write('./processed/obj_v2/movies_df.pk', movies_df)
    pickle_write('./processed/obj_v2/indices.pk', indices)

In [28]:
# Display the count matrix's repr as the cell output.
count_matrix

<9407x18551 sparse matrix of type '<class 'numpy.int64'>'
	with 79400 stored elements in Compressed Sparse Row format>

In [32]:
# Pickle the count matrix to 'count_matrix.pk' (relative to the working
# directory); any error while opening or dumping is raised, not swallowed.
with open('count_matrix.pk', 'wb') as file_opened:
    pickle.dump(count_matrix, file_opened)