### 1. Import necessary packages

In [1]:
import numpy as np
import pandas as pd
#import plotly

### 2. Reading the Data files

In [None]:
# Load the credits table; its 'cast' and 'crew' columns are parsed in section 3.1.
credits_df = pd.read_csv(r'data/credits.csv')
credits_df.head()

In [None]:
# Load the keywords table; its 'keywords' column is parsed in section 3.2.
keywords_df = pd.read_csv(r'data/keywords.csv')
keywords_df.head()

In [None]:
# Load the links table.
# NOTE(review): links_df is not referenced again in the visible part of this notebook.
links_df = pd.read_csv(r'data/links.csv')
links_df.head()

In [None]:
# Load the movie metadata table; it is trimmed and cleaned in section 3.4.
movies_metadata_df = pd.read_csv(r'data/movies_metadata.csv')
movies_metadata_df.head()

In [None]:
# Load the individual ratings; they are averaged per movie in section 3.3.
ratings_df = pd.read_csv(r'data/ratings.csv')
ratings_df.head()

### 3. Exploratory Data Analysis

#### 3.1 Restructuring Credits DataFrame

In [None]:
# Column overview of the credits table before restructuring.
credits_df.info()

In [None]:
# Inspect the raw 'cast' value of the first row.
credits_df.loc[0,'cast']

In [None]:
import ast
# Parse the first row's 'cast' string into a Python object; `a` is a scratch variable.
a = ast.literal_eval(credits_df.loc[0,'cast'])

In [None]:
# 'name' field of the first parsed cast entry.
a[0]['name']

In [None]:
def extract_cast_names(cast_list, max_names=5):
    ''' Return the names of the first `max_names` cast members.

        Parameters
        ----------
        cast_list : str
            String holding a Python-literal list of dictionaries of cast details;
            every dictionary that is read must have a 'name' key.
        max_names : int, default 5
            Maximum number of names to return (expected to be non-negative). The
            default keeps the historical behaviour of this helper, which has always
            returned up to five names although its old docstring said four.

        Returns
        -------
        list of str, or the string 'NA' when the parsed list is empty. '''

    #converting to list from string
    cast_list = ast.literal_eval(cast_list)

    if len(cast_list) == 0:
        return 'NA'

    # Slicing copes with lists shorter than max_names without any index bookkeeping.
    return [member['name'] for member in cast_list[:max_names]]

In [None]:
# Sanity-check the helper on the first row.
extract_cast_names(credits_df.loc[0,'cast'])

In [None]:
# Apply the helper to every row and store the result in a new 'Cast_Names' column.
credits_df['Cast_Names'] = credits_df['cast'].map(extract_cast_names)
credits_df.head()

In [None]:
# Inspect the raw 'crew' value of the first row.
credits_df.loc[0,'crew']

In [None]:
# Parse the first row's 'crew' string and look at its first entry (reuses the scratch name `a`).
a = ast.literal_eval(credits_df.loc[0,'crew'])
a[0]

In [None]:
def extract_director_name(crew_list):
    ''' Look up the director in a stringified list of crew dictionaries.

        Returns 'NA' when the parsed list is empty, the 'name' of the first entry
        whose 'job' is 'Director' otherwise, and None when no entry has that job. '''

    # the crew details arrive as a string, so parse them first
    crew_members = ast.literal_eval(crew_list)

    if len(crew_members) == 0:
        return 'NA'

    # first match wins; fall back to None when nobody is credited as director
    return next(
        (member['name'] for member in crew_members if member['job'] == 'Director'),
        None,
    )
            

In [None]:
# Spot-check the helper on row 661.
extract_director_name(credits_df.loc[661,'crew'])

In [None]:
# Derive a 'Director' column from 'crew'.
credits_df['Director'] = credits_df['crew'].map(extract_director_name)
credits_df.head()

In [None]:
# Remove the raw string columns now that 'Cast_Names' and 'Director' have been derived.
# NOTE: the in-place drop makes this cell non-idempotent; running it twice raises KeyError.
credits_df.drop(columns=['cast','crew'], inplace=True)
credits_df.info()

In [None]:
# Rows for which extract_director_name returned None (crew listed, but no 'Director' job).
credits_df[credits_df['Director'].isnull()]

#### 3.2 Restructuring Keywords DataFrame

In [None]:
# Preview the keywords table.
keywords_df.head()

In [None]:
# Inspect the raw 'keywords' value of the first row.
keywords_df.loc[0, 'keywords']

In [None]:
def extract_keywords(keyword_list):
    '''Collect the 'name' of every keyword in a stringified list of keyword dictionaries.
        An empty parsed list yields the string 'NA' instead of a list.'''

    # the keywords arrive as a string, so parse them first
    parsed_keywords = ast.literal_eval(keyword_list)

    if len(parsed_keywords) == 0:
        return 'NA'

    return [entry['name'] for entry in parsed_keywords]

In [None]:
# Sanity-check the helper on the first row.
extract_keywords(keywords_df.loc[0, 'keywords'])

In [None]:
# Parsed result goes into a new capitalised 'Keywords' column; the raw 'keywords' column is kept for now.
keywords_df['Keywords'] = keywords_df['keywords'].map(extract_keywords)
keywords_df.head()

In [None]:
# Remove the spaces inside every keyword so each keyword becomes a single token.
# Rows holding the 'NA' sentinel string are passed through unchanged: iterating over
# the string would otherwise split it into the list ['N', 'A'].
keywords_df['Keywords'] = keywords_df['Keywords'].apply(
    lambda x: x if isinstance(x, str) else [i.replace(" ",'') for i in x])

In [None]:
# Count the rows whose 'Keywords' value equals the 'NA' sentinel.
len(keywords_df[keywords_df['Keywords'] == 'NA'])

In [None]:
# Keep only the movie id and the parsed keywords.
keywords_df = keywords_df[['id', 'Keywords']]

In [None]:
# Processed keywords of the first row.
keywords_df.loc[0,'Keywords']

#### 3.3 Restructuring Ratings DataFrame

In [None]:
# Preview the raw ratings.
ratings_df.head()

In [None]:
# Column types and memory usage of the ratings table.
ratings_df.info()

In [None]:
# Average rating per movie. 'rating' is selected before aggregating so that the mean
# is not computed for every other column only to be thrown away.
new_ratings_df = ratings_df.groupby(by='movieId')[['rating']].mean()
new_ratings_df.info()

In [None]:
# Turn the 'movieId' group index back into a regular column.
new_ratings_df.reset_index(inplace=True)
new_ratings_df.info()

In [None]:
new_ratings_df.head()

#### 3.4 Exploring Metadata DataFrame

In [None]:
# Preview the movie metadata.
movies_metadata_df.head()

In [None]:
# Column types and non-null counts of the metadata table.
movies_metadata_df.info()

In [None]:
# Dropping columns having >50% null values
# The share of nulls is computed from the frame itself rather than from a
# hard-coded row count, so the threshold stays correct if the data changes.
null_fraction = movies_metadata_df.isnull().mean()
cols_to_drop = null_fraction[null_fraction > 0.5].index.tolist()

cols_to_drop

In [None]:
# Add further columns to the drop list.
# NOTE: `+=` extends the list in place, so re-running this cell appends the names again.
cols_to_drop += ['adult','budget','poster_path','production_companies','production_countries','runtime','revenue','status','original_title','video',
                 'vote_average', 'vote_count']
cols_to_drop

In [None]:
# drop() returns a new frame here, so movies_metadata_df itself is left untouched.
new_metadata_df = movies_metadata_df.drop(columns=cols_to_drop)
new_metadata_df.info()

In [None]:
# Searching for inconsistencies in the 'id' column
import re

# Show the rows whose 'id' does not consist solely of digits.
new_metadata_df[new_metadata_df['id'].apply(lambda x : bool(re.match(r'^[0-9]+$', x))) == False]

In [None]:
# Dropping the found inconsistencies
# The offending rows are selected with the digits-only check instead of hard-coded
# row labels, so the cell keeps working when the data changes or is run again.
valid_id_mask = new_metadata_df['id'].apply(lambda x: bool(re.match(r'^[0-9]+$', str(x))))
new_metadata_df.drop(index=new_metadata_df[~valid_id_mask].index, inplace=True)

In [None]:
# Changing the data type of 'id' to int
new_metadata_df['id'] = new_metadata_df['id'].astype(int)

In [None]:
# Changing the data type of 'imdb_id' to string
# NOTE(review): astype(str) presumably turns missing values into the literal string 'nan',
# which would hide them from later isnull() checks — confirm this is intended.
new_metadata_df['imdb_id'] = new_metadata_df['imdb_id'].astype(str)

#### 3.5 Combining all DataFrames

In [None]:
# Review both frames before joining them.
credits_df.info()

In [None]:
new_metadata_df.info()

In [None]:
# Inner join on 'id': only movies present in both frames are kept.
final_data = pd.merge(new_metadata_df, credits_df, on='id', how='inner')
final_data.info()

In [None]:
# Inner join with the keywords on 'id' as well.
final_data = pd.merge(final_data, keywords_df, on='id', how='inner')
final_data.info()

In [None]:
# Keep only the columns used from here on, in a fixed order.
# .copy() makes final_data an independent frame, so the in-place drops and column
# assignments in the cleaning cells do not operate on a selection of the merged
# frame (which pandas reports with SettingWithCopyWarning).
final_data = final_data[['id', 'imdb_id', 'title','genres', 'overview', 'original_language',
                         'Director','Cast_Names','Keywords','release_date']].copy()
final_data.head()

#### 3.6 Cleaning Final Data

##### i. Removing NULL samples

In [None]:
# Count the null values in every column
final_data.isnull().sum()

In [None]:
# Dropping rows with a NULL 'overview'
final_data.drop(index = final_data[final_data['overview'].isnull()==True].index, inplace=True)

In [None]:
final_data.isnull().sum()

In [None]:
# Dropping rows with a NULL 'Director' (None was returned when no director was found in the crew)
final_data.drop(index = final_data[final_data['Director'].isnull()==True].index, inplace=True)
final_data.isnull().sum()

In [None]:
# Dropping rows with a NULL 'release_date'
final_data.drop(index = final_data[final_data['release_date'].isnull()==True].index, inplace=True)
final_data.isnull().sum()

In [None]:
# Dropping rows with a NULL 'original_language'
final_data.drop(index = final_data[final_data['original_language'].isnull()==True].index, inplace=True)
final_data.isnull().sum()

##### ii. Cleaning Duplicate values 

In [None]:
# Number of rows whose 'id' already appeared in an earlier row.
final_data['id'].duplicated().sum()

In [None]:
# Dropping duplicated samples from the 'id' column (the first occurrence is kept)
final_data.drop(index = final_data[final_data['id'].duplicated()==True].index, inplace=True)
final_data['id'].duplicated().sum()

In [None]:
# Number of rows whose 'imdb_id' already appeared in an earlier row.
final_data['imdb_id'].duplicated().sum()

In [None]:
# Dropping duplicated samples from the 'imdb_id' column (the first occurrence is kept)
final_data.drop(index = final_data[final_data['imdb_id'].duplicated()==True].index, inplace=True)
# Verify the column that was just de-duplicated (the check used to look at 'id' by mistake).
final_data['imdb_id'].duplicated().sum()

In [None]:
# Checking for duplicate titles
final_data['title'].duplicated().sum()

In [None]:
# Look at one duplicated title in detail.
final_data[final_data['title']=='Sabrina']

In [None]:
# Dropping duplicates from the 'title' column by keeping the first occurrence.
# NOTE(review): different movies that share a title are removed here as well — confirm this is acceptable.
final_data.drop_duplicates(['title'], keep='first',inplace=True)

##### Some more cleaning

In [None]:
# Extracting list of genres
def extract_genres(genre_list):
    ''' Turn a stringified list of genre dictionaries into a list of genre names.
        The string 'NA' is returned when the parsed list has no entries. '''

    # the genre details arrive as a string, so parse them first
    parsed_genres = ast.literal_eval(genre_list)

    return [genre['name'] for genre in parsed_genres] if len(parsed_genres) > 0 else 'NA'

In [None]:
# Replace the raw 'genres' strings with the parsed lists of genre names.
# NOTE: this overwrites the column, so the cell cannot be re-run on its own output.
final_data['genres'] = final_data['genres'].map(extract_genres)
final_data.head()

In [None]:
# Viewing the distribution of 'original_language'
final_data['original_language'].value_counts()

In [None]:
# Keeping only movies whose original language is English, French, Japanese or Italian.
# .copy() makes the filtered result an independent frame, so the in-place drops and
# column assignments that follow do not act on a slice of the unfiltered frame
# (which pandas reports with SettingWithCopyWarning).
final_data = final_data[final_data['original_language'].isin(['en','fr','ja','it'])].copy()

In [None]:
# Remove the rows whose parsed 'genres' is the 'NA' sentinel.
final_data.drop(index=final_data[final_data['genres'] == 'NA'].index, inplace=True)

In [None]:
# Remove the rows whose 'Director' is the 'NA' sentinel (empty crew list).
final_data.drop(index=final_data[final_data['Director'] == 'NA'].index, inplace=True)

In [None]:
# Remove the rows whose 'Cast_Names' is the 'NA' sentinel (empty cast list).
final_data.drop(index=final_data[final_data['Cast_Names'] == 'NA'].index, inplace=True)

In [None]:
# The 'Keywords' column is removed from final_data from this point on.
final_data.drop(columns=['Keywords'], inplace=True)

In [None]:
# Changing the data type of 'release_date' to datetime
# errors='coerce' turns values that do not match the format into NaT instead of raising.
final_data['release_date'] = pd.to_datetime(final_data['release_date'], format='%Y-%m-%d', errors='coerce')

In [None]:
# Making sure every 'overview' value is a string
final_data['overview'] = final_data['overview'].astype(str)
final_data.head()

In [None]:
final_data.info()

### 4. Preprocessing

In [None]:
final_data.head()

In [None]:
# Make sure both columns hold plain strings before the character-level processing below.
final_data['Director'] = final_data['Director'].astype(str)
final_data['original_language'] = final_data['original_language'].astype(str)

In [None]:
final_data.head()

In [None]:
# Removing white spaces from strings

# 'Cast_Names' and 'genres' hold lists of strings: spaces are removed inside every element.
final_data['Cast_Names'] = final_data['Cast_Names'].apply(lambda x: [i.replace(" ",'') for i in x])
final_data['genres'] = final_data['genres'].apply(lambda x: [i.replace(" ",'') for i in x])
# 'original_language' and 'Director' hold plain strings, so iterating yields single
# characters: each value becomes a list of characters in which spaces are replaced by ''.
final_data['original_language'] = final_data['original_language'].apply(lambda x: [i.replace(" ",'') for i in x])
final_data['Director'] = final_data['Director'].apply(lambda x: [i.replace(" ",'') for i in x])

final_data.head()

In [None]:
# Join the character lists back into one string and wrap it in a single-element list,
# so that these columns are lists like 'genres' and 'Cast_Names'.
final_data['Director'] = final_data['Director'].apply(lambda x: [''.join(x)])
final_data['original_language'] = final_data['original_language'].apply(lambda x: [''.join(x)])
final_data.head()

In [None]:
# Creating tags column
# The tags are the concatenation of the list-valued columns. 'Keywords' is not part of
# them: that column was dropped from final_data during cleaning, and referencing it
# here raised a KeyError.
final_data['tags'] = final_data['genres'] + final_data['Cast_Names'] + final_data['Director'] + final_data['original_language']
final_data.head()

In [None]:
# Columns needed by the model-building section.
ml_model_data = final_data[['id', 'imdb_id','title', 'tags','overview','release_date']]
ml_model_data.head()

In [None]:
# Exporting the final dataframe as csv
# The index is written as well; it comes back as the 'Unnamed: 0' column when the file is read again.
ml_model_data.to_csv('ml_model_data_v1.csv')

### 5. Building basic nlp based model

In [None]:
# Reload the exported data; the list-valued 'tags' column comes back as a string
# and is parsed again further down.
ml_model_data = pd.read_csv('ml_model_data_v1.csv')

In [None]:
ml_model_data.head()

In [None]:
# Remove the index column that was written by to_csv.
ml_model_data.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
ml_model_data.head()

In [None]:
# Changing 'tags' and 'overview' to lower case
ml_model_data['tags'] = ml_model_data['tags'].apply(lambda x: x.lower())
ml_model_data['overview'] = ml_model_data['overview'].apply(lambda x: x.lower())

In [None]:
ml_model_data.head()

In [None]:
# Applying stemming
from nltk import PorterStemmer
# Shared stemmer instance used by the stemming() helper.
ps = PorterStemmer()

In [None]:
def stemming(txt):
    ''' Stem every whitespace-separated word of `txt` with the notebook-level
        stemmer `ps` and return the stems joined by single spaces. '''
    return ' '.join(ps.stem(token) for token in txt.split())

In [None]:
# Stem every word of the overview text.
ml_model_data['overview'] = ml_model_data['overview'].apply(stemming)

In [None]:
ml_model_data.head()

In [None]:
# Split the stemmed overview into a list of words.
ml_model_data['overview'] = ml_model_data['overview'].apply(lambda x: x.split())

In [None]:
import ast
# 'tags' was read from the csv as a string; parse it back into a Python object.
ml_model_data['tags'] = ml_model_data['tags'].apply(lambda x: ast.literal_eval(x))

In [None]:
# Confirm the type of a parsed 'tags' value.
type(ml_model_data.loc[0,'tags'])

In [None]:
# Append the overview words to the tags (element-wise concatenation of the two columns).
ml_model_data['tags'] = ml_model_data['tags'] + ml_model_data['overview']
ml_model_data.head()

In [None]:
ml_model_data.loc[0,'tags']

In [None]:
# Collapse each list of tags into one space-separated string for the vectorizer.
ml_model_data['tags'] = ml_model_data['tags'].apply(lambda x: " ".join(x))

In [None]:
ml_model_data.head()

In [None]:
# Creating a term-frequency matrix with at most 5,000 tokens

from sklearn.feature_extraction.text import CountVectorizer

# English stop words are excluded from the vocabulary.
cv = CountVectorizer(max_features=5000, stop_words='english')
# toarray() converts the fitted result into a dense array (one row per movie).
vectors = cv.fit_transform(ml_model_data['tags']).toarray()

In [None]:
# Show the vocabulary learned by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old method on older releases.
cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out') else cv.get_feature_names()

In [None]:
# Calculating the pairwise cosine similarity between the rows of the TF matrix
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(vectors)

In [None]:
# Check the dimensions of the similarity matrix.
similarity_matrix.shape

In [None]:
# The main function to make recommendations

def recommend_movies(movie_name, top_n=10):
    ''' Return the titles of the movies most similar to `movie_name`.

    Parameters
    ----------
    movie_name : str
        Title to look up; it must match a value of ml_model_data['title'] exactly.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of str, or str
        Up to `top_n` titles ordered from most to least similar, or the message
        "Movie not found in database!" when the title is not in the data.
    '''
    titles = ml_model_data['title'].tolist()

    if movie_name not in titles:
        return "Movie not found in database!"

    # Row position (not index label) of the first movie with this title; the rows of
    # the similarity matrix follow the row order of ml_model_data.
    movie_index = titles.index(movie_name)

    # similarity_matrix is the matrix computed before this function is defined; the
    # compressed copy is only created in the export section, so relying on it here
    # failed with NameError on a fresh top-to-bottom run.
    similarity_scores = similarity_matrix[movie_index]

    # Exclude the query movie by position rather than assuming it sorts first:
    # another movie with an identical score could otherwise take its place.
    ranked = sorted(
        ((idx, score) for idx, score in enumerate(similarity_scores) if idx != movie_index),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Slicing copes with data sets that contain fewer than top_n other movies.
    return [titles[idx] for idx, _ in ranked[:top_n]]
    

In [None]:
# Example lookup.
recommend_movies('Iron Man')

### 6. Exporting data

In [None]:
type(similarity_matrix)

In [None]:
# Compare the memory footprint of the matrix with that of a float16 copy.
print(similarity_matrix.nbytes)
similarity_matrix_comp = similarity_matrix.astype('float16')
similarity_matrix_comp.nbytes

In [16]:
import pickle

In [None]:
# Export the movie data without the 'overview' column as a pickled dictionary.
final_data = ml_model_data.drop(columns=['overview'])
# The context manager closes the file even when pickling fails.
with open('final_movie_data_dict.pkl','wb') as movie_data_file:
    pickle.dump(final_data.to_dict(), movie_data_file)

In [None]:
# Export the float16 similarity matrix; the context manager closes the file even
# when pickling fails.
with open('similarity_matrix_comp.pkl','wb') as matrix_file:
    pickle.dump(similarity_matrix_comp, matrix_file, protocol=4)

In [17]:
# Read the exported matrix back. The path is relative to the working directory (the
# same file name the export cell writes) instead of a machine-specific absolute path.
# NOTE: pickle.load can execute arbitrary code; only load files created by this notebook.
with open('similarity_matrix_comp.pkl', 'rb') as matrix_file:
    similarity_matrix = pickle.load(matrix_file)

In [21]:
# First 5 rows and columns of the matrix, used as a small sample further down.
# NOTE(review): the name `copy` would shadow the standard-library module if that were imported.
copy = similarity_matrix[0:5,0:5]

In [2]:
import pymongo
import pymongoarrow
from pymongo import MongoClient

In [5]:
# Connect to the MongoDB server on localhost, port 27017.
client = MongoClient('localhost',27017)

# Use a test database named 'test_database' and a test collection named 'test_collection'.
db = client.test_database
col = db.test_collection

In [6]:
# Insert a few documents into the test_database.test_collection collection.
# NOTE(review): the documents carry fixed '_id' values, so re-running this cell against
# the same collection presumably fails with a duplicate-key error — confirm.
from datetime import datetime

col.insert_many([
{'_id': 1, 'measure': 43, 'status':'active', 'installed_on': datetime(2022, 1, 8, 3, 43, 12)},
{'_id': 2, 'measure': 32, 'status':'active', 'installed_on': datetime(2022, 2, 2, 11, 43, 27)},
{'_id': 3, 'measure': 62, 'status':'inactive', 'installed_on': datetime(2022, 3, 12, 3, 53, 12)},
{'_id': 4, 'measure': 59, 'status':'active', 'installed_on': datetime(2022, 4, 8, 3, 22, 45)}
])

<pymongo.results.InsertManyResult at 0x163b707a948>

In [7]:
# Verify that the data has been written: print every document of the collection.
import pprint
for doc in col.find({}):
   pprint.pprint(doc)

{'_id': 1,
 'installed_on': datetime.datetime(2022, 1, 8, 3, 43, 12),
 'measure': 43,
 'status': 'active'}
{'_id': 2,
 'installed_on': datetime.datetime(2022, 2, 2, 11, 43, 27),
 'measure': 32,
 'status': 'active'}
{'_id': 3,
 'installed_on': datetime.datetime(2022, 3, 12, 3, 53, 12),
 'measure': 62,
 'status': 'inactive'}
{'_id': 4,
 'installed_on': datetime.datetime(2022, 4, 8, 3, 22, 45),
 'measure': 59,
 'status': 'active'}


In [8]:
# patch_all() adds PyMongoArrow's functions (find_pandas_all, find_numpy_all, ...) to
# PyMongo's Collection instances.
from pymongoarrow.monkey import patch_all
patch_all()

In [10]:
# Export the documents whose 'measure' is greater than 40 into a pandas DataFrame.
df = col.find_pandas_all({'measure': {'$gt': 40}}) # {'measure': {'$gt': 40}}
df

Unnamed: 0,_id,measure,status,installed_on
0,1,43,active,2022-01-08 03:43:12
1,3,62,inactive,2022-03-12 03:53:12
2,4,59,active,2022-04-08 03:22:45


In [11]:
# Export all documents with PyMongoArrow's find_numpy_all(); an empty filter matches every document.
np_array = col.find_numpy_all({})
np_array

{'_id': array([1, 2, 3, 4]),
 'measure': array([43, 32, 62, 59]),
 'status': array(['active', 'active', 'inactive', 'active'], dtype='<U8'),
 'installed_on': array(['2022-01-08T03:43:12.000', '2022-02-02T11:43:27.000',
        '2022-03-12T03:53:12.000', '2022-04-08T03:22:45.000'],
       dtype='datetime64[ms]')}

In [15]:
# Despite its name, np_array is a dict mapping each field name to a NumPy array.
type(np_array)

dict

#### Importing Data from Pandas DataFrame into MongoDB
Importing data from a Pandas DataFrame is straightforward with PyMongoArrow’s write() function. The write(collection, tabular) function takes two arguments:

collection – The collection into which you want to write the data.
tabular – The data to write. It can be a pandas DataFrame, a NumPy ndarray, or an Arrow Table. The function returns an instance of result.ArrowWriteResult.

Let’s import the ‘write’ function and invoke it. We will pass it two arguments: the collection we want to write to and the DataFrame we want to write to MongoDB. We will reuse the ‘df’ DataFrame we created in the previous example.

In [12]:
from pymongoarrow.api import write
# Write the rows of df into the 'pandas_data' collection of the test database.
write(db.pandas_data, df)

{'insertedCount': 3}

In [14]:
# Print every document of the 'pandas_data' collection to verify the write.
for doc in db.pandas_data.find({}):
    pprint.pprint(doc)

{'_id': 1,
 'installed_on': datetime.datetime(2022, 1, 8, 3, 43, 12),
 'measure': 43,
 'status': 'active'}
{'_id': 3,
 'installed_on': datetime.datetime(2022, 3, 12, 3, 53, 12),
 'measure': 62,
 'status': 'inactive'}
{'_id': 4,
 'installed_on': datetime.datetime(2022, 4, 8, 3, 22, 45),
 'measure': 59,
 'status': 'active'}


In [42]:
# Importing data from a NumPy array into MongoDB
# The 5x5 sample of the similarity matrix is wrapped in a DataFrame first.
copy_df = pd.DataFrame(copy)
copy_df


Unnamed: 0,0,1,2,3,4
0,1.0,0.09491,0.05014,0.063293,0.150391
1,0.09491,1.0,0.06604,0.027771,0.022018
2,0.05014,0.06604,1.0,0.088074,0.046509
3,0.063293,0.027771,0.088074,1.0,0.058685
4,0.150391,0.022018,0.046509,0.058685,1.0


In [43]:
# Convert the integer column labels to strings.
# NOTE(review): presumably needed because MongoDB field names must be strings — confirm.
copy_df.columns = copy_df.columns.astype('str')

In [48]:
# Cast the values to float64 and confirm the resulting dtypes.
copy_df = copy_df.astype('float64')
copy_df.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
dtype: object

In [49]:
# Write the 5x5 sample into the 'copy_data' collection (one document per row).
write(db.copy_data, copy_df)

{'insertedCount': 5}

In [50]:
# Print every document of the 'copy_data' collection to verify the write.
for doc in db.copy_data.find({}):
    pprint.pprint(doc)

{'0': 1.0,
 '1': 0.09490966796875,
 '2': 0.050140380859375,
 '3': 0.06329345703125,
 '4': 0.150390625,
 '_id': ObjectId('63d9507532f6a7c10981a244')}
{'0': 0.09490966796875,
 '1': 1.0,
 '2': 0.0660400390625,
 '3': 0.02777099609375,
 '4': 0.0220184326171875,
 '_id': ObjectId('63d9507532f6a7c10981a245')}
{'0': 0.050140380859375,
 '1': 0.0660400390625,
 '2': 1.0,
 '3': 0.08807373046875,
 '4': 0.0465087890625,
 '_id': ObjectId('63d9507532f6a7c10981a246')}
{'0': 0.06329345703125,
 '1': 0.02777099609375,
 '2': 0.08807373046875,
 '3': 1.0,
 '4': 0.058685302734375,
 '_id': ObjectId('63d9507532f6a7c10981a247')}
{'0': 0.150390625,
 '1': 0.0220184326171875,
 '2': 0.0465087890625,
 '3': 0.058685302734375,
 '4': 1.0,
 '_id': ObjectId('63d9507532f6a7c10981a248')}
