In [None]:
# Importing Necessary Packages:

import ast
import pickle
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
# NOTE: `google.colab` is provided by the Colab runtime; this cell is expected
# to fail when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# TF-IDF vectorizer shared at module level; compute_similarity_matrix refits it
# (fit_transform) on every call. English stop words are removed.
tfidf = TfidfVectorizer(stop_words="english")

In [None]:
# Load the raw datasets from the mounted Google Drive folder.
# `credits` supplies the `id` and `cast` columns used by data_preprocessing;
# `movie_metadata` supplies `id`, `original_title`, `overview` and `genres`.

credits = pd.read_csv('/content/drive/MyDrive/USML_Data/credits.csv')
movie_metadata = pd.read_csv('/content/drive/MyDrive/USML_Data/movies_metadata.csv')

In [None]:
# Function to change dtype of id from string to int:
def change_type_id(x):
    """Convert a raw ``id`` value to ``int``, or return ``None`` if it cannot be converted.

    Parameters
    ----------
    x : object
        Raw value, typically a numeric string such as ``"862"``.

    Returns
    -------
    int or None
        ``int(x)`` on success; ``None`` whenever ``int`` rejects the value.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None / non-scalar values;
        # OverflowError: infinite floats. All of them mean "no usable id".
        return None

In [None]:
# Function to extract the names (e.g. genres) from a list of records:
def extract_info2(x):
    """Return the ``"name"`` of every record in ``x`` as one comma-separated string.

    Parameters
    ----------
    x : str or list
        A list of dicts, or the text of a Python literal for such a list,
        e.g. ``"[{'id': 16, 'name': 'Animation'}]"``.

    Returns
    -------
    str
        The names joined with ``", "``; ``""`` when ``x`` is missing
        (``None`` / NaN) or contains no records.

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a valid Python literal.
    KeyError
        If a record has no ``"name"`` key.
    """
    # A missing cell (None / NaN) carries no records; NaN is the only float != itself.
    if x is None or (isinstance(x, float) and x != x):
        return ""

    # literal_eval accepts literals only, so text read from a CSV cannot run
    # arbitrary code the way eval() would.
    records = ast.literal_eval(str(x))
    return ", ".join(record["name"] for record in records)

In [None]:
# Extract Actor Names and Movie Characters:

def extract_info1(cast, col_name):
    """Return the ``col_name`` field of every cast record as one comma-separated string.

    Parameters
    ----------
    cast : str or list
        A list of dicts, or the text of a Python literal for such a list,
        e.g. ``"[{'character': 'Woody (voice)', 'name': 'Tom Hanks'}]"``.
    col_name : str
        Key to read from each record (the notebook uses ``"name"`` and
        ``"character"``).

    Returns
    -------
    str
        The selected values joined with ``", "``; ``""`` when ``cast`` is
        missing (``None`` / NaN) or contains no records.

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a valid Python literal.
    KeyError
        If a record has no ``col_name`` key.
    """
    # A missing cell (None / NaN) carries no records; NaN is the only float != itself.
    if cast is None or (isinstance(cast, float) and cast != cast):
        return ""

    # literal_eval accepts literals only, so text read from a CSV cannot run
    # arbitrary code the way eval() would.
    records = ast.literal_eval(str(cast))
    return ", ".join(record[col_name] for record in records)

In [None]:
def write_files(data, file_name):
  """Write ``data`` as a CSV file named ``file_name`` in the "Cleaned Data" Drive folder.

  ``data`` must provide a ``to_csv(path)`` method (e.g. a pandas DataFrame).
  Returns ``None``.
  """
  destination = "/content/drive/MyDrive/USML_Data/Cleaned Data/" + file_name
  data.to_csv(destination)


### DATA PRE-PROCESSING:

In [None]:
# Data Preprocessing:

def data_preprocessing(movie_metadata, credits):
  """Build the merged, text-ready movie table used by the recommender.

  Parameters
  ----------
  movie_metadata : pandas.DataFrame
      Must contain the columns ``id``, ``original_title``, ``overview`` and ``genres``.
  credits : pandas.DataFrame
      Must contain the columns ``id`` and ``cast``.

  Returns
  -------
  pandas.DataFrame
      One row per row of the left join of ``credits`` onto the metadata, with the
      columns ``id``, ``original_title``, ``overview``, ``list_of_actors``,
      ``list_of_characters``, ``list_of_genres`` and the combined text columns
      ``metadata``, ``overview_genre`` and ``overview_actors``.

  Notes
  -----
  Neither input frame is modified. Progress and timing are printed to stdout.
  """

  def _join_text(row):
    # Concatenate the non-missing cells of a row into one comma-separated string.
    return ', '.join(row.dropna().astype(str))

  start = time.time()
  print("Extracting Relevant Metadata...")
  # .copy() gives an independent frame, so the assignments below are guaranteed
  # to take effect (no chained assignment on a slice of the caller's data).
  lim_movie_metadata = movie_metadata[["id", "original_title", "overview", "genres"]].copy()
  print("Imputing Missing Values in Overview...")
  lim_movie_metadata["overview"] = lim_movie_metadata["overview"].fillna("")
  print("Extracting Genres...")
  lim_movie_metadata["list_of_genres"] = lim_movie_metadata["genres"].apply(extract_info2)
  print("Changing dtype of 'id' from string to int...")
  lim_movie_metadata["id"] = lim_movie_metadata["id"].apply(change_type_id)

  print("Extracting Actor and Character Names for All Movies...")
  actor_names = credits["cast"].apply(extract_info1, col_name = "name")
  character_names = credits["cast"].apply(extract_info1, col_name = "character")
  print("Creating Dataframe of Extracted Credits Data...")
  # assign() returns a new frame, leaving the caller's `credits` untouched.
  lim_credits = credits[["id"]].assign(list_of_actors = actor_names, list_of_characters = character_names)

  print("Size of Credits: {}\tSize of Metadata: {}\n".format(lim_credits.shape, lim_movie_metadata.shape))

  print("Merging All Extracted Data...")
  # NOTE(review): a left join returns more rows than `credits` whenever an `id`
  # repeats in the metadata; de-duplicating on `id` beforehand may be desirable.
  extracted_movie_data = lim_credits.merge(lim_movie_metadata, on = "id", how = "left")
  metadata = extracted_movie_data[["id", "original_title", "overview", "list_of_actors", "list_of_characters", "list_of_genres"]].copy()
  # Credits rows without a metadata match come out of the join with NaN overview;
  # TF-IDF needs strings, so impute again after merging.
  metadata["overview"] = metadata["overview"].fillna("")

  print("Creating More Attributes (Combination of two or more columns)...")
  # Columns are selected by name so the combinations do not depend on column order.
  metadata['metadata'] = metadata[["original_title", "overview", "list_of_actors", "list_of_characters", "list_of_genres"]].apply(_join_text, axis = 1) # all combined
  metadata['overview_genre'] = metadata[["overview", "list_of_genres"]].apply(_join_text, axis = 1) # overview + genre
  metadata['overview_actors'] = metadata[["overview", "list_of_actors"]].apply(_join_text, axis = 1) # overview + actors
  print("Size of Merged Data: {}".format(metadata.shape))
  end = time.time()
  hours, rem = divmod(end-start, 3600)
  minutes, seconds = divmod(rem, 60)
  print("\nTime Taken for Pre-processing:{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

  return metadata

### COMPUTING SIMILARITY MATRIX:

In [None]:
# Function to create Similarity Matrix using Cosine Similarity or Linear Kernel:

def compute_similarity_matrix(metadata, col, formula):
  """Compute the pairwise similarity matrix of the TF-IDF vectors of ``metadata[col]``.

  Parameters
  ----------
  metadata : pandas.DataFrame
      Table holding the text column to vectorize.
  col : str
      Name of the text column.
  formula : str
      ``"linear kernel"`` or ``"cosine"``.

  Returns
  -------
  The square similarity matrix returned by ``linear_kernel`` /
  ``cosine_similarity`` (one row and one column per row of ``metadata``).

  Raises
  ------
  ValueError
      If ``formula`` is not one of the supported names.

  Notes
  -----
  Refits the module-level ``tfidf`` vectorizer on every call and prints the
  TF-IDF shape and the elapsed time.
  NOTE(review): with TfidfVectorizer's default L2 normalisation both formulas
  are expected to yield the same scores - confirm against the scikit-learn docs.
  """
  # Validate first: failing here is cheaper and clearer than an unbound
  # `sim_mat` after the TF-IDF fit has already been paid for.
  if formula not in ("linear kernel", "cosine"):
    raise ValueError("Invalid Formula! Expected 'linear kernel' or 'cosine', got {!r}".format(formula))

  start = time.time()
  tfidf_mat = tfidf.fit_transform(metadata[col])
  print("\nShape of TFIDF Matrix: {}".format(tfidf_mat.shape))
  if formula == "linear kernel":
    sim_mat = linear_kernel(tfidf_mat, tfidf_mat)
  else:
    sim_mat = cosine_similarity(tfidf_mat, tfidf_mat)

  end = time.time()
  hours, rem = divmod(end-start, 3600)
  minutes, seconds = divmod(rem, 60)
  print("\nTime Taken to Compute Similarity Matrix:{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
  return sim_mat


### RECOMMENDING MOVIES:

In [None]:
# Function to compute Content - Based Recommendation System:
def content_based_recommender(input_string, sim_mat, topk, metadata, movie_mapper):
  """Return the titles of the ``topk`` rows most similar to the given movie.

  Parameters
  ----------
  input_string : str
      Movie title to look up in ``movie_mapper``.
  sim_mat : array-like
      Square similarity matrix; ``sim_mat[i]`` holds the scores of row ``i``
      against every row.
  topk : int
      Number of recommendations to return.
  metadata : pandas.DataFrame
      Table with an ``original_title`` column, aligned by position with ``sim_mat``.
  movie_mapper : mapping
      Maps a title to its row position (see ``create_movie_mapper``).

  Returns
  -------
  pandas.Series
      ``original_title`` of up to ``topk`` rows in descending similarity order,
      never including the queried row itself.

  Raises
  ------
  KeyError
      If ``input_string`` is not present in ``movie_mapper``.
  """
  # A title shared by several rows makes the lookup return several positions;
  # use the first one so the result is always a single row position.
  movie_index = int(np.ravel(movie_mapper[input_string])[0])
  scores = np.asarray(sim_mat[movie_index])
  # Stable sort on the negated scores: descending order, ties keep row order.
  ranked = np.argsort(-scores, kind = "stable")
  # Drop the queried row explicitly (it is not guaranteed to sort first when
  # other rows tie with it), then keep exactly `topk` entries.
  ranked = ranked[ranked != movie_index][:max(topk, 0)]

  return (metadata["original_title"].iloc[ranked])

In [None]:
def create_movie_mapper(metadata, col_name):
  """Build a lookup Series from the values of ``metadata[col_name]`` to row labels.

  The returned Series is indexed by the ``col_name`` values and holds the
  corresponding entries of ``metadata.index``. Repeated values in ``col_name``
  yield repeated index entries.
  """
  row_labels = metadata.index
  lookup_keys = metadata[col_name]
  return pd.Series(data = row_labels, index = lookup_keys)

In [None]:
if __name__ == "__main__":
  # Clean and merge the raw frames loaded earlier in the notebook.
  data = data_preprocessing(movie_metadata, credits)

  # Pairwise similarity over the text column the recommendations are based on.
  on_col = "overview"
  sim_mat = compute_similarity_matrix(data, on_col, "cosine")

  # Title -> row lookup consumed by the recommender.
  movie_mapper = create_movie_mapper(data, "original_title")

  # Print recommendations for a few sample titles.
  topk = 15
  # Readable form of the column name, e.g. "overview_genre" -> "Overview and Genre".
  basis_label = on_col.replace("_", " and ").title().replace("And", "and")
  header = "\nTop {} Recommendations for {} based on {}:\n"
  for mname in ("Star Wars", "Toy Story", "Jumanji"):
    print(header.format(topk, mname, basis_label))
    recommendations = content_based_recommender(mname, sim_mat, topk, data, movie_mapper)
    recs = pd.DataFrame(recommendations)
    print(recs)

Extracting Relevant Metadata...
Imputing Missing Values in Overview...
Extracting Genres...
Changing dtype of 'id' from string to int...
Extracting Actor and Character Names for All Movies...
Creating Dataframe of Extracted Credits Data...
Size of Credits: (45476, 3)	Size of Metadata: (45466, 5)

Merging All Extracted Data...
Creating More Attributes (Combination of two or more columns)...
Size of Merged Data: (45538, 9)

Time Taken for Pre-processing:00:00:52.93

Shape of TFIDF Matrix: (45538, 75827)

Time Taken to Compute Similarity Matrix:00:00:27.53

Top 15 Recommendations for Star Wars based on Overview:

                                          original_title
1157                             The Empire Strikes Back
30498                      The Star Wars Holiday Special
26616                       Star Wars: The Force Awakens
1170                                  Return of the Jedi
34220                   Maciste alla corte del Gran Khan
1270                                    