In [None]:
from collections import defaultdict, Counter
from functools import partial
import json
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import pandas as pd
from pandas_profiling import ProfileReport
from pywaffle import Waffle
import squarify
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from wordcloud import WordCloud

# Lift the column display limit so wide DataFrames are shown without column truncation.
pd.set_option("display.max_columns", None)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Directory holding the input files; list its contents to confirm what is available.
PATH = "../data"
data_files = os.listdir(PATH)
print(data_files)

In [None]:
# Read the two raw CSV tables (credits and movies) from the data directory.
credits_csv = os.path.join(PATH, 'tmdb_5000_credits.csv')
movies_csv = os.path.join(PATH, 'tmdb_5000_movies.csv')
credit = pd.read_csv(credits_csv)
movie = pd.read_csv(movies_csv)

In [None]:
# Preview the first row of the credits table.
credit.head(1)

In [None]:
# Preview the first row of the movies table.
movie.head(1)

In [None]:
# Merge the credits into the movies table on the shared 'id' key.
# The credits columns are renamed positionally so the first one becomes 'id'.
# NOTE(review): 'tittle' looks like a deliberate misspelling that keeps this column
# from clashing with the movies table's own 'title' column during the merge - confirm.
credit.columns = ['id', 'tittle', 'cast', 'crew']
movie_df = pd.merge(movie, credit, on='id')

# The source frames are no longer needed once merged; release them.
del movie, credit

In [None]:
# Preview the first row of the merged table.
movie_df.head(1)

# **Data preparation** <a class="anchor" id="Data_preparation"></a>

In [None]:
# Parse the release date once, keep only the year and the month name as
# separate columns, and discard the original date column.
release_dates = pd.to_datetime(movie_df["release_date"])
movie_df["release_year"] = release_dates.dt.year
movie_df["release_month"] = release_dates.dt.month_name()
movie_df = movie_df.drop(columns=["release_date"])

In [None]:
# Decode the JSON-encoded string columns into Python objects.
# Every column except "crew" is then reduced to the list of its entries' "name"
# values; "crew" keeps the full decoded records.
json_columns = {'cast', 'crew', 'genres', 'keywords', 'production_countries',
                'production_companies', 'spoken_languages'}


def _names_only(entries):
    """Return the "name" value of every entry in a decoded JSON list."""
    return [entry["name"] for entry in entries]


for column in json_columns:
    decoded = movie_df[column].apply(json.loads)
    movie_df[column] = decoded if column == "crew" else decoded.apply(_names_only)

In [None]:
#add director, writer and producer 
def get_job(job, row):
    """Return the name of the first crew member in `row` whose 'job' equals `job`.

    Parameters
    ----------
    job : str
        Job title to look for (compared with ``==`` against each entry's 'job').
    row : iterable of dict
        Crew entries; each one is read for its 'job' key, and matching ones
        for their 'name' key.

    Returns
    -------
    The 'name' of the first matching entry, or ``np.nan`` when nothing matches.
    """
    matching_names = []
    for member in row:
        if member['job'] == job:
            matching_names.append(member['name'])
    if not matching_names:
        return np.nan
    return matching_names[0]

# For each movie, pull the first-listed Director, Writer and Producer out of the
# crew records into their own columns, then drop the raw crew column.
for role_column, job_title in (("director", "Director"),
                               ("writer", "Writer"),
                               ("producer", "Producer")):
    movie_df[role_column] = movie_df["crew"].apply(
        lambda crew, job_title=job_title: get_job(job_title, crew)
    )
movie_df = movie_df.drop(columns=["crew"])

In [None]:
# Profit is revenue minus budget, computed row by row.
movie_df["profit"] = movie_df["revenue"].sub(movie_df["budget"])

In [None]:
# Replace missing values in these columns with the column's most frequent value.
for col in ("runtime", "release_year", "release_month"):
    most_frequent = movie_df[col].mode().iloc[0]
    movie_df[col] = movie_df[col].fillna(most_frequent)

In [None]:
# Preview the first two rows of the prepared table.
movie_df.head(2)

# **Recommender System** <a class="anchor" id="Recommender_systems"></a>

In [None]:
# Parameters of the weighted rating:
#   m - 90th percentile of vote_count across all movies
#   C - mean vote_average across all movies
m = movie_df['vote_count'].quantile(0.9)
C = movie_df['vote_average'].mean()
(C, m)

In [None]:
# Keep an independent copy of only the movies whose vote count reaches the cutoff m.
has_enough_votes = movie_df['vote_count'] >= m
q_movies = movie_df.loc[has_enough_votes].copy()
q_movies.shape

In [None]:
def weighted_rating(x, m=m, C=C):
    """Return the weighted rating of one movie record.

    The result blends the record's own average vote with the value ``C``:
    ``v/(v+m) * R + m/(m+v) * C``, where ``v`` is the record's 'vote_count'
    and ``R`` its 'vote_average'.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide 'vote_count' and 'vote_average'.
    m : number
        Vote-count weight; defaults to the module-level ``m`` at definition time.
    C : number
        Rating the score is pulled towards; defaults to the module-level ``C``
        at definition time.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total_weight = votes + m
    own_part = votes / total_weight * rating
    prior_part = m / total_weight * C
    return own_part + prior_part

In [None]:
# Score every qualifying movie, order them from highest to lowest score,
# and display the five best.
q_movies = (
    q_movies
    .assign(score=q_movies.apply(weighted_rating, axis=1))
    .sort_values(by='score', ascending=False)
)
q_movies.loc[:, ['title', 'vote_count', 'vote_average', 'score']].head()

In [None]:
# Persist the id/title/score table of the ranked movies.
# The location is derived from PATH so the output follows the same data
# directory the inputs were read from, instead of a second hard-coded copy.
scores_path = os.path.join(PATH, 'movie_scores.pickle')
with open(scores_path, 'wb') as handle:
    pickle.dump(q_movies[['id', 'title', 'score']], handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Content-based recommender: preview the first three overview texts it will be built on.
movie_df['overview'].head(3)

In [None]:
# Missing overviews become empty strings so the vectorizer only ever sees text.
movie_df['overview'] = movie_df['overview'].fillna('')

# Fit a TF-IDF model (English stop words removed) on the overviews and
# transform them into the document-term matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_df['overview'])
tfidf_matrix.shape

In [None]:
# Pairwise dot products between all overview vectors.
# NOTE(review): this equals cosine similarity only while the TF-IDF rows are
# L2-normalised (believed to be TfidfVectorizer's default) - confirm.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Lookup from movie title to its row label in movie_df.
# Series.drop_duplicates() would compare the *values* (row labels, already
# unique) and therefore never remove a repeated title; a repeated title would
# then make indices[title] return a Series instead of a single label.
# De-duplicate on the title index instead, keeping the first occurrence.
title_to_row = pd.Series(movie_df.index, index=movie_df['title'])
indices = title_to_row[~title_to_row.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the movies most similar to `title`, best match first.

    Parameters
    ----------
    title : str
        Movie title, looked up in the module-level ``indices`` Series.
    cosine_sim : 2-D array
        Pairwise similarity matrix; row ``i`` holds the similarities of movie
        ``i`` to every movie. Defaults to the module-level matrix at
        definition time.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns "title" and "similarity", at most ``top_n`` rows.

    Raises
    ------
    KeyError
        If `title` is not a label of ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once in the lookup yields a Series;
    # fall back to its first occurrence so a single row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx]).ravel()

    # Stable descending order keeps ties in their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the query movie explicitly rather than assuming it sorts first:
    # a movie with an all-zero vector (e.g. an empty overview) or one tied
    # with another movie is not guaranteed to be at position 0.
    ranked = ranked[ranked != idx][:top_n]

    return pd.DataFrame({
        "title": movie_df['title'].iloc[ranked].to_numpy(),
        "similarity": scores[ranked],
    })

In [None]:
# Example query: recommendations for 'The Godfather: Part II'.
get_recommendations('The Godfather: Part II')

In [None]:
# Example query: recommendations for 'The Avengers'.
get_recommendations('The Avengers')

In [None]:
# Example query: recommendations for 'Pulp Fiction'.
get_recommendations('Pulp Fiction')

In [None]:
# Persist the artifacts of the content-based recommender: the sparse TF-IDF
# matrix and the id/title table.
# Both locations are derived from PATH so the outputs follow the same data
# directory the inputs were read from, instead of repeating the literal path.
tfidf_path = os.path.join(PATH, 'tfidf_matrix.npz')
scipy.sparse.save_npz(tfidf_path, tfidf_matrix)

movie_df_path = os.path.join(PATH, 'movie_df.pickle')
with open(movie_df_path, 'wb') as handle:
    pickle.dump(movie_df[["id", "title"]], handle, protocol=pickle.HIGHEST_PROTOCOL)