In [3]:
# import library
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import json
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

In [4]:
# Load the TMDB movie metadata and the matching credits table (movies first, then credits)
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv',
        '/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv',
    )
)

In [5]:
# Rename only the join key so it matches the `id` column used for the merge.
# Renaming by label (instead of overwriting every column name by position) leaves the
# other columns untouched, stays correct if the CSV column order changes, and is a
# harmless no-op when the cell is run a second time.
credits = credits.rename(columns={'movie_id': 'id'})

In [6]:
# Join the movie metadata with its credits on the shared `id` key
df = pd.merge(movies, credits, on='id')

# Preview the first five merged rows
df.head()

In [83]:
# Print pandas' structural summary (columns and dtypes) of the merged frame
df.info()

In [9]:
# Changing the column type from json to string
def json_column(col, frame=None):
    """Replace a JSON-encoded column with the string form of its list of ``name`` values.

    Every cell of ``col`` must hold a JSON array of objects that carry a ``name`` key,
    e.g. ``'[{"id": 28, "name": "Action"}]'``. The cell is rewritten in place as the
    ``str()`` of the extracted names, e.g. ``"['Action']"``.

    Parameters
    ----------
    col : str
        Name of the column to convert.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    json.JSONDecodeError
        If a cell is not valid JSON.
    KeyError
        If a decoded entry has no ``name`` key.
    """
    target = df if frame is None else frame

    def _names_as_text(raw):
        # Decode one cell and keep only the `name` of every entry.
        return str([entry['name'] for entry in json.loads(raw)])

    # A single whole-column assignment replaces the former per-row
    # `df.loc[index, col] = ...` writes, which were slow and wrote to every row
    # sharing an index label whenever the index was not unique.
    target[col] = target[col].apply(_names_as_text)


In [10]:
# Convert every JSON-encoded feature column to the string form of its list of names.
# NOTE(review): json_column runs json.loads on the column's current values, so this
# cell looks safe to run only once per freshly loaded df — confirm before re-running.
features = ['genres', 'keywords', 'production_companies', 'spoken_languages', 'cast']
for feature in features:
    json_column(feature)

In [13]:
# Preview the first five rows after the feature columns were converted
df.head(5)

In [12]:
# Decode the JSON-encoded crew column into Python lists of dicts (used below to pick the director).
# Only string cells are decoded, so re-running this cell no longer fails on values
# that were already turned into lists by a previous run.
df['crew'] = df['crew'].apply(lambda raw: json.loads(raw) if isinstance(raw, str) else raw)
def director(x):
    """Return the ``name`` of the first crew entry whose ``job`` is ``'Director'``.

    ``x`` is an iterable of crew dicts with ``job`` and ``name`` keys.
    ``None`` is returned when no entry matches.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, None)
# Store, per movie, whatever director() returns for its crew list
# (the first 'Director' entry's name, or None when there is none)
df['director']=df['crew'].apply(director)


In [14]:
# Top 10 movies by average vote
most_liked = df[['original_title', 'vote_average']].sort_values(by='vote_average', ascending=False)
top_10 = most_liked.head(10)

# One colour per plotted bar. Sizing the list by len(top_10) keeps every RGB channel
# inside [0, 1]; sizing it by len(df) built one tuple per row of df, and every tuple
# with x > 10 has a red channel above 1, which is not a valid colour.
my_colors = [(x/10.0, x/20.0, 0.75) for x in range(len(top_10))]

ax = top_10.plot.bar(x='original_title', y='vote_average', color=my_colors)
ax.set_ylabel('vote_average')
ax.set_title('Top 10 movies by average vote')
plt.show()

In [15]:
# Top 10 movies by number of votes received
most_vost = df[['original_title', 'vote_count']].sort_values(by='vote_count', ascending=False)
top_10 = most_vost.head(10)

# One colour per plotted bar. Sizing the list by len(top_10) keeps every RGB channel
# inside [0, 1]; sizing it by len(df) built one tuple per row of df, and every tuple
# with x > 10 has a red channel above 1, which is not a valid colour.
my_colors = [(x/10.0, x/20.0, 0.75) for x in range(len(top_10))]

ax = top_10.plot.bar(x='original_title', y='vote_count', color=my_colors)
ax.set_ylabel('vote_count')
ax.set_title('Top 10 movies by vote count')
plt.show()


In [66]:
# Copy column genres
genres = df['genres']

# Each cell is the string form of a list, e.g. "['Action', 'Adventure']":
# drop the quotes and the surrounding brackets, then split on commas.
genres_all = genres.astype(str).str.replace("'", "").str.strip("[]").str.split(',')

# Flatten to one clean genre name per entry. Stripping removes the leading space that
# split(',') leaves on every genre after the first; empty names (movies with no
# genres) are skipped.
list_genres = [name.strip() for names in genres_all for name in names if name.strip()]

# Space-separated text of all genres. The former ''.join glued the last genre of one
# movie to the first genre of the next (e.g. "FictionAdventure").
genre_wordcloud = ' '.join(list_genres)

# Size each genre by the number of movies carrying it; passing frequencies keeps
# multi-word genres such as "Science Fiction" together as one entry.
genre_counts = pd.Series(list_genres).value_counts()

plt.figure(figsize=(12, 8))
wordcloud = WordCloud().generate_from_frequencies(genre_counts.to_dict())
plt.imshow(wordcloud)
plt.title('Top genres')
plt.axis("off")
plt.show()

In [82]:
# Mean of vote
mean = df['vote_average'].mean()

# Movies whose average vote is at or above the overall mean
movie_above_mean = df[df['vote_average'] >= mean]

# Number of qualifying movies. It is printed explicitly because a notebook cell only
# displays its final expression, so the bare len(...) in the middle was never shown.
print(len(movie_above_mean.index))

movie_above_mean[['original_title', 'vote_average']].sort_values(by='vote_average', ascending=False)

In [117]:
# Release year as a 4-digit string; dates that do not parse are coerced to missing
df['year'] = (
    pd.to_datetime(df['release_date'], format='%Y-%m-%d', errors='coerce')
    .dt.strftime("%Y")
)

# Number of movies released per year, drawn as a line
df.groupby('year').size().plot()
plt.gca().set(title="Movie in year", xlabel="year", ylabel="no. of movie")

In [97]:
# Compare revenue with budget (revenue on the x axis, budget on the y axis)
fig, ax = plt.subplots()
ax.scatter(df['revenue'], df['budget'])
# Title typo fixed: "revenus" -> "revenue"
ax.set(xlabel="revenue", ylabel="budget", title="revenue vs budget")
plt.show()


# Content-based filtering

**Text-based similarities with overview data** 

In [118]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [119]:

# Instantiate the vectorizer object to the vectorizer variable
vectorizer = TfidfVectorizer()

# Fill na with '' so every overview is a string before vectorizing
df['overview'] = df['overview'].fillna('')

# Fit and transform the overview column (sparse movie x term matrix)
vectorized_data = vectorizer.fit_transform(df['overview'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it is
# available and fall back to the old name on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Create DataFrame (dense copy of the TF-IDF matrix, one column per term)
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=feature_names)

# Assign the movie titles to the index
tfidf_df.index = df['original_title']
tfidf_df.head()


In [120]:
# Comparing all movies with TF-IDF by using the cosine similarity measure
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF rows. The sparse matrix returned by
# fit_transform is passed directly: it holds the same values as the densified
# tfidf_df, in the same row order, and avoids a very large dense multiplication.
cosine_similarity_array = cosine_similarity(vectorized_data)

# Wrap the array in a pandas DataFrame labelled by movie title on both axes
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, columns=tfidf_df.index, index=tfidf_df.index)
cosine_similarity_df.head()

In [121]:
# Content-based recommendation by TF-IDF
def get_recommendation(title, number_of_movies, similarity_df=None):
    """Return the entries most similar to ``title``, best match first.

    Parameters
    ----------
    title : label
        Row label to look up in the similarity matrix.
    number_of_movies : int
        Maximum number of recommendations to return.
    similarity_df : pandas.DataFrame, optional
        Square similarity matrix labelled identically on rows and columns.
        Defaults to the notebook-level ``cosine_similarity_df`` as it exists
        when the function is called.

    Returns
    -------
    pandas.Series
        Similarity scores sorted in descending order, without any entry
        labelled ``title``.

    Raises
    ------
    KeyError
        If ``title`` is not a row label of the matrix.
    """
    if similarity_df is None:
        similarity_df = cosine_similarity_df
    scores = similarity_df.loc[title]
    if isinstance(scores, pd.DataFrame):
        # A label shared by several rows selects all of them; use the first row.
        scores = scores.iloc[0]
    # Remove the query by label rather than assuming it sorts first: its
    # self-similarity is 0 for an all-zero vector, and ties at the top can push it
    # out of first place, so skipping position 0 could drop a real match.
    scores = scores.drop(labels=title, errors='ignore')
    return scores.sort_values(ascending=False).head(number_of_movies)


In [122]:
# Ask get_recommendation for 10 entries related to 'The Lion King'
get_recommendation('The Lion King', 10)

# Content based with Jaccard

In [154]:
# Independent two-column copy (title + genres) used by the genre-based recommender
df_genres = df.loc[:, ['original_title', 'genres']].copy()
df_genres

In [155]:
# Turn the stringified list (e.g. "['Action', 'Adventure']") back into a list of genre names.
# Every piece is stripped because splitting on ',' leaves a leading space on each genre
# after the first, which made ' Adventure' and 'Adventure' count as different genres.
df_genres['genres'] = (
    df_genres['genres']
    .astype(str)
    .str.replace("'", "")
    .str.strip("[]")
    .str.split(',')
    .apply(lambda parts: [part.strip() for part in parts])
)
len(df_genres)

In [156]:
# Convert each movie's list of genres into one (title, genre) row per genre.
# A comprehension is used so no loop variable leaks into the notebook namespace: the
# former loop variable `genres` overwrote the notebook-level `genres` Series.
# Names are stripped so a genre is the same value wherever it sat in the list.
list_data = [
    [title, genre_name.strip()]
    for title, genre_names in zip(df_genres['original_title'], df_genres['genres'])
    for genre_name in genre_names
]
df_new_genres = pd.DataFrame(list_data, columns=['title', 'genres'])
df_new_genres['genres'] = df_new_genres['genres'].str.replace('"','')
df_new_genres

In [157]:
# Prepare usable data: a title x genre table counting how often each pair occurs
movie_cross_table = pd.crosstab(
    index=df_new_genres['title'],
    columns=df_new_genres['genres'],
)
movie_cross_table

In [158]:
# Comparing all your movies at once
import numpy as np
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

# Calculate all pairwise distances on boolean genre-membership flags.
# The crosstab holds counts, which can exceed 1 when a title occurs more than once;
# Jaccard is defined on boolean vectors, so counts are reduced to "has this genre"
# to keep two different non-zero counts from being treated as a mismatch.
genre_flags = movie_cross_table.values.astype(bool)
jaccard_distances = pdist(genre_flags, metric='jaccard')

# Convert the condensed distances to a square matrix and flip them into similarities
jaccard_similarity_array = 1 - squareform(jaccard_distances)

# Wrap the array in a pandas DataFrame labelled by title on both axes
jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array, index=movie_cross_table.index, columns=movie_cross_table.index)

# Show the top 5 rows of the DataFrame
jaccard_similarity_df.head()

In [159]:
# Find the Jaccard similarity values for the movie 'Avatar'
jaccard_similarity_series = jaccard_similarity_df.loc['Avatar']

# Sort these values from highest to lowest
ordered_similarities = jaccard_similarity_series.sort_values(ascending=False)

# Print the results
print(ordered_similarities)

In [182]:
# Content-based recommendation by genres and jaccard
def get_recommendation_2(title, number_of_movies, similarity_df=None):
    """Return the entries whose genre sets are most similar to ``title``'s.

    Parameters
    ----------
    title : label
        Row label to look up in the similarity matrix.
    number_of_movies : int
        Maximum number of recommendations to return.
    similarity_df : pandas.DataFrame, optional
        Square similarity matrix labelled identically on rows and columns.
        Defaults to the notebook-level ``jaccard_similarity_df``.

    Returns
    -------
    pandas.Series
        Similarity scores sorted in descending order, without any entry
        labelled ``title``.

    Raises
    ------
    KeyError
        If ``title`` is not a row label of the matrix.
    """
    if similarity_df is None:
        similarity_df = jaccard_similarity_df
    scores = similarity_df.loc[title]
    if isinstance(scores, pd.DataFrame):
        # A label shared by several rows selects all of them; use the first row.
        scores = scores.iloc[0]
    # Remove the query by label rather than by position: movies with identical genre
    # sets tie with the query at 1.0, so the query is not guaranteed to sort first
    # and skipping position 0 could drop a real match while keeping the query.
    scores = scores.drop(labels=title, errors='ignore')
    return scores.sort_values(ascending=False).head(number_of_movies)

In [183]:
# Ask get_recommendation_2 for 10 entries related to 'Avatar'
get_recommendation_2('Avatar',10)

# Collaborative filtering

In [163]:
# Load the user ratings table (ratings_small.csv from the Kaggle "the-movies-dataset" input)
rating = pd.read_csv('/kaggle/input/the-movies-dataset/ratings_small.csv')
rating

In [164]:
# Reshape the long ratings table into a wide one: one row per userId,
# one column per movieId, cell value = rating
user_ratings_df = rating.pivot(index='userId', columns='movieId',values='rating')
user_ratings_df.head()

In [165]:
# Challenges with missing values

# Fill in missing values with zero (the result overwrites user_ratings_df)
# NOTE(review): because the zero-filled frame replaces user_ratings_df, any row mean
# taken from user_ratings_df after this cell includes a 0 for every movie the user
# never rated. If the later centering step is meant to use the mean of actual ratings
# only, keep this zero-filled copy under a separate name — confirm intent.
user_ratings_df = user_ratings_df.fillna(0)
user_ratings_df

In [166]:
# Compensating for incomplete data

# Row-wise mean: one average value per user across all movie columns
avg_user_ratings = user_ratings_df.mean(axis='columns')
avg_user_ratings

In [167]:
# Subtract each user's average from that user's row so the ratings are centered around 0
user_ratings_centered = user_ratings_df.sub(avg_user_ratings, axis='index')
user_ratings_centered

In [168]:
# Fill in the missing data with 0s
# NOTE(review): user_ratings_df was already zero-filled before the centering step, so
# there may be nothing left to fill at this point — confirm.
user_ratings_normed = user_ratings_centered.fillna(0)
user_ratings_normed

# Item Based Recommendation

In [191]:
# Flip the table so that each row is a movie and each column a user
movie_ratings_normed = user_ratings_normed.transpose()
movie_ratings_normed

In [192]:
# Finding similarly liked movies: compare the rows (movies) of movie_ratings_normed

from sklearn.metrics.pairwise import cosine_similarity

# Generate the pairwise cosine similarity matrix between all rows of movie_ratings_normed
similarities = cosine_similarity(movie_ratings_normed)


In [193]:
# Wrap the similarities in a DataFrame labelled by the movie_ratings_normed index on both axes
# NOTE(review): this reuses the name cosine_similarity_df, which the TF-IDF section also
# assigns and which get_recommendation reads as a global. After this cell runs,
# get_recommendation looks titles up in this rating-based matrix instead — consider a
# distinct name (renaming also requires updating the cells below that read it).
cosine_similarity_df = pd.DataFrame(similarities, index=movie_ratings_normed.index, columns=movie_ratings_normed.index)
cosine_similarity_df

In [199]:
# Similarity of every entry to the row labelled 1, highest first
cosine_similarity_df.loc[1].sort_values(ascending=False)

In [197]:
# Item-based collaborative filtering recommendation
def get_recommendation_3(title, number_of_movies, similarity_df=None):
    """Return the entries most similar to ``title`` in the rating-based similarity matrix.

    Parameters
    ----------
    title : label
        Row label to look up (the call in this notebook passes an integer id).
    number_of_movies : int
        Maximum number of recommendations to return.
    similarity_df : pandas.DataFrame, optional
        Square similarity matrix labelled identically on rows and columns.
        Defaults to the notebook-level ``cosine_similarity_df`` as it exists
        when the function is called.

    Returns
    -------
    pandas.Series
        Similarity scores sorted in descending order, without any entry
        labelled ``title``.

    Raises
    ------
    KeyError
        If ``title`` is not a row label of the matrix.
    """
    if similarity_df is None:
        similarity_df = cosine_similarity_df
    scores = similarity_df.loc[title]
    if isinstance(scores, pd.DataFrame):
        # A label shared by several rows selects all of them; use the first row.
        scores = scores.iloc[0]
    # Remove the query by label rather than assuming it sorts first: its
    # self-similarity is 0 for an all-zero vector, and ties at the top can push it
    # out of first place, so skipping position 0 could drop a real match.
    scores = scores.drop(labels=title, errors='ignore')
    return scores.sort_values(ascending=False).head(number_of_movies)

In [198]:
# Ask get_recommendation_3 for 10 entries related to the row labelled 1
get_recommendation_3(1,10)

# User Based Recommendation

In [200]:
# Display the centered, zero-filled user x movie table used below
user_ratings_normed

In [201]:
# Take the second row of the centered ratings table (one user's values across the
# movieId columns), sort it from highest to lowest and keep the labels at sorted
# positions 1..10.
# NOTE(review): the labels collected here are column labels (movieIds) of that user's
# highest values, not ids of similar users — no user-to-user similarity is computed in
# the visible cells. Confirm whether a similarity between the rows of
# user_ratings_normed was intended.
user_similarity_target = user_ratings_normed.iloc[1]
user_similarity_target_ordered = user_similarity_target.sort_values(ascending=False)
top_10_similarity = user_similarity_target_ordered[1:11].index

top_10_similarity


In [202]:
# Reindex the rows of the centered ratings table by the labels in top_10_similarity
# NOTE(review): the row index of user_ratings_normed holds userIds, so the labels
# passed here are matched against userIds — verify that is what top_10_similarity contains.
neighbor_ratings = user_ratings_normed.reindex(top_10_similarity)
neighbor_ratings

In [203]:
# User-based of Collaborative filtering recommendation
def get_recommendation_4(title,number_of_movies):
    """Return values from the ``neighbor_ratings`` row labelled ``title``, highest first.

    The row is sorted in descending order, its first sorted entry is skipped, and
    the next ``number_of_movies`` entries are returned as a pandas Series.

    NOTE(review): the first sorted entry is skipped unconditionally; nothing visible
    guarantees that entry corresponds to the queried label — confirm intent.
    """
    ranked = neighbor_ratings.loc[title].sort_values(ascending=False)
    return ranked.iloc[1:1 + number_of_movies]

In [205]:
# Ask get_recommendation_4 for 10 entries from the neighbor_ratings row labelled 589
get_recommendation_4(589,10)