In [None]:
# Import the necessary modules
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import RegexpTokenizer
from sklearn import preprocessing
from scipy.sparse import hstack
import pandas_profiling

In [None]:
# Load the dataset into a pandas DataFrame, skipping the unknown indexing
# column that sits in first position
df = pd.read_csv(
    r"/kaggle/input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv"
).iloc[:, 1:]

# 1. Dataset Exploration

**Data Attributes**

1. ID: Unique identifier for each record
2. Title: Name of the movie
3. Year: Release year of the movie
4. Age: Target age group
5. IMDb: IMDB movie rating (/10)
6. Rotten Tomatoes: Rotten Tomatoes % rating
7. Netflix: Movie is found on netflix (1/0)
8. Hulu: Movie is found on Hulu (1/0)
9. Prime Video: Movie is found on Prime Video (1/0)
10. Disney+: Movie is found on Disney+ (1/0)
11. Type: Movie or TV show
12. Directors: Name of director
13. Genres: Genre category
14. Country: Country of origin
15. Language: Original version language
16. Runtime: Duration of the movie

In [None]:
# Preview the first rows of the loaded dataset
df.head()

In [None]:
# Column dtypes and non-null counts of the raw dataset
df.info()

In [None]:
# Distinct values present in the Type column
df["Type"].unique()

# 2. Handling Missing Values



In [None]:
# Count the missing values of every column and express each count as a
# percentage of the total number of rows
missing_values = df.isnull().sum().to_frame(name="missing_count")
missing_values['missing_values_%'] = missing_values['missing_count'] / len(df.ID) * 100
missing_values

**Since Age and Rotten tomatoes have > 50% missing values we could:**

* Drop columns which have > 50% missing values
* Drop NA from IMDb, Directors, Genres, Country, Language, and Runtime column
* Reset index

We can also transform the year column into object type leaving it ready for plotting.

In [None]:
# Remove the two columns with more than 50% missing values, then discard every
# row that still lacks a value in one of the listed columns and renumber the rows
df = (
    df.drop(columns=['Rotten Tomatoes', 'Age'])
      .dropna(subset=['IMDb', 'Directors', 'Genres', 'Country', 'Language', 'Runtime'])
      .reset_index(drop=True)
)

# Store the release year as object dtype
df['Year'] = df['Year'].astype('object')

In [None]:
# Re-check dtypes and non-null counts after the columns/rows were dropped
df.info()

# 3. Exploratory Data Analysis


**Statistical exploration will include:**

1. Distribution plots
2. Distribution of movies on each streaming platform
3. Movie distributions according to:
    * Genre
    * Country
    * Language
4. IMDb distribution on each platform
5. Runtime analysis per platform and age group

# Year distribution:

In [None]:
# Distribution of release years, drawn on an explicitly created wide figure
fig, ax = plt.subplots(figsize=(20, 5))
sns.distplot(df['Year'], ax=ax)
plt.show()

> This plot shows how most of the films catalogued in the streaming platforms Hulu, Netflix, Prime Video and Disney+ were created/released between the years 2000 and 2020.

> **What can this tell us about the streaming platforms' target audience?**

# IMDb ratings distribution:

In [None]:
# Distribution of IMDb ratings, drawn on an explicitly created wide figure
fig, ax = plt.subplots(figsize=(20, 5))
sns.distplot(df['IMDb'], ax=ax)
plt.show()

> The **IMDb scores distribution** is slightly negatively skewed. We can observe that the **mean** IMDb rating throughout the dataset lies around **6.5**.

# Runtime distribution:

In [None]:
# Distribution of movie runtimes on a default-sized figure
fig, ax = plt.subplots()
sns.distplot(df['Runtime'], ax=ax)
plt.show()

> Regarding the above **Runtime distribution** we can observe how the dataset's mean film runtime lies around **100 minutes**.

To understand how these movies are distributed across the different services, we can start by creating a function that counts the movies for a given streaming service. We can then plot the data to see how the films are distributed across the platforms.

# Movies on each platform:

In [None]:
def movie_count(platform: str, count=False, data=None):
    """Count the movies available on a given streaming platform.

    Parameters
    ----------
    platform : str
        Name of the platform indicator column, e.g. ``'Netflix'``.
    count : bool, default False
        When true, the total is returned. Otherwise it is printed and the
        function returns ``None``.
    data : pandas.DataFrame, optional
        Frame holding the platform column. When omitted, the notebook-level
        ``df`` is used, so existing calls keep working. Passing the frame
        explicitly avoids depending on whichever version of ``df`` happens to
        be loaded when the cell runs.

    Returns
    -------
    The sum of the ``platform`` column when ``count`` is true, else ``None``.
    """
    frame = df if data is None else data
    total = frame[platform].sum()

    if count:
        return total
    print('Platform {} Count: {}'.format(platform, total))

In [None]:
# Print the number of movies/shows found on each streaming platform
for platform_name in ('Netflix', 'Hulu', 'Prime Video', 'Disney+'):
    movie_count(platform_name)

In [None]:
# Set up movies on each platform for visualisation
labels = ["Prime Video", "Netflix", "Hulu", "Disney+"]
data = [movie_count(label, count=True) for label in labels]

# Offset every wedge from the centre by the same fraction of the radius
explode = (0.1, 0.1, 0.1, 0.1)

# Plotting the data into a pie chart
fig1, ax1 = plt.subplots()
ax1.pie(data,
        labels=labels,
        autopct='%1.1f%%',
        explode=explode,
        shadow=True,
        startangle=100)

# Equal aspect ratio so the pie is drawn as a circle. This must be a call:
# assigning to `ax1.axis` only overwrites the method with a string.
ax1.axis('equal')
plt.show()

> The dataset seems to contain mainly films hosted on Prime Video (71.3%) followed by Netflix (19.9%) and a similar relative percentage of movies can be found on Hulu (5.4%) and Disney+ (3.4%).

# Movie/Genre distribution:

In [None]:
# Split the comma-separated genre string and emit one row per (movie, genre).
# `explode` keeps the original row label on every produced row, which is what
# the join below relies on, and avoids building a wide intermediate frame
# with `apply(pd.Series)`.
genres = df['Genres'].str.split(",").explode()
genres.name = "Genres"

# Remove the original genres column and join the exploded one onto the
# existing DataFrame (movies with several genres are repeated)
del df['Genres']
df_genres = df.join(genres)

In [None]:
# Number of movies per genre; labels are rotated so they stay readable
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="Genres", data=df_genres, ax=ax)
plt.xticks(rotation=90)
plt.show()

> It seems like most of the movies hosted by the streaming services are labeled as **dramas** and **comedies**. Nevertheless, it might also be that these are the genres that most films have in common (Comedy/Romances or Mystery/Dramas for example).

# Movie/Country distribution:

In [None]:
# Split the comma-separated country string and emit one row per
# (movie, country). `explode` keeps the original row label on every produced
# row, which is what the join below relies on.
countries = df['Country'].str.split(",").explode()
countries.name = 'Country'

# Remove the original country column and join the exploded one onto the
# existing DataFrame (co-productions are repeated once per country)
del df['Country']
df_country = df.join(countries)

In [None]:
# Bar chart of the 10 most frequent countries and their movie counts
top_countries = df_country['Country'].value_counts().head(10)
top_countries.plot(kind='bar', figsize=(15, 5))
plt.show()

> We can observe how the vast majority of films contained in the extracted catalogues were made in the United States.

# Movie/Language distribution:

In [None]:
# Split the comma-separated language string and emit one row per
# (movie, language). `explode` keeps the original row label on every produced
# row, which is what the join below relies on.
languages = df['Language'].str.split(',').explode()
languages.name = "Language"

# Remove the original language column and join the exploded one onto the
# existing DataFrame (multilingual movies are repeated once per language)
del df['Language']
df_language = df.join(languages)

In [None]:
# Bar chart of the 10 most frequent languages and their movie counts
top_languages = df_language['Language'].value_counts().head(10)
top_languages.plot(kind='bar', figsize=(15, 3))
plt.show()

> As expected, most of the content that the 4 streaming services include in their catalogue is in English.

# IMDb-Rating/Platform distribution:

In [None]:
# Melt the platform indicator columns (wide -> long) to create the visualisation.
# The platform columns are named explicitly: without `value_vars` every
# remaining column (e.g. Directors) would be melted as well and only removed
# afterwards by the `value == 1` filter.
df2 = pd.melt(df,
              id_vars=["ID", "Title", "Year", "IMDb", "Type", "Runtime"],
              value_vars=["Netflix", "Hulu", "Prime Video", "Disney+"],
              var_name="platform")

# Keep one row per (movie, platform it is available on); the indicator itself
# is no longer needed afterwards
df2 = df2.loc[df2["value"] == 1].drop(columns="value")

In [None]:
# One IMDb-rating histogram per platform, laid out side by side
rating_platform = sns.FacetGrid(data=df2, col="platform").map(plt.hist, "IMDb")
plt.show()

> It is easier to distinguish these distributions on platforms like Prime Video, since the dataset contains 11289 movies hosted by this service. Overall, ratings tend to peak around **6-6.5 points** for all these platforms. Prime Video's catalogue seems to have quite a few low-rated movies (not to blame; it must be inevitable when you have such a huge list of movies to offer on demand).

# Runtime/Platform/Age-Group distribution:


Since I previously had to drop the Age column due to the high number of NaN values it contained, I will start by re-loading the dataset csv file.

In [None]:
# Load dataset again (the Age column was dropped from the previous frame)
df = pd.read_csv(r"/kaggle/input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv")
df = df.iloc[:,1:]
df.ID = df.ID.astype("object")

# Melt the platform indicator columns (from wide to long) to create visualisations.
# The platform columns are named explicitly: without `value_vars` the text
# columns (Directors, Genres, Country, Language) would be melted too and only
# removed afterwards by the `value == 1` filter.
df2 = pd.melt(df,
              id_vars=['ID','Title','Year','Age','IMDb',"Rotten Tomatoes","Type", "Runtime"],
              value_vars=["Netflix", "Hulu", "Prime Video", "Disney+"],
              var_name="platform")

# Keep one row per (movie, platform it is available on) and drop the indicator
df2 = df2.loc[df2["value"] == 1].drop(columns="value")

In [None]:
# Total runtime (bars are aggregated with `sum`) on each platform,
# split by age group
ax = sns.barplot(
    data=df2,
    x="platform",
    y="Runtime",
    hue="Age",
    estimator=sum,
)

> This is an interesting plot. We make a few notes out of this plot:
> * Prime Video seems to be king in 18+ content
> * Hulu's main target segment also seems to be aged 18+    
> * Even though Netflix also peaks at the 18+, it provides quite a lot of content for the younger 13+ audiences
> * Disney+ provides content for the younger and all audiences

# 4. Recommendation Systems

There are three main methods used to build recommendation systems:

* **Content-Based Methods**: Define a model for users and items it interacted with. Recommends the user similar items based on item features.
* **Collaborative Filtering Methods**: Can filter out items that a user might like on the basis of reactions by similar users.
* **Hybrid Methods**: Use both content and collaborative methods to achieve a better result.

# Reload

In [None]:
# Start again from the raw CSV so earlier column deletions are undone;
# the unknown indexing column in first position is skipped
df = pd.read_csv(
    r"/kaggle/input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv"
).iloc[:, 1:]

# Missing values per column, as a count and as a percentage of all rows
missing_values = df.isnull().sum().to_frame(name="missing_count")
missing_values["missing_%"] = missing_values["missing_count"] / len(df.ID) * 100

# Drop the columns with more than 50% missing values, then every row that
# still lacks a value in one of the listed columns, and renumber the rows
df = (
    df.drop(columns=['Rotten Tomatoes', 'Age'])
      .dropna(subset=['IMDb', 'Directors', 'Genres', 'Country', 'Language', 'Runtime'])
      .reset_index(drop=True)
)

# ID and Year are stored as object dtype
df = df.astype({'ID': 'object', 'Year': 'object'})

# Numerical System

This model will use cosine similarity to make recommendations.

# Preprocessing

In [None]:
# Select the numerical variables.
# ID and Year were cast to object in the reload cell, so they are excluded here.
num_df = df.select_dtypes(include=['float64', 'int64'])

In [None]:
# Rescale every numerical variable into the [0, 1] range with a min-max scaler
# and rebuild a DataFrame that carries the original column names
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(num_df)
trans_df = pd.DataFrame(scaled_values, columns=num_df.columns)
trans_df.head()

In [None]:
# Compute the pairwise cosine similarity between all movies
cos_sim = cosine_similarity(trans_df, trans_df)

# Reverse mapping from movie title to row index.
# Duplicates must be removed on the *index* (the titles): the values are the
# unique row labels, so `Series.drop_duplicates()` would not remove anything
# and a repeated title would map to several rows. The first occurrence wins.
indices = pd.Series(df.index, index=df['Title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices.head()

# Recommendation Function

* The function takes 2 arguments: movie title, similarity score.
* Looks up the row index of the given title in our series of indices.
* Gets the pairwise similarity scores of all the movies.
* Sorts the similarity scores in descending order and converts them into a list.
* Returns the top 10 movie titles from the dataset.

In [None]:
def give_recommendation(title, sig=cos_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the notebook-level ``indices`` series.
    sig : 2-D array-like, default ``cos_sim``
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Assumed to be aligned with the rows of the
        notebook-level ``df``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Row position of the requested title
    idx = indices[title]

    # A title that occurs more than once yields a Series of positions;
    # use the first occurrence instead of failing further down
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other movie with its similarity score. The queried movie is
    # excluded explicitly: skipping the first sorted element is wrong when
    # another movie ties with it on the top score.
    scored = [(pos, score) for pos, score in enumerate(sig[idx]) if pos != idx]

    # Most similar first (the sort is stable, so ties keep dataset order)
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the top matches
    movie_indices = [pos for pos, _ in scored[:top_n]]

    return df['Title'].iloc[movie_indices]
    

In [None]:
# Recommendations for "The Matrix" based on the numerical features only
give_recommendation("The Matrix", sig=cos_sim)

> The result seems to be pretty poor since it is only based on movie ratings, runtimes and platform variables. This can be improved by using the other qualitative variables.

# Numerical & Textual System

This model will convert text columns into a single column and then use TF-IDF Vectorizer to create a sparse matrix of all the words TF-IDF score. Then the scaled quantitative variables will be added to the sparse matrix.

In [None]:
# Start again from the raw CSV so earlier changes are undone;
# the unknown indexing column in first position is skipped
df = pd.read_csv(
    r"/kaggle/input/movies-on-netflix-prime-video-hulu-and-disney/MoviesOnStreamingPlatforms_updated.csv"
).iloc[:, 1:]

# Missing values per column, as a count and as a percentage of all rows
missing_values = df.isnull().sum().to_frame(name="missing_count")
missing_values["missing_%"] = missing_values["missing_count"] / len(df.ID) * 100

# Drop the columns with more than 50% missing values, then every row that
# still lacks a value in one of the listed columns, and renumber the rows
df = (
    df.drop(columns=['Rotten Tomatoes', 'Age'])
      .dropna(subset=['IMDb', 'Directors', 'Genres', 'Country', 'Language', 'Runtime'])
      .reset_index(drop=True)
)

# ID and Year are stored as object dtype
df = df.astype({'ID': 'object', 'Year': 'object'})

# Preprocessing

* Selecting all object data types and storing them in a list.
* Removing ID and Title column.
* Joining all text/object columns using commas into a single column.
* Creating a tokenizer to remove unwanted elements from the data like symbols and numbers.
* Converting the joined text into TF-IDF vectors (TfidfVectorizer); the quantitative columns are preprocessed separately in the following steps.
* Inserting quant variables into a DataFrame.
* Scaling quant variables using MinMaxScaler with range (0,1).
* Adding quant variables in the TF-IDF vectors sparse matrix using hstack function (add horizontal arrays into a sparse matrix).

In [None]:
# Column dtypes of the reloaded frame; preprocess() selects columns by dtype
df.info()

In [None]:
def preprocess(df):
    """Build the combined text + numeric feature matrix for the recommender.

    All object-dtype columns except ``ID`` and ``Title`` are joined into one
    comma-delimited string per row and turned into TF-IDF vectors. The
    float64/int64 columns are min-max scaled to [0, 1] and six of them are
    appended to the TF-IDF matrix as extra columns.

    The input frame is not modified, so the function can be called repeatedly
    on the same frame with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns ``IMDb``, ``Netflix``, ``Hulu``,
        ``Prime Video``, ``Disney+`` and ``Runtime``.

    Returns
    -------
    scipy sparse matrix
        TF-IDF columns followed by the scaled IMDb, Netflix, Hulu,
        Prime Video, Disney+ and Runtime columns, in that order.
    """
    # Text columns: every object column except the identifiers
    text_columns = [
        col for col in df.select_dtypes(include=['object']).columns
        if col not in ('ID', 'Title')
    ]

    # Join all text/object columns of a row, delimited by comma. Kept as a
    # local Series: writing it back as a column would make a second call
    # feed the joined text into itself.
    all_text = df[text_columns].apply(
        lambda row: ",".join(row.dropna().astype(str)), axis=1
    )

    # Tokenizer that keeps alphabetic runs only (drops symbols, numbers...)
    token = RegexpTokenizer(r'[a-zA-Z]+')

    # TF-IDF matrix of the joined text
    cv = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1,1), tokenizer=token.tokenize)
    text_counts = cv.fit_transform(all_text)

    # Select quantitative variables and scale them into [0, 1]
    num_df = df.select_dtypes(include=['float64', 'int64'])
    scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
    scaled_num_df = pd.DataFrame(scaler.fit_transform(num_df), columns=num_df.columns)

    # Append the quantitative variables to the TF-IDF matrix in one stacking
    # step; the column order matches the original sequential stacking
    numeric_columns = ['IMDb', 'Netflix', 'Hulu', 'Prime Video', 'Disney+', 'Runtime']
    numeric_block = scaled_num_df[numeric_columns].values
    X_train_dtm = hstack((text_counts, numeric_block))

    return X_train_dtm

In [None]:
# Preprocessing data: build the combined TF-IDF + scaled numeric feature matrix
mat = preprocess(df)
# (rows, TF-IDF vocabulary size + appended numeric columns)
mat.shape

# Improved recommendations:

In [None]:
# Compute the pairwise cosine similarity on the combined feature matrix
sig2 = cosine_similarity(mat, mat)

# Reverse mapping from movie title to row index.
# Duplicates must be removed on the *index* (the titles): the values are the
# unique row labels, so `Series.drop_duplicates()` would not remove anything
# and a repeated title would map to several rows. The first occurrence wins.
indices = pd.Series(df.index, index=df['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Getting a recommendation
give_recommendation("The Matrix", sig=sig2)

> This model clearly produces a higher-quality result, with films that are pretty similar to "The Matrix". By adding more relevant data, such as descriptive text, content-based recommendation systems can improve their performance further.