# Imports

In [None]:
import numpy as np
import pandas as pd 
import datetime as dt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Loading the dataset

In [None]:
def loader(excel, columns_to_drop=("Unnamed: 0", "date_added")):
    """Load the streaming catalogue from an Excel workbook.

    Parameters
    ----------
    excel : str or file-like
        Anything ``pandas.read_excel`` accepts.
    columns_to_drop : iterable of str, optional
        Columns removed after loading. The default matches the previous
        hard-coded behaviour. Columns absent from the sheet are skipped
        instead of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The sheet content without the dropped columns.
    """
    df = pd.read_excel(excel)

    # errors="ignore": a workbook saved without the index column has no
    # "Unnamed: 0" and would otherwise make the whole load fail.
    return df.drop(columns=list(columns_to_drop), errors="ignore")

# Load the catalogue once; the workbook is referenced by a relative path.
streaming_content = loader('streaming_content.xlsx')

# Customization (optional)

### Platforms

Possible answers :

Netflix, Disney +, Prime Video, Hulu, all.

In [None]:
# Ask whether every platform should be used. The answer is normalised so
# that "No", " no " and "NO" are all understood as "no".
filter_platform = input("Do you want recommendations based on all the platforms ? : ").strip().lower()

# Platforms to exclude; stays empty when the user keeps all platforms.
list_platform = []

if filter_platform == 'no':

    ptf_number = int(input("Number of non desired platforms ? : "))

    for _ in range(ptf_number):
        # Surrounding blanks are removed because the names are later compared
        # for exact equality with the "origin" column.
        list_platform.append(input("Non desired platform : ").strip())

### Rating IMDb

Secondly, what is the minimum IMDb rating the movie or TV show must have? (IMDb data is not available for every title in the dataset.)

Any value from 0.1 to 10; decimal numbers are accepted (0 means that the IMDb rating is not a relevant feature).

--------------------------

Thirdly, what is the minimum number of IMDb reviews the movie or TV show must have?

No upper limit (0 means that the number of IMDb reviews is not a relevant feature).

In [None]:
# Minimum IMDb score; 0 disables the rating filter (see filtering_imdb).
rating = float(input("Minimum rating on IMDb ? : "))

# Minimum number of IMDb votes; 0 disables the filter.
# NOTE(review): the name is spelled "rewiews"; it is kept as-is because the
# filtering cell further down reads it under this name.
rewiews = int(input("Minimum number of reviews ? : "))

### Country

Please select a country. If you have no preference, enter 0.

In [None]:
# Country to keep; the literal answer "0" disables the country filter.
country = str(input("Selected country : "))

### Release year

Please select a year and specify whether you want only the shows released that year (`equal`) or the shows released that year and in the following ones (`minimum`).

Enter 0 in one or both categories if year is not an important feature for you. 

In [None]:
# Reference release year; 0 disables the year filter.
year = int(input("Enter a release year : "))
# Comparison mode read by filtering_year: "equal" or "minimum"; any other
# answer leaves the data unfiltered.
quality = str(input("Show of the year ? Minimum year of release ? : "))

### Type

Do you want recommendations for TV Show or for Movies ? (tvshow, movie, all)

In [None]:
# Content type: "tvshow" or "movie"; any other answer keeps both types.
show = str(input("TV Show, Movie or all : "))


### Classification

For this algorithm, we use a simplified version of the American rating :

Not Rated, All ages, 7+, PG-13, 16+, R-Rated

In [None]:
# Classifications to exclude; stays empty when no filter is requested.
list_filter_classification = []

# The answer is normalised so that "Yes", " yes " and "YES" are accepted.
filter_classification = input("Do you want any filter ? : ").strip().lower()

if filter_classification == "yes":

    number_filter = int(input("Number of classification filter : "))

    for _ in range(number_filter):
        # Surrounding blanks are removed because the values are later compared
        # for exact equality with the "rating" column.
        list_filter_classification.append(input("Non-Desired classification : ").strip())

### Category

Please enter how many category filters you want to apply (these are negative filters: the categories you enter are excluded).

Please enter the filter with the first letter in capital. 

In [None]:
# How many categories the user wants to exclude.
filter_number = int(input("Number of category filter : "))

# One prompt per excluded category, collected in entry order.
list_filter_category = []
list_filter_category.extend(
    input("Enter a non desired category : ") for _ in range(filter_number)
)

# Content Based Recommendation Algorithm

### Data Cleaning

Bias warning: categories are not labelled the same way from one platform to another, so adding them to the "soup" (bag of words) could skew the results: most of the recommendations would then come from a single platform.

In [None]:
# Text columns merged into the "soup" used for similarity.
# Alternative feature sets are kept below for experimentation; the active one
# leaves out "listed_in".
#features = ["title","director","cast","listed_in","description"]
features = ["title","director","cast","description"]
#features = ["title","description"]
#features = ["title","director","description"]

In [None]:
# Keep only the text columns selected in `features`.
reco_streaming = streaming_content[features]

In [None]:
def fillna(df):
    """Return a copy of ``df`` where every missing value is a single space."""
    placeholder = " "
    filled = df.fillna(value=placeholder)
    return filled

# Blank out missing text so the string clean-up below can run on every cell.
reco_streaming = fillna(reco_streaming)

In [None]:
def clean_data(col):
    """Normalise one cell: cast to text, remove every space, lower-case.

    The value is converted with ``str`` first so that non-text cells (for
    example a title made only of digits and read from Excel as a number) are
    cleaned instead of raising ``AttributeError``.
    """
    return str(col).replace(" ", "").lower()

In [None]:
def cleaner(df, feat):
    """Return a copy of ``df`` whose ``feat`` columns are space-free lower-case text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to normalise. It is left untouched.
    feat : iterable of str
        Names of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame; every listed column is cast to text, stripped of all
        spaces and lower-cased.
    """
    # Work on a copy: assigning into the argument would modify the caller's
    # frame (and warn when that frame is a slice of another one).
    cleaned = df.copy()

    for feature in feat:
        # astype(str) lets numeric cells go through the string operations.
        as_text = cleaned[feature].astype(str)
        cleaned[feature] = as_text.str.replace(" ", "", regex=False).str.lower()

    return cleaned

# Normalise every selected text column (spaces removed, lower-cased).
reco_streaming = cleaner(reco_streaming, features)

### Creation of a soup 

Soup = bag of words, a representation of the corpus: all the words describing a show are merged into a single text that summarises the show and all the information about it. Stop words will be removed.

In [None]:
def create_soup(df):
    """Add a ``soup`` column joining title, director, cast and description.

    The four columns are concatenated in that order with a single space
    between them. The column is written into ``df`` itself, which is also
    returned.
    """
    # "listed_in" is deliberately left out of the soup (see `features`).
    merged = df['title']
    for column in ('director', 'cast', 'description'):
        merged = merged + ' ' + df[column]

    df["soup"] = merged
    return df

# Build the per-title text that will be vectorised.
reco_streaming = create_soup(reco_streaming)

### Pre-processing

In [None]:
def pre_processing(df):
    """Vectorise the soup and compute pairwise cosine similarities.

    Returns a 3-tuple:
    - the frame after ``reset_index()`` (the former index becomes a column),
    - the square cosine-similarity matrix between all rows,
    - a Series mapping each ``title`` value to its row position.
    """
    # Word counts per title, English stop words excluded.
    vectorizer = CountVectorizer(stop_words='english')
    token_counts = vectorizer.fit_transform(df['soup'])

    similarity_matrix = cosine_similarity(token_counts, token_counts)

    indexed = df.reset_index()
    title_to_position = pd.Series(indexed.index, index=indexed['title'])

    return indexed, similarity_matrix, title_to_position

# Similarity matrix and title -> row-position lookup for the whole catalogue.
reco_streaming, cosine_sim, index = pre_processing(reco_streaming)

### Similarity

In [None]:
def similarity(title, cos_sim, idx, df1):
    """Rank every catalogue row by similarity to ``title``.

    Parameters
    ----------
    title : str
        Title as typed by the user; spaces are removed and the text is
        lower-cased before the lookup, matching the cleaned titles.
    cos_sim : 2-D array
        Pairwise similarity matrix; row ``i`` holds the scores of row ``i``.
    idx : pandas.Series
        Maps cleaned titles to row positions in ``cos_sim``.
    df1 : any
        Unused; kept so existing calls keep working.

    Returns
    -------
    list of (int, float)
        ``(row position, score)`` pairs sorted by decreasing score.

    Raises
    ------
    KeyError
        If the cleaned title is not present in ``idx``.
    """
    key = title.replace(' ', '').lower()

    try:
        position = idx[key]
    except KeyError:
        raise KeyError("Title not found in the catalogue: {!r}".format(title)) from None

    # A title present several times yields a Series of positions; indexing
    # the matrix with it would return several rows and break the sort, so
    # the first occurrence is used.
    if isinstance(position, pd.Series):
        position = position.iloc[0]

    scored = enumerate(cos_sim[position])

    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# Ask for a reference title and score every catalogue entry against it.
simil = similarity(input('Enter a movie or a TV show name : '), cosine_sim, index, streaming_content)

In [None]:
def dataframe_simil(sim):
    """Turn ``(position, score)`` pairs into a frame with columns ``index`` and ``similarity``."""
    column_names = ["index", "similarity"]
    return pd.DataFrame(data=sim, columns=column_names)

# NOTE(review): this rebinds the name `similarity`, which until here referred
# to the function defined above; re-running the previous cell afterwards
# fails unless that function is defined again.
similarity = dataframe_simil(simil)

### Filtering dataset

In [None]:
def new_set(xlsx, df1):
    """Reload the catalogue and attach the similarity scores to it.

    The workbook is read with ``loader``, its row index is exposed as an
    ``index`` column, and ``df1`` is left-joined on that column so every
    catalogue row is kept.
    """
    catalogue = loader(xlsx).reset_index()

    # Left join: rows without a score are kept.
    return catalogue.merge(df1, how='left', left_on='index', right_on='index')

# Full catalogue (all columns) with the similarity score of each row.
recommendation = new_set("streaming_content.xlsx", similarity)

In [None]:
def filtering_plateforms(df, list_ptf):
    """Remove the rows whose ``origin`` equals one of the platforms in ``list_ptf``.

    With an empty list the frame is returned unchanged.
    """
    if len(list_ptf) == 0:
        return df

    excluded = df["origin"].isin(list_ptf)
    return df[~excluded]

# Drop the titles coming from the platforms the user excluded.
recommendation = filtering_plateforms(recommendation, list_platform)

In [None]:
def filtering_imdb(df, rating, review):
    """Keep rows meeting the minimum IMDb rating and vote count.

    A threshold equal to 0 switches the corresponding filter off:
    ``rating`` is compared with ``averageRating`` and ``review`` with
    ``numVotes``, both with ``>=``.
    """
    if rating != 0:
        df = df[df["averageRating"] >= rating]

    if review != 0:
        df = df[df["numVotes"] >= review]

    return df

# Apply the IMDb thresholds entered earlier.
recommendation = filtering_imdb(recommendation, rating, rewiews)

In [None]:
def filtering_country(df, cnt):
    """Keep the rows whose ``country`` equals ``cnt``; ``"0"`` means no filter."""
    if cnt == "0":
        return df

    same_country = df["country"] == cnt
    return df[same_country]

# Restrict to the selected country, if any.
recommendation = filtering_country(recommendation, country)

In [None]:
def filtering_year(df, year, qual):
    """Filter on ``release_year``.

    ``qual == "equal"`` keeps the rows released exactly in ``year``;
    ``qual == "minimum"`` keeps the rows released in ``year`` or later.
    A ``year`` of 0 or any other ``qual`` leaves the frame unchanged.
    """
    if year == 0:
        return df

    if qual == "equal":
        return df[df["release_year"] == year]

    if qual == "minimum":
        return df[df["release_year"] >= year]

    return df

# Apply the release-year preference.
recommendation = filtering_year(recommendation, year, quality)

In [None]:
def filtering_type(df, shw):
    """Keep only TV shows (``"tvshow"``) or movies (``"movie"``).

    Any other value of ``shw`` returns the frame unchanged.
    """
    # User keyword -> value stored in the "type" column.
    labels = {"tvshow": "TV Show", "movie": "Movie"}

    wanted = labels.get(shw)
    if wanted is None:
        return df

    return df[df["type"] == wanted]

# Keep only the requested content type.
recommendation = filtering_type(recommendation, show)

In [None]:
def filtering_classification(df, list_classif):
    """Remove the rows whose ``rating`` equals one of the values in ``list_classif``.

    With an empty list the frame is returned unchanged.
    """
    if len(list_classif) == 0:
        return df

    unwanted = df["rating"].isin(list_classif)
    kept = df[~unwanted]
    return kept

# Drop the age classifications the user excluded.
recommendation = filtering_classification(recommendation, list_filter_classification)

In [None]:
def filtering_category(df, filter_list):
    """Remove the rows whose ``listed_in`` text contains an excluded category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a text column ``listed_in``.
    filter_list : list of str
        Categories to exclude, matched as literal, case-sensitive substrings.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the matching rows. Rows with a missing ``listed_in``
        are kept, since they cannot contain an excluded category.
    """
    for category in filter_list:
        # regex=False: user input such as "(Action)" or "Sci-Fi+" must not be
        # interpreted as a regular expression.
        # na=False: a missing value would otherwise put NaN in the mask and
        # make the negation below fail.
        has_category = df["listed_in"].str.contains(category, regex=False, na=False)
        df = df[~has_category]

    return df

# Drop the categories the user excluded.
recommendation = filtering_category(recommendation, list_filter_category)

### Recommendation

In [None]:
def get_recommendations(df1, top_n=10):
    """Return the best-scored titles, skipping the very first one.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding a ``similarity`` column plus the displayed columns.
    top_n : int, optional
        Number of rows to return (default 10, the previous fixed size).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``description``, ``averageRating``, ``origin`` and
        ``type`` for ranks 2 to ``top_n + 1`` by decreasing similarity.
    """
    shown = ["title", "description", "averageRating", "origin", "type"]

    ranked = df1.sort_values(by="similarity", ascending=False)

    # The top row is skipped by position, not by label: a label-based drop
    # would also remove any other row sharing that index label.
    # NOTE(review): the top row is presumably the queried title itself; if a
    # filter removed it, this skips the best real match instead - confirm.
    return ranked[shown].iloc[1:top_n + 1].copy()

In [None]:
# Final ranked list built from the filtered catalogue.
final_recommendation = get_recommendations(recommendation)

In [None]:
final_recommendation

# Content Based Algorithm : Features and New

We will add a recency criterion and filter all the results with it.

### New filtering

How long does it take for a film to become old? (value has to be put in days)

In [None]:
# Age threshold in days: titles added more than this many days ago are
# treated as old.
days_new = int(input("How long does it take for a film to become old ? : "))

# Cut-off date = current local time minus the threshold (naive datetime).
date_new = dt.datetime.now()-dt.timedelta(days = days_new)

### Filtering 

In [None]:
def loader_new(excel, columns_to_drop=("Unnamed: 0",)):
    """Load the catalogue from an Excel workbook, keeping ``date_added``.

    Parameters
    ----------
    excel : str or file-like
        Anything ``pandas.read_excel`` accepts.
    columns_to_drop : iterable of str, optional
        Columns removed after loading. The default matches the previous
        hard-coded behaviour. Columns absent from the sheet are skipped
        instead of raising ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The sheet content without the dropped columns.
    """
    df = pd.read_excel(excel)

    # errors="ignore": a workbook saved without the index column has no
    # "Unnamed: 0" and would otherwise make the whole load fail.
    return df.drop(columns=list(columns_to_drop), errors="ignore")

In [None]:
def new_set_new(xlsx, df1):
    """Reload the catalogue (with ``date_added``) and attach similarity scores.

    The workbook is read with ``loader_new``, its row index is exposed as an
    ``index`` column, and ``df1`` is left-joined on that column so every
    catalogue row is kept.
    """
    with_positions = loader_new(xlsx).reset_index()

    # Left join: rows without a score are kept.
    scored = with_positions.merge(df1, how='left', left_on='index', right_on='index')
    return scored

# Catalogue including "date_added", with the similarity score of each row.
recommendation_new = new_set_new("streaming_content.xlsx", similarity)

In [None]:
def new_filtering(df, date):
    """Keep the rows added on or after ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date_added`` column (datetimes or date-like text).
    date : datetime.datetime
        Cut-off; rows with ``date_added >= date`` are kept.

    Returns
    -------
    pandas.DataFrame
        The recent rows. Rows whose ``date_added`` is missing or cannot be
        parsed are dropped.
    """
    # Coerce first: comparing a text column with a datetime raises TypeError,
    # and unparseable cells become NaT, which never satisfies ">=".
    added_on = pd.to_datetime(df["date_added"], errors="coerce")

    return df[added_on >= date]

# Keep only the titles added after the cut-off date.
recommendation_new = new_filtering(recommendation_new, date_new)

### Recommendation

In [None]:
def get_recommendations_new(df1, top_n=5):
    """Return the best-scored recent titles, skipping the very first one.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding a ``similarity`` column plus the displayed columns.
    top_n : int, optional
        Number of rows to return (default 5, the previous fixed size).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``description``, ``averageRating``, ``origin`` and
        ``type`` for ranks 2 to ``top_n + 1`` by decreasing similarity.
    """
    shown = ["title", "description", "averageRating", "origin", "type"]

    ranked = df1.sort_values(by="similarity", ascending=False)

    # The top row is skipped by position, not by label: a label-based drop
    # would also remove any other row sharing that index label.
    # NOTE(review): the top row is presumably the queried title itself; if the
    # recency filter removed it, this skips the best real match - confirm.
    return ranked[shown].iloc[1:top_n + 1].copy()

# Final ranked list restricted to recently added titles.
final_new_recommendation = get_recommendations_new(recommendation_new)

In [None]:
final_new_recommendation