In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import string
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


pd.set_option('display.float_format', lambda x: '%.5f' % x)


data_path = {
    'ratings': 'ratings.txt',
    'movies': 'netflix_titles.csv',
    'users': 'users.txt'
}

In [2]:

def load_data(data: str) -> pd.DataFrame:
    """
    Loads data from specified csv

    Args:
        data (str): name of the data 

    Returns:
        pd.DataFrame: dataframe with data loaded
    """
    df = pd.read_csv(data_path[data], \
                        sep=',', \
                        engine='python', \
                        encoding='utf_8_sig')

    return df






def movie_data_treatment(df_movies: pd.DataFrame) -> pd.DataFrame:
    """
    Creates custom id column and drops unnecessary columns

    Args:
        df_movies (pd.DataFrame): movies data 

    Returns:
        pd.DataFrame: dataframe with treated data
    """    
    # creates custom id column
    df_movies['idMovie'] = [i for i in range(1, len(df_movies) + 1)]
    # first 3952 movies
    movies_id = [i for i in range(0, 3952)]
    df_movies = df_movies.iloc[movies_id]
    # selects necessary columns
    movie_keep_cols = ['title', 'idMovie', 'description']
    df_movies = df_movies[movie_keep_cols]
    
    return df_movies
    
    
    
    
def ratings_data_treatment(df_ratings: pd.DataFrame) -> pd.DataFrame:
    # drop timestamp column
    df_ratings = df_ratings.drop('timestamp', axis=1)
    return df_ratings




In [3]:


def clean_descriptions(df_movies: pd.DataFrame) -> pd.DataFrame:
    df_movies["description"] = df_movies["description"].str.lower()
    df_movies["description"] = df_movies["description"].apply(word_tokenize)


    list_stopwords = set(stopwords.words("english") + list(punctuation))
    df_movies["description"] = df_movies["description"].apply(
        lambda x: [word for word in x if word not in list_stopwords]
    )


    df_movies["description"] = df_movies["description"].apply(
        lambda x: [word.translate(str.maketrans("", "", string.punctuation)) for word in x]
    )
    df_movies["description"] = df_movies["description"].apply(
        lambda x: [word for word in x if len(word) > 0]
    )

    df_movies["description"] = df_movies["description"].apply(lambda x: list(set(x)))

    df_movies["description"] = df_movies["description"].apply(
        lambda x: ' '.join(map(str, x))
    )
    return df_movies



def get_tfidfMatrix(df_movies: pd.DataFrame) -> pd.DataFrame:

    # removing stopwords
    tfidf = TfidfVectorizer(stop_words="english")

    # Replace NaN with an empty string
    df_movies["description"] = df_movies["description"].fillna("")

    # Construct the required TF-IDF matrix by fitting and transforming the data
    tfidf_matrix = tfidf.fit_transform(df_movies["description"])
    # Output the shape of tfidf_matrix
    tfidf_matrix.shape

    list(enumerate(tfidf.get_feature_names_out()))
    pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names_out(), index=df_movies.title)

    
    return tfidf_matrix



In [4]:

def get_similarity_matrix(tfidf_matrix: pd.DataFrame, df_movies: pd.DataFrame) -> pd.DataFrame:
    
    simmilarity = cosine_similarity(tfidf_matrix)
    cosine_simMatrix = pd.DataFrame( simmilarity )

    # cosine_simMatrix_names = cosine_simMatrix.copy()
    # movie_titles = df_movies["title"]
    # cosine_simMatrix_names.columns = movie_titles
    # cosine_simMatrix_names.index = movie_titles


    movie_index = df_movies.index
    cosine_simMatrix.index = movie_index
    cosine_simMatrix.columns = movie_index
    return cosine_simMatrix
    
    
    
def get_most_similar_movies(idMovie: int, df_movies: pd.DataFrame, \
                                        sim_matrix: pd.DataFrame) -> pd.DataFrame:

    idMovie_sim = sim_matrix[idMovie-1]
    index = idMovie_sim.index
    values = idMovie_sim.values
    #crea dataframe con los datos
    df_idMovie_sim = pd.DataFrame({'idMovie': index, 'similarity score': values})
    df_similarity = df_idMovie_sim.sort_values(by='similarity score', \
                                            axis=0, ascending=False)
    df_similarity = df_similarity.reset_index(drop=True)
    # drop target movie sim. score with itself 
    df_similarity = df_similarity[1:]

    #top 10 most similar
    df_similarity = df_similarity[:5]
    #get its information
    df_recomendation = pd.merge(df_similarity, df_movies, how='left', on='idMovie')

    return df_recomendation


In [5]:

def get_recommendation(idMovie: int) -> tuple:
    ratings = load_data('ratings')
    movies = load_data('movies')
    
    df_movies = movies.copy()
    # treat movie data
    df_movies = movie_data_treatment(df_movies)
    # treat ratings data
    ratings = ratings_data_treatment(ratings)        
    #clean description
    df_movies = clean_descriptions(df_movies)
    #create tfidf matrix
    tfidf_matrix = get_tfidfMatrix(df_movies)
    #get similarity matrix
    df_similarity = get_similarity_matrix(tfidf_matrix, df_movies)
    #get mosrt similar movies to target movie
    df_similar_movies = get_most_similar_movies(idMovie, df_movies, df_similarity)
    
    return df_similar_movies


if __name__ == "__main__":
    movies_recommendation = get_recommendation(1)
    print(movies_recommendation)
    

   idMovie  similarity score                              title  \
0     2674           0.12534            The Legacy of the Bones   
1     2760           0.11750                   Tree House Tales   
2     2429           0.11741                 Overnight Republic   
3     2380           0.11374  An Evening with Beverly Luff Linn   
4     1731           0.10207                Wrong Kind of Black   

                                         description  
0  baztán close murders mystery detective faced r...  
1  social media lessons journey navigating friend...  
2  small wins war ensue candidate province countr...  
3  discovers reconnect anything role production l...  
4  encounters boori racial navigate brother named...  
