In [7]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from scipy.sparse import linalg
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import surprise


In [8]:


# Render every float in DataFrame/Series output with five decimal places.
pd.set_option('display.float_format', lambda value: '%.5f' % value)


# Logical dataset name -> file holding it. The entries are bare file names,
# so they are resolved against the notebook's working directory.
data_path = dict(
    ratings='ratings.txt',
    movies='netflix_titles.csv',
    users='users.txt',
)

In [9]:


def load_data(data: str) -> pd.DataFrame:
    """
    Reads one of the configured datasets into a dataframe

    Args:
        data (str): key of the dataset in the module-level ``data_path``
            mapping

    Returns:
        pd.DataFrame: contents of the file, parsed as comma-separated text

    Raises:
        KeyError: if ``data`` is not a key of ``data_path``
    """
    source_file = data_path[data]
    # the pure-Python parser is requested explicitly via ``engine``
    return pd.read_csv(source_file, sep=',', engine='python')




def movie_data_treatment(df_movies: pd.DataFrame, n_movies: int = 3952) -> pd.DataFrame:
    """
    Keeps the first ``n_movies`` rows, gives them a 1-based ``idMovie`` and
    drops every column that is not needed afterwards

    The input dataframe is not modified; a new dataframe is returned.

    Args:
        df_movies (pd.DataFrame): movies data; must contain the 'title' and
            'type' columns
        n_movies (int): maximum number of leading rows to keep. Defaults to
            3952. Frames with fewer rows are returned whole instead of
            raising.

    Returns:
        pd.DataFrame: dataframe with the columns 'title', 'idMovie' and
            'type' (in that order), keeping the original row index

    Raises:
        ValueError: if ``n_movies`` is negative
        KeyError: if 'title' or 'type' is missing from ``df_movies``
    """
    if n_movies < 0:
        raise ValueError(f"n_movies must be non-negative, got {n_movies}")

    # A positional slice tolerates frames shorter than n_movies, whereas
    # indexing with an explicit list of positions raises IndexError.
    first_movies = df_movies.iloc[:n_movies]

    # assign() builds a new frame, so the caller's dataframe keeps its
    # original columns. Ids follow row order: 1 for the first row, and so on.
    first_movies = first_movies.assign(idMovie=range(1, len(first_movies) + 1))

    # selects necessary columns
    movie_keep_cols = ['title', 'idMovie', 'type']
    return first_movies[movie_keep_cols]
    
    

In [10]:

    
def ratings_data_treatment(df_ratings: pd.DataFrame) -> pd.DataFrame:
    """
    Removes the timestamp column from the ratings data

    Args:
        df_ratings (pd.DataFrame): ratings data; must contain a 'timestamp'
            column

    Returns:
        pd.DataFrame: new dataframe with every column except 'timestamp'

    Raises:
        KeyError: if ``df_ratings`` has no 'timestamp' column
    """
    return df_ratings.drop(columns=['timestamp'])




def create_rating_matrix(df_movies: pd.DataFrame, df_ratings: pd.DataFrame) -> pd.DataFrame:
    """
    Builds the user x movie rating matrix

    Args:
        df_movies (pd.DataFrame): movies data with an 'idMovie' column
        df_ratings (pd.DataFrame): ratings data with the 'idUser', 'idMovie'
            and 'rating' columns

    Returns:
        pd.DataFrame: one row per 'idUser', one column per 'idMovie' that
            appears in the ratings, holding the rating or 0 when the user
            has not rated the movie
    """
    # NOTE(review): the pivot below only reads idUser/idMovie/rating, so the
    # columns brought in by this left join are not used afterwards - confirm
    # whether the join is still needed.
    ratings_with_titles = df_ratings.merge(df_movies, how='left', on='idMovie')

    # build the rating matrix; 0 marks a movie the user has not rated
    user_movie = ratings_with_titles.pivot(
        index="idUser", columns="idMovie", values="rating"
    ).fillna(0)

    # drop any repeated column label, keeping its first occurrence
    is_first_occurrence = ~user_movie.columns.duplicated()
    return user_movie.loc[:, is_first_occurrence]


In [11]:

# Load each raw file and apply its cleaning step straight away.
movies = movie_data_treatment(load_data('movies'))
ratings = ratings_data_treatment(load_data('ratings'))

# Users as rows, movies as columns, 0 where a user gave no rating.
rating_matrix = create_rating_matrix(df_movies=movies, df_ratings=ratings)

# Hold out 20% of the matrix rows; the fixed seed makes the shuffle repeatable.
train, test = train_test_split(
    rating_matrix,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [13]:
# Inspect the idUser x idMovie matrix; pandas elides the middle rows and
# columns of a frame this large when rendering it.
rating_matrix

idMovie,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
idUser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
2,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
3,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
4,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
5,0.00000,0.00000,0.00000,0.00000,0.00000,2.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.00000,0.00000,0.00000,2.00000,0.00000,3.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
6037,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
6038,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000
6039,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000


In [None]:
# Factorise the training matrix with a truncated SVD. The matrix is stored
# in sparse (CSR) form before being handed to the estimator.
X = csr_matrix(train)
svd = TruncatedSVD(n_components=1250, n_iter=7, random_state=42)
trained_svd = svd.fit(X)  # fit() returns the fitted estimator itself
# Fraction of the total variance kept by the retained components.
print(svd.explained_variance_ratio_.sum())

# Project the training rows onto the retained components ...
svd_reduction = trained_svd.transform(X)

# ... and map them back to the original movie space to get the low-rank
# reconstruction that serves as the predicted ratings.
recovered_data = trained_svd.inverse_transform(svd_reduction)
# Label the reconstruction with the training frame's own row and column
# labels. Passing the whole frame as `columns` is not a valid set of labels,
# and omitting `index` would lose the idUser of each row.
df_prediction = pd.DataFrame(recovered_data, index=train.index, columns=train.columns)