In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("rating.csv")

In [3]:
print("Ratings Data info: ")
data.info()

Ratings Data info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB


In [4]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
movies_data  = pd.read_csv("movie.csv")

In [6]:
print("Movies Data info: ")
movies_data.info()

Movies Data info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [7]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Sample 100 random records from the movies dataset
# The `random_state=42` ensures that the sample is reproducible
# Every time the code runs with this seed, the same 100 records will be sampled
movies_sample = movies_data.sample(n=100, random_state=42)

In [9]:
# Filter the ratings dataset to include only ratings for movies that are in the sampled movies dataset
# `ratings.movieId.isin(movies_sample.movieId)` creates a boolean mask to filter rows where movieId is in the sampled movies
# `.groupby('movieId').head(100)` ensures that only up to 100 ratings per movie␣are kept in case there are more than 100 ratings for a movie
ratings_sample = data[data.movieId.isin(movies_sample.movieId)].groupby('movieId').head(100)

In [10]:
# Merge the filtered ratings dataset with the sampled movies dataset
# This combines the ratings and movies DataFrames based on the common 'movieId' column
# The result is a DataFrame containing ratings along with movie titles and genres for the sampled movies
merged_df = pd.merge(ratings_sample, movies_sample, on='movieId')

In [11]:
# Print the first few rows of the merged DataFrame
# This allows us to verify the results of the merge operation and inspect the combined data
print(merged_df.head())

   userId  movieId  rating            timestamp  \
0       1     2021     4.0  2005-04-02 23:52:09   
1       3     2366     4.0  1999-12-11 13:18:30   
2       7     2195     2.0  2002-01-16 18:36:28   
3      11     1255     4.0  2009-01-01 05:00:24   
4      11    35836     5.0  2009-01-01 04:18:02   

                            title                           genres  
0                     Dune (1984)                 Adventure|Sci-Fi  
1                King Kong (1933)  Action|Adventure|Fantasy|Horror  
2               Dirty Work (1998)                           Comedy  
3                Bad Taste (1987)             Comedy|Horror|Sci-Fi  
4  40-Year-Old Virgin, The (2005)                   Comedy|Romance  


In [12]:
# Create a user-item matrix from the merged DataFrame
# This matrix has users as rows and movies as columns, with ratings as the values
# `index='userId'` specifies that rows should be indexed by user IDs
# `columns='movieId'` specifies that columns should be based on movie IDs
# `values='rating'` specifies that the matrix values should be the ratings
# `.fillna(0)` replaces missing values (NaNs) with 0, assuming unrated movies should have a rating of 0
user_item_matrix = merged_df.pivot_table(index='userId', columns='movieId',values='rating').fillna(0)

In [13]:
# Convert the user-item matrix (a DataFrame) to a NumPy array
# This is done to facilitate matrix factorization, which often works with NumPy arrays
# `user_item_matrix.values` extracts the underlying NumPy array from the DataFrame
R = user_item_matrix.values

In [14]:
# Print information about the user-item matrix
# This includes details such as the data types, number of entries, and memory usage
# It helps in verifying the structure and content of the matrix
print("User-Item Matrix Info:")
print(user_item_matrix.info())

User-Item Matrix Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2801 entries, 1 to 138339
Data columns (total 99 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   585     2801 non-null   float64
 1   1110    2801 non-null   float64
 2   1255    2801 non-null   float64
 3   1488    2801 non-null   float64
 4   1496    2801 non-null   float64
 5   1645    2801 non-null   float64
 6   2021    2801 non-null   float64
 7   2043    2801 non-null   float64
 8   2195    2801 non-null   float64
 9   2366    2801 non-null   float64
 10  3455    2801 non-null   float64
 11  3869    2801 non-null   float64
 12  4281    2801 non-null   float64
 13  4514    2801 non-null   float64
 14  4755    2801 non-null   float64
 15  4834    2801 non-null   float64
 16  5515    2801 non-null   float64
 17  5848    2801 non-null   float64
 18  6219    2801 non-null   float64
 19  6590    2801 non-null   float64
 20  7621    2801 non-null   float64
 21  8380    2801 non-

In [15]:
# Print the first few rows of the user-item matrix
# This provides a snapshot of the matrix, showing how users and movies are represented and the ratings assigned
# It helps to quickly verify the structure and initial data of the matrix
print("\nUser-Item Matrix Head:")
print(user_item_matrix.head())


User-Item Matrix Head:
movieId  585     1110    1255    1488    1496    1645    2021    2043    \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     0.0     4.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
7           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
11          0.0     0.0     4.0     0.0     0.0     0.0     0.0     0.0   
12          3.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  2195    2366    ...  119161  120124  124430  126064  126245  127100  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     4.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
7           2.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
11          0.0     0.0  ...     0.0     0.0     0

In [16]:
def matrix_factorization(R, K, steps=50, alpha=0.0002, beta=0.02):
    """
    Perform matrix factorization using stochastic gradient descent.
    Parameters:
    R (numpy array): User-item rating matrix (user x item).
    K (int): Number of latent features.
    steps (int): Number of iterations for the optimization.
    alpha (float): Learning rate.
    beta (float): Regularization parameter.
    Returns:
    P (numpy array): User feature matrix.
    Q (numpy array): Item feature matrix.
    """
    # Number of users (rows) and items (columns) in the rating matrix
    N = len(R)
    M = len(R[0])
    
    # Initialize user and item feature matrices with random values
    P = np.random.rand(N, K)  # User feature matrix (N x K)
    Q = np.random.rand(M, K)  # Item feature matrix (M x K)
    
    # Perform optimization over a number of steps
    for step in range(steps):
        # Update user and item matrices
        for i in range(N):  # Loop over each user
            for j in range(M):  # Loop over each item
                if R[i][j] > 0:  # Only consider non-zero ratings
                    # Compute prediction error for the current user-item pair
                    eij = R[i][j] - np.dot(P[i, :], Q[j, :].T)
                    # Update user and item feature matrices
                    for k in range(K):  # Loop over each latent feature
                        P[i][k] += alpha * (2 * eij * Q[j][k] - beta * P[i][k])
                        Q[j][k] += alpha * (2 * eij * P[i][k] - beta * Q[j][k])
        
        # Compute the total error for the current iteration
        error = 0
        for i in range(N):  # Loop over each user
            for j in range(M):  # Loop over each item
                if R[i][j] > 0:  # Only consider non-zero ratings
                    # Compute the squared error for the current user-item pair
                    error += pow(R[i][j] - np.dot(P[i, :], Q[j, :].T), 2)
                    # Add regularization terms to the error
                    for k in range(K):  # Loop over each latent feature
                        error += (beta / 2) * (pow(P[i][k], 2) + pow(Q[j][k], 2))
        
        # Break the loop if error is sufficiently low
        if error < 0.001:
            break
    
    # Return the final user and item feature matrices
    return P, Q

In [17]:
# Number of latent features
K = 2 # Set the number of latent features (dimensionality of the feature vectors) to 2
# Perform matrix factorization
P, Q = matrix_factorization(R, K, steps=50)
# Call the matrix_factorization function to factorize the user-item matrix R into
# user feature matrix P and item feature matrix Q using 2 latent features.
# The function will run for 50 iterations.

In [18]:
# Calculate predicted ratings
predicted_R = np.dot(P, Q.T)

In [19]:
print("\nPredicted user-item rating matrix:")
print(predicted_R)


Predicted user-item rating matrix:
[[2.72480392 2.23052504 3.8770461  ... 1.02955336 0.91750729 1.19881995]
 [1.79214457 1.35147532 2.83729464 ... 0.85584623 0.88645066 0.5588164 ]
 [1.26126379 0.96954757 1.95103518 ... 0.5738497  0.57876982 0.42987395]
 ...
 [0.91096341 0.73216845 1.32985844 ... 0.36514738 0.33991261 0.37387369]
 [2.05157346 1.62423785 3.05629725 ... 0.8604931  0.82592784 0.79296886]
 [1.01850733 0.8913433  1.30603663 ... 0.29579151 0.20193563 0.56255483]]
