In [24]:
#Access Google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score



# Load datasets
def load_datasets():
    ratingDataset = pd.read_csv('/content/drive/MyDrive/42913 SIN/ua.base', sep='\t', header=None, names=['user id', 'item id', 'rating', 'timestamp'])
    userDataset = pd.read_csv('/content/drive/MyDrive/42913 SIN/u.user', sep='|', header=None, names=['user id', 'age', 'gender', 'occupation', 'zip code'])
    movieDataset = pd.read_csv('/content/drive/MyDrive/42913 SIN/u.item', sep='|', header=None, encoding='latin1', names=['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown cat', 'Action', 'Adventure', 'Animation', ' Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical','Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])
    return ratingDataset, userDataset, movieDataset

# Check missing values
def check_missing_values(datasets):
    ratingDataset, userDataset, movieDataset = datasets
    print(ratingDataset.isnull().sum())
    print('---------------------------')
    print(userDataset.isnull().sum())
    print('---------------------------')
    print(movieDataset.isnull().sum())
    print('---------------------------')

# Perform Singular Value Decomposition (SVD)
def perform_svd(user_item_matrix):
    U, Sigma, Vt = np.linalg.svd(user_item_matrix, full_matrices=False)
    return U, Sigma, Vt

# Collaborative Filtering - Predict rating
def predict_rating(user_id, movie_id, user_item_matrix, user_similarities,k=10):
    similar_users_ratings = user_item_matrix.loc[:, movie_id]
    similar_users = similar_users_ratings.sort_values(ascending=False).index[:k]
    weighted_sum = 0
    total_similarity = 0
    for similar_user_id in similar_users:
        similarity = user_similarities[user_id][similar_user_id]
        rating = user_item_matrix.loc[similar_user_id, movie_id]
        weighted_sum += similarity * rating
        total_similarity += similarity
    if total_similarity == 0:
        return 0
    predicted_rating = weighted_sum / total_similarity
    return predicted_rating


# Collaborative Filtering - Recommend movies
def recommend_movies_CF(user_id, user_item_matrix, user_similarities, movie_id_to_name, num_recommendations=5):
    similar_users = sorted(list(enumerate(user_similarities[user_id])), key=lambda x: x[1], reverse=True)
    rated_movies = set(user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index)
    recommended_movies = []
    for user, similarity in similar_users:
        if len(recommended_movies) >= num_recommendations:
            break
        if user != user_id:
            movies_rated_by_similar_user = user_item_matrix.loc[user][user_item_matrix.loc[user] > 0].index
            for movie in movies_rated_by_similar_user:
                if movie not in rated_movies:
                    predicted_rating = predict_rating(user_id, movie, user_item_matrix, user_similarities)
                    recommended_movies.append((movie, predicted_rating))
                    if len(recommended_movies) >= num_recommendations:
                        break
    return recommended_movies


# Content-Based Filtering - Preprocess movie features
def preprocess_movie_features(movieDataset):
    movie_descriptions = [' '.join(map(str, row[1:])) for _, row in movieDataset.iterrows()]
    return movie_descriptions

# Content-Based Filtering - Recommend movies
def recommend_movies_CB(movie_title, cosine_sim_matrix, movieDataset, movie_id_to_name, top_n=5):
    idx = movieDataset[movieDataset['movie title'] == movie_title].index.values[0]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n + 1]
    recommended_movies = [(movie_id_to_name[movieDataset.iloc[idx]['movie id']], sim_score) for idx, sim_score in sim_scores]
    return recommended_movies

# Define RNN model architecture
def build_rnn_model(input_shape):
    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Map movie ID to movie name
def map_movie_id_to_name(movieDataset):
    movie_id_to_name = {}
    for index, row in movieDataset.iterrows():
        movie_id = row['movie id']
        movie_title = row['movie title']
        movie_id_to_name[movie_id] = movie_title
    return movie_id_to_name

# Split the dataset into train and test sets
def split_data(user_item_matrix):
    X = user_item_matrix.iloc[:, :-1].values  # Features (user-item interactions)
    y = user_item_matrix.iloc[:, -1].values   # Target ratings

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

# Function to evaluate Collaborative Filtering
def evaluate_collaborative_filtering(actual_ratings, predicted_ratings, threshold=3.5):
    # Calculate RMSE
    rmse = mean_squared_error(actual_ratings, predicted_ratings, squared=False)
    print("Root Mean Squared Error (RMSE):", rmse)

    # Calculate MAE
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    print("Mean Absolute Error (MAE):", mae)

    # Convert ratings to binary for Precision and Recall
    actual_binary = (actual_ratings > threshold).astype(int)
    predicted_binary = (predicted_ratings > threshold).astype(int)

    # Calculate Precision
    precision = precision_score(actual_binary, predicted_binary)
    print("Precision:", precision)

    # Calculate Recall
    recall = recall_score(actual_binary, predicted_binary)
    print("Recall:", recall)

    # Calculate F1 Score
    f1 = f1_score(actual_binary, predicted_binary)
    print("F1 Score:", f1)

    # Calculate Hit Rate
    hit_rate = sum((actual_binary == 1) & (predicted_binary == 1)) / sum(actual_binary == 1)
    print("Hit Rate:", hit_rate)

# Main function
def main():
    ratingDataset, userDataset, movieDataset = load_datasets()
    #check_missing_values((ratingDataset, userDataset, movieDataset))

    # Collaborative Filtering
    k = 10
    user_similarities = cosine_similarity(user_item_matrix)
    movie_id_to_name = map_movie_id_to_name(movieDataset)
    user_id = 1
    recommended_movies_CF = recommend_movies_CF(user_id, user_item_matrix, user_similarities, movie_id_to_name)
    print("Recommended movies using Collaborative Filtering for User", user_id, ":")
    for movie_id, predicted_rating in recommended_movies_CF:
        movie_title = movie_id_to_name.get(movie_id, "Unknown")
        print(movie_title, "-", "Predicted Rating:", predicted_rating)
    # Evaluate Collaborative Filtering
    #evaluate_collaborative_filtering(y_test, y_pred)

    # Content-Based Filtering
    movie_descriptions = preprocess_movie_features(movieDataset)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(movie_descriptions)
    cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    recommended_movies_CB = recommend_movies_CB('Toy Story (1995)', cosine_sim_matrix, movieDataset, movie_id_to_name)
    print("\nRecommended movies using Content-Based Filtering for 'Toy Story (1995)':")
    for movie_title, similarity_score in recommended_movies_CB:
        print(movie_title, "-", "Similarity Score:", similarity_score)

if __name__ == "__main__":
    main()


Recommended movies using Collaborative Filtering for User 1 :
Rock, The (1996) - Predicted Rating: 5.0
Delicatessen (1991) - Predicted Rating: 5.000000000000001
Hunt for Red October, The (1990) - Predicted Rating: 5.0
Sabrina (1995) - Predicted Rating: 5.0
Sense and Sensibility (1995) - Predicted Rating: 4.999999999999999

Recommended movies using Content-Based Filtering for 'Toy Story (1995)':
Pyromaniac's Love Story, A (1995) - Similarity Score: 0.36556028709520605
Fear, The (1995) - Similarity Score: 0.2510364915084959
My Family (1995) - Similarity Score: 0.24019518012909064
Show, The (1995) - Similarity Score: 0.234401494688814
Sabrina (1995) - Similarity Score: 0.232498494224088


In [26]:
ratingDataset, userDataset, movieDataset = load_datasets()
check_missing_values((ratingDataset, userDataset, movieDataset))

user id      0
item id      0
rating       0
timestamp    0
dtype: int64
---------------------------
user id       0
age           0
gender        0
occupation    0
zip code      0
dtype: int64
---------------------------
movie id                 0
movie title              0
release date             1
video release date    1682
IMDb URL                 3
unknown cat              0
Action                   0
Adventure                0
Animation                0
 Childrens               0
Comedy                   0
Crime                    0
Documentary              0
Drama                    0
Fantasy                  0
Film-Noir                0
Horror                   0
Musical                  0
Mystery                  0
Romance                  0
Sci-Fi                   0
Thriller                 0
War                      0
Western                  0
dtype: int64
---------------------------


In [39]:
# SVD
merged_data = pd.merge(ratingDataset, userDataset, on='user id')
merged_data = pd.merge(merged_data, movieDataset, left_on='item id', right_on='movie id')
user_item_matrix = pd.pivot_table(merged_data, values='rating', index='user id', columns='item id', fill_value=0)
U, Sigma, Vt = perform_svd(user_item_matrix)

# Convert user_item_matrix DataFrame to NumPy array
user_item_array = user_item_matrix.values

# Get the indices of non-zero ratings
nonzero_indices = np.argwhere(user_item_array > 0)

# Split the indices into training and testing sets
train_indices, test_indices = train_test_split(nonzero_indices, test_size=0.2, random_state=42)

# Create training and testing data arrays
X_train = train_indices[:, 0], train_indices[:, 1]  # user ID, item ID
y_train = user_item_array[train_indices[:, 0], train_indices[:, 1]]  # ratings

X_test = test_indices[:, 0], test_indices[:, 1]  # user ID, item ID
y_test = user_item_array[test_indices[:, 0], test_indices[:, 1]]  # ratings

# Split data into train and test sets
#X_train, X_test, y_train, y_test = split_data(user_item_matrix)

# Reshape input data for LSTM
#X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
#X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [28]:
print("RNN:")

# Build and train the RNN model
model = build_rnn_model(input_shape=(X_train.shape[1], 1))
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Predict ratings using the trained RNN model
y_pred = model.predict(X_test)

# RNN metrics
print("Evaluate RNN:")

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", mse)

# Predicted ratings above a certain threshold are considered positive predictions
threshold = 2.5
positive_predictions = y_pred > threshold

# Actual ratings above a certain threshold are considered relevant items
relevant_items = y_test > threshold

# Calculate Precision
if np.any(positive_predictions):
    precision = sum(positive_predictions & relevant_items) / sum(positive_predictions)
else:
    precision = 0.0
print("Precision:", precision)

# Calculate Recall
if np.any(relevant_items):
    recall = sum(positive_predictions & relevant_items) / sum(relevant_items)
else:
    recall = 0.0
print("Recall:", recall)

# Calculate F1 Score
if np.any(relevant_items) and np.any(positive_predictions):
  f1 = f1_score(relevant_items, positive_predictions)
else:
  f1 = 0.0
print("F1:", f1)

# Calculate Hit Rate
if np.any(relevant_items):
    hit_rate = sum(positive_predictions & relevant_items) / sum(relevant_items)
else:
    hit_rate = 0.0
print("Hit Rate:", hit_rate)

RNN:
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluate RNN:
Mean Squared Error (MSE): 1.800735051760878e-05
Root Mean Squared Error (RMSE): 1.800735051760878e-05
Precision: 0.0
Recall: 0.0
F1: 0.0
Hit Rate: 0.0


In [43]:
import tensorflow as tf
# Implement NCF model
def build_ncf_model(num_users, num_items, latent_dim=64):
    # Input layers
    user_input = tf.keras.layers.Input(shape=(1,))
    item_input = tf.keras.layers.Input(shape=(1,))

    # User embedding layer
    user_embedding = tf.keras.layers.Embedding(input_dim=num_users, output_dim=latent_dim)(user_input)
    user_embedding = tf.keras.layers.Flatten()(user_embedding)

    # Item embedding layer
    item_embedding = tf.keras.layers.Embedding(input_dim=num_items, output_dim=latent_dim)(item_input)
    item_embedding = tf.keras.layers.Flatten()(item_embedding)

    # Concatenate user and item embeddings
    concat = tf.keras.layers.Concatenate()([user_embedding, item_embedding])

    # Fully connected layers
    dense1 = tf.keras.layers.Dense(64, activation='relu')(concat)
    dense2 = tf.keras.layers.Dense(32, activation='relu')(dense1)

    # Output layer
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense2)

    # Create model
    model = tf.keras.Model(inputs=[user_input, item_input], outputs=output)

    # Compile model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Get number of unique users and items
num_users = ratingDataset['user id'].nunique()
num_items = ratingDataset['item id'].nunique()


# Build NCF model
ncf_model = build_ncf_model(num_users, num_items)

# Train NCF model
ncf_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Make predictions using NCF model
predictions = ncf_model.predict(X_test)


# Calculate evaluation metrics for NCF
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
precision = precision_score(y_test > 3, predictions > 0.5)
recall = recall_score(y_test > 3, predictions > 0.5)
f1 = f1_score(y_test > 3, predictions > 0.5)

# Print evaluation metrics for NCF
print("NCF Metrics:")
print("RMSE:", np.sqrt(mse))
print("MAE:", mae)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
NCF Metrics:
RMSE: 2.7762418925665924
MAE: 2.5368223473556366
Precision: 0.5561444186816826
Recall: 1.0
F1 Score: 0.7147722435078756
