In [None]:
import pandas as pd
!pip install scikit-surprise
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

df = pd.read_csv('train.csv')

# Quick visual sanity check of the first rows.
print(df.head(5))

# Ratings are declared to Surprise as living on a 0..10 scale.
# NOTE(review): the preview shows rating == 0 on rows with watched_episodes == 0,
# so 0 presumably means "not scored" rather than a genuine lowest score --
# confirm, and consider dropping those rows before building the dataset.
reader = Reader(rating_scale=(0, 10))

# Surprise expects exactly three columns, in the order user, item, rating.
ratings = df.loc[:, ['user_id', 'anime_id', 'rating']]
data = Dataset.load_from_df(ratings, reader)

   Unnamed: 0  user_id  anime_id  rating  watching_status  watched_episodes
0           0    20807      4282      10                2                 1
1           1    20807      5665       0                6                 0
2           2    20807     36127       0                6                 0
3           3    20807     38163       0                6                 0
4           4    20807      1284       0                6                 0


In [None]:
from surprise.model_selection import GridSearchCV

# Hyper-parameter grid for SVD: 3 * 4 * 2 * 2 = 48 combinations, each
# cross-validated on 3 folds.
# NOTE(review): the recorded best combination lies on the edge of this grid in
# every dimension (fewest factors, most epochs, largest lr, smallest reg), so
# the true optimum is probably outside it -- consider widening the ranges.
param_grid = dict(
    n_factors=[50, 100, 150],
    n_epochs=[10, 20, 30, 40],
    lr_all=[0.0001, 0.00005],
    reg_all=[0.001, 0.0001],
)
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse'],
    cv=3,
)
gs.fit(data)
print(gs.best_params['rmse'])  # Parameter combination with the best average RMSE
print(gs.best_score['rmse'])   # Best average RMSE achieved

{'n_factors': 50, 'n_epochs': 40, 'lr_all': 0.0001, 'reg_all': 0.0001}
3.407479937408079


In [None]:
# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split
# reproducible between runs.
trainset, testset = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# SVD hyper-parameters, hard-coded to the best combination reported by the
# grid search. Note that lr_all and reg_all are both far *below* Surprise's
# defaults (0.005 and 0.02 respectively).
svd_params = {
    'n_factors': 50,       # number of latent factors
    'n_epochs': 40,        # SGD passes over the training ratings
    'lr_all': 0.0001,      # learning rate shared by all parameters
    'reg_all': 0.0001,     # regularization term shared by all parameters
    'random_state': 42,    # fixed seed for reproducible initialisation
}
model = SVD(**svd_params)

# Fit on the training split; fit() returns the estimator, so leaving it as the
# last expression echoes the model's repr as the cell output.
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e1ef09e0710>

In [None]:
# Score every held-out (user, item) pair with the fitted model.
predictions = model.test(testset)

# accuracy.rmse() prints the score itself by default; silence that so the
# value is reported exactly once, at full precision, by the print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"RMSE: {rmse}")

RMSE: 3.3569
RMSE: 3.3569380700575038


In [None]:
# Distinct raw user and anime ids, in order of first appearance in the table.
unique_users = pd.unique(df['user_id'])
all_anime_ids = pd.unique(df['anime_id'])

# Step 5: Function to get top 5 anime for a user
def get_top_n_recommendations(model, user_id, anime_ids, trainset, n=5):
    """Return the ``n`` highest-predicted anime a user has not rated yet.

    Args:
        model: Fitted estimator exposing ``predict(raw_user_id, raw_item_id)``;
            each prediction must carry ``est`` (predicted rating) and ``iid``
            (raw item id) attributes.
        user_id: Raw user id, as it appears in the input data.
        anime_ids: Iterable of raw anime ids to consider as candidates.
        trainset: Surprise trainset the model was fitted on; consulted to find
            the anime the user has already rated.
        n: Maximum number of recommendations to return.

    Returns:
        A space-separated string of up to ``n`` raw anime ids, ordered from
        highest to lowest predicted rating. Empty string if there are no
        candidates.
    """
    try:
        inner_uid = trainset.to_inner_uid(user_id)
    except ValueError:
        # The user has no ratings in this trainset (e.g. all of them landed in
        # the hold-out split). Nothing to exclude: rank every candidate
        # instead of aborting the whole recommendation run.
        rated_anime = set()
    else:
        # trainset.ur stores *inner* item ids, whereas anime_ids are raw ids.
        # Translate back to raw ids so the exclusion below compares like with
        # like; otherwise already-rated anime leak into the recommendations.
        rated_anime = {
            trainset.to_raw_iid(inner_iid)
            for inner_iid, _ in trainset.ur[inner_uid]
        }

    # Predict a rating for every candidate the user has not rated yet.
    predictions = [
        model.predict(user_id, anime_id)
        for anime_id in anime_ids
        if anime_id not in rated_anime
    ]

    # Highest predicted rating first; keep only the first n.
    top_n = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:n]
    return " ".join(str(pred.iid) for pred in top_n)

In [None]:
# One [user_id, "id1 id2 id3 id4 id5"] row per distinct user.
recommendations = [
    [user_id, get_top_n_recommendations(model, user_id, all_anime_ids, trainset, n=5)]
    for user_id in unique_users
]

In [None]:
# Tabulate the per-user recommendations and write them out without the
# DataFrame index column.
output_path = 'top_5_anime_recommendations.csv'
rec_df = pd.DataFrame(recommendations, columns=['user_id', 'anime_ids'])
rec_df.to_csv(output_path, index=False)
print(f"Recommendations saved to '{output_path}'")

Recommendations saved to 'top_5_anime_recommendations.csv'
