In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/alx-movie-recommendation-project-2024/sample_submission.csv
/kaggle/input/alx-movie-recommendation-project-2024/movies.csv
/kaggle/input/alx-movie-recommendation-project-2024/imdb_data.csv
/kaggle/input/alx-movie-recommendation-project-2024/genome_tags.csv
/kaggle/input/alx-movie-recommendation-project-2024/genome_scores.csv
/kaggle/input/alx-movie-recommendation-project-2024/train.csv
/kaggle/input/alx-movie-recommendation-project-2024/test.csv
/kaggle/input/alx-movie-recommendation-project-2024/tags.csv
/kaggle/input/alx-movie-recommendation-project-2024/links.csv


<h3>Load Datasets</h3>

In [2]:

train = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2024/train.csv')
test = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2024/test.csv')
submission = pd.read_csv('/kaggle/input/alx-movie-recommendation-project-2024/sample_submission.csv')

<h3>Exploratory Data Analysis </h3>

In [3]:

print(train.head())
print(test.head())

   userId  movieId  rating   timestamp
0    5163    57669     4.0  1518349992
1  106343        5     4.5  1206238739
2  146790     5459     5.0  1076215539
3  106362    32296     2.0  1423042565
4    9041      366     3.0   833375837
   userId  movieId
0       1     2011
1       1     4144
2       1     5767
3       1     6711
4       1     7318


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000038 entries, 0 to 10000037
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 305.2 MB


In [5]:
train.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

<h3>Collaborative Filtering</h3>

In [6]:
from surprise import SVD,Dataset,Reader,accuracy
from tqdm import tqdm

In [7]:
#train_df
reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(train[['userId','movieId','rating']],reader)
train_set = train_data.build_full_trainset()

In [8]:
svd = SVD()
svd.fit(train_set)

# test_user_pred_pairs = list(zip(test['userId'],test['movieId']))
# test_predict = [svd.predict(uid,mid).est for uid,mid in test_user_pred_pairs]

# test['predicted_rating'] = test_predict
# print(test.head())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7d1900889ed0>

In [9]:
def predict_in_batches(df, algo, batch_size=100000):
    import pandas as pd
    from tqdm import tqdm

    num_rows = len(df)
    num_batches = (num_rows // batch_size) + 1
    start_index = 0

    # List to store predictions
    predictions_list = []

    for batch_num in tqdm(range(num_batches), desc="Processing batches"):
        end_index = min(start_index + batch_size, num_rows)
        batch_df = df.iloc[start_index:end_index].copy()  # Explicitly make a copy

        # Generate predictions for the batch
        batch_predictions = [algo.predict(user, movie).est for user, movie in zip(batch_df['userId'], batch_df['movieId'])]

        # Add batch predictions to DataFrame
        batch_df['rating'] = batch_predictions  # Changed from 'predict_ratings' to 'ratings'

        # Append the batch DataFrame to the list
        predictions_list.append(batch_df)

        # Update the start index for the next batch
        start_index = end_index

        # Save intermediate submission file
        submission_batch_df = batch_df[['userId', 'movieId', 'rating']].copy()  # Changed from 'predict_ratings' to 'ratings'
        submission_batch_df['Id'] = submission_batch_df['userId'].astype(str) + '_' + submission_batch_df['movieId'].astype(str)
        submission_batch_df = submission_batch_df[['Id', 'rating']]  # Changed from 'predict_ratings' to 'ratings'
        submission_batch_df.to_csv('submission.csv', mode='a', index=False, header=batch_num == 0)
        print(f"Updated submission.csv with batch {batch_num + 1}/{num_batches}")

    # Concatenate all batches into a single DataFrame
    predictions_df = pd.concat(predictions_list, ignore_index=True)

    return predictions_df




In [10]:
# test_subset = test.sample(n=1000,random_state=42)
predictions_df = predict_in_batches(test,svd)
predictions_df.head()

Processing batches:   2%|▏         | 1/51 [00:00<00:44,  1.11it/s]

Updated submission.csv with batch 1/51


Processing batches:   4%|▍         | 2/51 [00:01<00:43,  1.12it/s]

Updated submission.csv with batch 2/51


Processing batches:   6%|▌         | 3/51 [00:02<00:43,  1.11it/s]

Updated submission.csv with batch 3/51


Processing batches:   8%|▊         | 4/51 [00:03<00:42,  1.11it/s]

Updated submission.csv with batch 4/51


Processing batches:  10%|▉         | 5/51 [00:04<00:41,  1.11it/s]

Updated submission.csv with batch 5/51


Processing batches:  12%|█▏        | 6/51 [00:05<00:40,  1.10it/s]

Updated submission.csv with batch 6/51


Processing batches:  14%|█▎        | 7/51 [00:06<00:39,  1.11it/s]

Updated submission.csv with batch 7/51


Processing batches:  16%|█▌        | 8/51 [00:07<00:38,  1.11it/s]

Updated submission.csv with batch 8/51


Processing batches:  18%|█▊        | 9/51 [00:08<00:37,  1.11it/s]

Updated submission.csv with batch 9/51


Processing batches:  20%|█▉        | 10/51 [00:09<00:36,  1.11it/s]

Updated submission.csv with batch 10/51


Processing batches:  22%|██▏       | 11/51 [00:09<00:35,  1.11it/s]

Updated submission.csv with batch 11/51


Processing batches:  24%|██▎       | 12/51 [00:10<00:34,  1.12it/s]

Updated submission.csv with batch 12/51


Processing batches:  25%|██▌       | 13/51 [00:11<00:33,  1.12it/s]

Updated submission.csv with batch 13/51


Processing batches:  27%|██▋       | 14/51 [00:12<00:33,  1.12it/s]

Updated submission.csv with batch 14/51


Processing batches:  29%|██▉       | 15/51 [00:13<00:32,  1.12it/s]

Updated submission.csv with batch 15/51


Processing batches:  31%|███▏      | 16/51 [00:14<00:31,  1.12it/s]

Updated submission.csv with batch 16/51


Processing batches:  33%|███▎      | 17/51 [00:15<00:30,  1.11it/s]

Updated submission.csv with batch 17/51


Processing batches:  35%|███▌      | 18/51 [00:16<00:29,  1.11it/s]

Updated submission.csv with batch 18/51


Processing batches:  37%|███▋      | 19/51 [00:17<00:28,  1.11it/s]

Updated submission.csv with batch 19/51


Processing batches:  39%|███▉      | 20/51 [00:17<00:27,  1.12it/s]

Updated submission.csv with batch 20/51


Processing batches:  41%|████      | 21/51 [00:18<00:26,  1.12it/s]

Updated submission.csv with batch 21/51


Processing batches:  43%|████▎     | 22/51 [00:19<00:26,  1.08it/s]

Updated submission.csv with batch 22/51


Processing batches:  45%|████▌     | 23/51 [00:20<00:25,  1.09it/s]

Updated submission.csv with batch 23/51


Processing batches:  47%|████▋     | 24/51 [00:21<00:24,  1.10it/s]

Updated submission.csv with batch 24/51


Processing batches:  49%|████▉     | 25/51 [00:22<00:23,  1.11it/s]

Updated submission.csv with batch 25/51


Processing batches:  51%|█████     | 26/51 [00:23<00:22,  1.11it/s]

Updated submission.csv with batch 26/51


Processing batches:  53%|█████▎    | 27/51 [00:24<00:21,  1.11it/s]

Updated submission.csv with batch 27/51


Processing batches:  55%|█████▍    | 28/51 [00:25<00:20,  1.10it/s]

Updated submission.csv with batch 28/51


Processing batches:  57%|█████▋    | 29/51 [00:26<00:19,  1.10it/s]

Updated submission.csv with batch 29/51


Processing batches:  59%|█████▉    | 30/51 [00:27<00:18,  1.11it/s]

Updated submission.csv with batch 30/51


Processing batches:  61%|██████    | 31/51 [00:27<00:18,  1.10it/s]

Updated submission.csv with batch 31/51


Processing batches:  63%|██████▎   | 32/51 [00:28<00:17,  1.09it/s]

Updated submission.csv with batch 32/51


Processing batches:  65%|██████▍   | 33/51 [00:29<00:16,  1.09it/s]

Updated submission.csv with batch 33/51


Processing batches:  67%|██████▋   | 34/51 [00:30<00:15,  1.09it/s]

Updated submission.csv with batch 34/51


Processing batches:  69%|██████▊   | 35/51 [00:31<00:14,  1.07it/s]

Updated submission.csv with batch 35/51


Processing batches:  71%|███████   | 36/51 [00:32<00:13,  1.08it/s]

Updated submission.csv with batch 36/51


Processing batches:  73%|███████▎  | 37/51 [00:33<00:12,  1.08it/s]

Updated submission.csv with batch 37/51


Processing batches:  75%|███████▍  | 38/51 [00:34<00:11,  1.08it/s]

Updated submission.csv with batch 38/51


Processing batches:  76%|███████▋  | 39/51 [00:35<00:11,  1.08it/s]

Updated submission.csv with batch 39/51


Processing batches:  78%|███████▊  | 40/51 [00:36<00:10,  1.09it/s]

Updated submission.csv with batch 40/51


Processing batches:  80%|████████  | 41/51 [00:37<00:09,  1.08it/s]

Updated submission.csv with batch 41/51


Processing batches:  82%|████████▏ | 42/51 [00:38<00:08,  1.08it/s]

Updated submission.csv with batch 42/51


Processing batches:  84%|████████▍ | 43/51 [00:39<00:07,  1.09it/s]

Updated submission.csv with batch 43/51


Processing batches:  86%|████████▋ | 44/51 [00:39<00:06,  1.09it/s]

Updated submission.csv with batch 44/51


Processing batches:  88%|████████▊ | 45/51 [00:40<00:05,  1.09it/s]

Updated submission.csv with batch 45/51


Processing batches:  90%|█████████ | 46/51 [00:41<00:04,  1.10it/s]

Updated submission.csv with batch 46/51


Processing batches:  92%|█████████▏| 47/51 [00:42<00:03,  1.10it/s]

Updated submission.csv with batch 47/51


Processing batches:  94%|█████████▍| 48/51 [00:43<00:02,  1.10it/s]

Updated submission.csv with batch 48/51


Processing batches:  96%|█████████▌| 49/51 [00:44<00:01,  1.09it/s]

Updated submission.csv with batch 49/51


Processing batches: 100%|██████████| 51/51 [00:45<00:00,  1.12it/s]

Updated submission.csv with batch 50/51
Updated submission.csv with batch 51/51





Unnamed: 0,userId,movieId,rating
0,1,2011,3.613644
1,1,4144,4.093794
2,1,5767,3.530623
3,1,6711,3.808168
4,1,7318,3.326429
