In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Run on the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 1. Load Data: rated training rows, unrated test rows, and the movie catalogue
train = pd.read_csv('data/train_ratings.csv')
test = pd.read_csv('data/test_set_no_ratings.csv')
movies = pd.read_csv('data/movies.csv')

# 2. Pre-processing: split the pipe-delimited genre string into a list per movie,
#    then expand those lists into one 0/1 indicator column per distinct genre
movies['genres'] = movies['genres'].str.split('|')
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movies['genres'])
# Index the indicator matrix by movieId so it can be joined onto the rating rows
genres_df = pd.DataFrame(
    data=genres_encoded,
    index=movies['movieId'],
    columns=mlb.classes_,
)

# Attach the genre indicators to both frames.
# NOTE: this is pandas' default inner join, so a row whose movieId is missing
# from the movie catalogue would be dropped from the result.
train = pd.merge(train, genres_df, left_on='movieId', right_index=True)
test = pd.merge(test, genres_df, left_on='movieId', right_index=True)


# 3. Encode users as contiguous integer indices.
#    The encoder is fitted on the training users only; the test users are
#    mapped through that same fitted encoder.
user_enc = LabelEncoder()
train['user'] = user_enc.fit_transform(train['userId'])
test['user'] = user_enc.transform(test['userId'])

In [2]:
# Fit the movie encoder on every movieId in the catalogue (not just the ids
# that occur in train), then add the encoded 'movie' column to both frames
all_movies = movies['movieId'].unique().tolist()
movie_enc = LabelEncoder().fit(all_movies)
for frame in (train, test):
    frame['movie'] = movie_enc.transform(frame['movieId'])

In [3]:
# Display the test frame after the genre merge and the user/movie index encoding
test

Unnamed: 0,Id,userId,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,user,movie
0,0,432,77866,0,1,1,0,0,0,0,...,0,0,0,1,0,0,1,0,431,7333
1,1,288,474,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,287,412
196,196,285,474,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,284,412
539,539,599,474,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,598,412
1789,1789,447,474,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,446,412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20147,20147,509,103042,0,1,1,0,0,0,0,...,1,0,0,0,1,0,0,0,508,8183
20153,20153,212,140715,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,211,9024
20158,20158,522,27006,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,521,5608
20160,20160,599,229,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,598,195


In [4]:
# 4. Split the data
# Model inputs: the two encoded ids followed by one indicator column per genre
feature_columns = ['user', 'movie', *mlb.classes_.tolist()]
X = train[feature_columns]
y = train['rating']

# Hold out 20% of the rated rows for validation; the fixed seed keeps the split stable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert to tensors, created directly on the target device:
# features as int64 (the id columns feed embedding lookups), ratings as float32
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.long, device=device)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32, device=device)
X_val_tensor = torch.tensor(X_val.to_numpy(), dtype=torch.long, device=device)
y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32, device=device)

# 5. Build the Model
class Recommender(nn.Module):
    """Rating regressor that combines user/movie embeddings with genre flags.

    Each input row is laid out as
    ``[user_index, movie_index, genre_0, ..., genre_{num_genres - 1}]``.
    The two indices are looked up in separate embedding tables, concatenated
    with the genre flags, and fed through a small MLP that emits one scalar
    per row.

    Args:
        num_users: Number of rows in the user embedding table.
        num_movies: Number of rows in the movie embedding table.
        num_genres: Number of genre columns that follow the two index columns.
        emb_size: Width of each embedding vector.
    """

    def __init__(self, num_users, num_movies, num_genres, emb_size):
        super().__init__()

        self.user_embedding = nn.Embedding(num_users, emb_size)
        self.movie_embedding = nn.Embedding(num_movies, emb_size)
        # MLP input = user embedding + movie embedding + raw genre flags
        self.fc = nn.Sequential(
            nn.Linear(emb_size * 2 + num_genres, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Predict one value per input row.

        Args:
            x: Integer tensor of shape ``(batch, 2 + num_genres)``; column 0
                is the user index, column 1 the movie index, and the
                remaining columns are the genre flags.

        Returns:
            Float tensor of shape ``(batch,)``, including when ``batch == 1``.
        """
        user_input = x[:, 0]
        movie_input = x[:, 1]
        # Genre flags arrive in the integer input tensor; the linear layers need floats
        genres_input = x[:, 2:].float()

        user_emb = self.user_embedding(user_input)
        movie_emb = self.movie_embedding(movie_input)

        concat = torch.cat([user_emb, movie_emb, genres_input], dim=1)
        out = self.fc(concat)

        # Drop only the trailing feature axis. A bare squeeze() would also
        # remove the batch axis for a single-row batch and return a 0-d tensor,
        # which no longer lines up with a (1,)-shaped target.
        return out.squeeze(-1)

# Size the embedding tables from the fitted encoders: one row per known class
num_users = len(user_enc.classes_)
num_movies = len(movie_enc.classes_)

# 6. Train the Model
# Seed torch's RNG before the model is built so that weight initialisation and
# the dropout masks start from a fixed state on every run (the train/validation
# split above is already seeded with the same value).
torch.manual_seed(42)

# 15-dimensional embeddings, mean-squared-error objective, Adam optimiser
model = Recommender(num_users, num_movies, len(mlb.classes_), 15).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [5]:
# Training loop: full-batch updates, i.e. each "epoch" is a single optimiser
# step computed over the whole training tensor
epochs = 750
for epoch in range(epochs):
    # Forward pass in training mode (dropout active)
    model.train()
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)

    # Clear gradients left over from the previous step, backpropagate, update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Validate: switch to eval mode (dropout off) and skip gradient tracking
    model.eval()
    with torch.no_grad():
        val_predictions = model(X_val_tensor)
        val_loss = criterion(val_predictions, y_val_tensor)

    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}")

# 7. Evaluate: RMSE on the validation split, with the predictions clipped to
#    the 0.5 to 5.0 range before scoring
model.eval()
with torch.no_grad():
    val_predictions = model(X_val_tensor)
    val_predictions_clipped = val_predictions.clamp(min=0.5, max=5.0)
    val_mse = mean_squared_error(y_val_tensor.cpu(), val_predictions_clipped.cpu())
    rmse = np.sqrt(val_mse)
    print(f"Validation RMSE: {rmse}")

Epoch 1/750, Training Loss: 13.077527046203613, Validation Loss: 12.596870422363281
Epoch 2/750, Training Loss: 12.717727661132812, Validation Loss: 12.25339412689209
Epoch 3/750, Training Loss: 12.36933708190918, Validation Loss: 11.924880027770996
Epoch 4/750, Training Loss: 12.03852653503418, Validation Loss: 11.609776496887207
Epoch 5/750, Training Loss: 11.718239784240723, Validation Loss: 11.305730819702148
Epoch 6/750, Training Loss: 11.406681060791016, Validation Loss: 11.010274887084961
Epoch 7/750, Training Loss: 11.09707260131836, Validation Loss: 10.721116065979004
Epoch 8/750, Training Loss: 10.79733943939209, Validation Loss: 10.435470581054688
Epoch 9/750, Training Loss: 10.502847671508789, Validation Loss: 10.15091609954834
Epoch 10/750, Training Loss: 10.206491470336914, Validation Loss: 9.86532211303711
Epoch 11/750, Training Loss: 9.91608715057373, Validation Loss: 9.577055931091309
Epoch 12/750, Training Loss: 9.61827278137207, Validation Loss: 9.284603118896484
Epo

KeyboardInterrupt: 

In [None]:
# Mean of every rating in the (merged) training frame
global_avg_rating = train['rating'].mean()

# Mean rating per user, as a plain dict keyed by userId
ratings_by_user = train.groupby('userId')['rating']
user_avg_ratings = ratings_by_user.mean().to_dict()


In [None]:
# Build the test features with the same column layout used for training
# (encoded user, encoded movie, then the genre indicators) and place the
# resulting int64 tensor on the target device
test_feature_columns = ['user', 'movie', *mlb.classes_.tolist()]
X_test = test[test_feature_columns]
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.long, device=device)

In [None]:
# Check the test tensor's dimensions: one row per test example, one column per model input feature
X_test_tensor.shape

torch.Size([20168, 22])

In [None]:
# Predict for the test set
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)
    # Clip the predictions between 0.5 and 5.0
    test_predictions_clipped = torch.clamp(test_predictions, 0.5, 5.0)


In [None]:
# Pair each test Id with its clipped prediction (moved back to the CPU as a
# NumPy array) and write the two-column submission file without the index
submission_ids = test['Id']
predicted_ratings = test_predictions_clipped.cpu().numpy()
submission_df = pd.DataFrame({'Id': submission_ids, 'rating': predicted_ratings})
submission_df.to_csv('submission_custom_2.csv', index=False)
