In [1]:
import json
import pandas as pd

# personal ratings json file from letterboxd
with open('response_1743040116816.json', 'r', encoding='utf-8') as file:
    ratings_df = pd.DataFrame(json.load(file)['films'])
ratings_df['is_five_star'] = ratings_df['rating'].apply(lambda x: x=='★★★★★')
ratings_df

Unnamed: 0,film,rating,is_five_star
0,an-ostrich-told-me-the-world-is-fake-and-i-thi...,,False
1,gravity-2013,,False
2,black-narcissus,★★★★,False
3,avengers-infinity-war,,False
4,the-apartment,★★★★½,False
...,...,...,...
584,zathura-a-space-adventure,,False
585,the-tree-of-life-2011,★★★★½,False
586,inland-empire,★★★★★,True
587,drive-my-car,★★★★★,True


In [None]:
import requests

# url of api hosted on cloudrun
letterboxd_api_url = 'https://letterboxd-api-....us-east1.run.app/film/{film}/reviews'

def get_reviews(film):
  # print(letterboxd_api_url.format(film=film))
  reviews = requests.get(letterboxd_api_url.format(film=film), allow_redirects=True).text
  return reviews

async def get_reviews_async(df, film):
  async with aiohttp.ClientSession() as session:
    resp = await session.get(letterboxd_api_url.format(film=film), allow_redirects=True)
    df.loc[df['film'] == film, 'reviews'] = resp

# await asyncio.gather(*[get_reviews_async(ratings_df, film) for film in ratings_df['film']])
ratings_df['reviews'] = ratings_df['film'].apply(get_reviews)

In [3]:
ratings_df.to_csv('ratings-with-reviews.csv')

In [7]:
# saved to file and reloaded to prevent any data loss
ratings_df = pd.read_csv('ratings-with-reviews.csv')
ratings_df['response'] = ratings_df['reviews']

def get_reviews(x):
    try:
        return json.loads(x)['reviews']
    except:
        return None
    
ratings_df['reviews'] = ratings_df['reviews'].apply(get_reviews)

In [8]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [10]:
from transformers import LongformerTokenizer, LongformerModel

model_name = "allenai/longformer-base-4096" 
tokenizer = LongformerTokenizer.from_pretrained(model_name)
model = LongformerModel.from_pretrained(model_name).cuda()

def get_embeddings(reviews):
    input_ids = tokenizer('<sep>'.join(reviews[:10]), return_tensors="pt", padding=True).input_ids.cuda()
    with torch.no_grad():
      return model(input_ids).last_hidden_state
        
ratings_df['lf_embeddings'] = ratings_df['reviews'].dropna().apply(get_embeddings)

In [None]:
# alternate approach: encode each review as a sentence then take avg/max of the embedding element wise

In [None]:
!pip install -U sentence-transformers -q

from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer('all-mpnet-base-v2')

In [33]:
def encode_reviews(reviews):
    return sentence_model.encode(reviews[:10])

ratings_df['embeddings'] = ratings_df['reviews'].dropna().apply(encode_reviews)

In [66]:
import numpy as np

ratings_df['avg_embedding'] = ratings_df['embeddings'].dropna().apply(lambda x: np.mean(x, axis=0))

In [None]:
ratings_df['embedding'] = ratings_df['avg_embedding'].dropna().apply(lambda x: np.array(x))
s = ratings_df['embedding'].apply(lambda x: not isinstance(x, float))
ratings_df = ratings_df[s]

In [19]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(ratings_df[['lf_embeddings', 'is_five_star', 'film']], test_size=0.2)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class MovieDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.embeddings = torch.tensor(df['lf_embeddings'].to_list()).float()
        def helper(val):
            return [1, 0] if val else [0, 1]
        self.ratings = torch.tensor(ratings_df['is_five_star'].apply(helper).to_list()).float()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.ratings[idx]

# Create dataset instances
train_dataset = MovieDataset(train)
test_dataset = MovieDataset(test)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
print(torch.tensor(ratings_df['embedding'].to_list()).float())
torch.tensor(ratings_df['is_five_star'].apply(lambda x: [x]).to_list()).float()

In [152]:
from torch import nn

class NeuralNetwork(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(embed_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 2)
        )

    def forward(self, x):
        logits = self.layers(x)
        return logits

model = NeuralNetwork(sentence_model.get_sentence_embedding_dimension()).to(device)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=768, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=2, bias=True)
  )
)


In [153]:
import torch.optim as optim

# use weightings to 
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([20, 1]).cuda())
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [154]:
train_losses = []

num_epochs = 100
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        if torch.cuda.is_available():
            inputs, targets = inputs.cuda(), targets.cuda()
            model.cuda()
        else:
            model.cpu()
            
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'epoch [{epoch+1}/{num_epochs}], loss: {loss.item():.4f}')

epoch [1/100], loss: 1.0210
epoch [2/100], loss: 1.6913
epoch [3/100], loss: 1.6805
epoch [4/100], loss: 2.3304
epoch [5/100], loss: 1.3322
epoch [6/100], loss: 1.0013
epoch [7/100], loss: 0.9947
epoch [8/100], loss: 2.2623
epoch [9/100], loss: 0.6691
epoch [10/100], loss: 0.9742
epoch [11/100], loss: 0.6604
epoch [12/100], loss: 1.2614
epoch [13/100], loss: 1.2585
epoch [14/100], loss: 0.6493
epoch [15/100], loss: 0.6412
epoch [16/100], loss: 0.6347
epoch [17/100], loss: 0.6289
epoch [18/100], loss: 1.4501
epoch [19/100], loss: 1.1735
epoch [20/100], loss: 1.1395
epoch [21/100], loss: 0.6163
epoch [22/100], loss: 0.8597
epoch [23/100], loss: 0.8650
epoch [24/100], loss: 1.3509
epoch [25/100], loss: 1.8662
epoch [26/100], loss: 0.8409
epoch [27/100], loss: 1.3325
epoch [28/100], loss: 0.6060
epoch [29/100], loss: 0.8336
epoch [30/100], loss: 0.8319
epoch [31/100], loss: 1.5802
epoch [32/100], loss: 1.0709
epoch [33/100], loss: 1.3166
epoch [34/100], loss: 0.5884
epoch [35/100], loss: 1

In [None]:
with torch.no_grad():
    for inputs, targets in test_loader:
        if torch.cuda.is_available():
            inputs, targets = inputs.cuda(), targets.cuda()
            model.cuda()
        else:
            model.cpu()
            
        outputs = model(inputs)
        predicted = (outputs > 0.5).float()
        print(outputs)
        print(predicted)
        print(f'predicted: {predicted.item()}, actual: {targets.item()}')

findings

tried two main tasks both based on reviews left by other letterboxd users as input and my rating as the label:
1. linear regression to predict score, but the average prediction was equal to the actual average of my ratings (the model rated almost all films around 4.25, with very little variance)
2. since that didn't work out great, I decided to try binary classification, predicting whether I rated a movie five stars, using BCE with logits loss, weighted because there were many more non five star ratings than five star--still not great, the model struggled to converge anywhere

this project will need a lot more work before coming up with anything great, honestly it's tricky to tell what my five star ratinsg would be based on the reviews. I should find a task more suited toward the data, maybe move away from simple regression/classification problems and toward more specific recommender systems--would probably have more use for me anyway