Install packages

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer, LabelEncoder


prepare/process data
 - normalize numerical col
 - create one hot encoding
 - create feature matrix

In [None]:
# Load the movie table and the per-user rating table.
data = pd.read_csv("cleaned_data.csv")
user_data = pd.read_csv("user_data.csv")

# Convert 'startYear' to numeric; unparseable entries become NaN and are
# imputed together with the other numerical columns below.
data['startYear'] = pd.to_numeric(data['startYear'], errors='coerce')

# Fill missing values in numerical columns with the column median.
# Assign the result back instead of calling fillna(inplace=True) on
# data[col]: the chained in-place form operates on an intermediate object
# and is deprecated by pandas.
numerical_cols = ['startYear', 'averageRating', 'numVotes']
for col in numerical_cols:
    if data[col].isnull().sum() > 0:
        data[col] = data[col].fillna(data[col].median())

# For categorical columns, fill missing values with the 'Unknown' placeholder.
categorical_cols = ['titleType', 'genres', 'directorNames', 'writerNames', 'isAdult']
for col in categorical_cols:
    data[col] = data[col].fillna('Unknown')

# Keep only titles whose start year is after 1990.
# The original row index is preserved, so it is no longer contiguous.
data = data[data['startYear'] > 1990]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna('Unknown', inplace=True)


In [None]:
# movie data
import ast

# Scale the numerical columns to [0, 1].
scaler = MinMaxScaler()

# Select numerical columns
numerical_cols = ['startYear', 'averageRating', 'numVotes']

# Work on an explicit copy so the assignments below do not target a slice
# of the previously filtered frame.
data = data.copy()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

print("Numerical columns normalized successfully!")

# One-Hot Encode 'titleType' and 'isAdult'
data = pd.get_dummies(data, columns=['titleType', 'isAdult'], prefix=['titleType', 'isAdult'])

# 0 and 1 assignment for titleType_movie, isAdult_0, isAdult_1
data[["titleType_movie", "isAdult_0", "isAdult_1"]] = data[["titleType_movie", "isAdult_0", "isAdult_1"]].astype(int)

# Genre multi-hot encoding.
# SECURITY: 'genres' comes from a CSV file, so it is parsed with
# ast.literal_eval (literals only) rather than eval, which would execute
# arbitrary expressions found in the data.
data['genre_list'] = data['genres'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# mlb encoder
multi_label_encoder = MultiLabelBinarizer()
genres_encoded = multi_label_encoder.fit_transform(data['genre_list'])
# Reuse data's index: the encoded array is positional, and without an
# explicit index pd.concat would align a fresh 0..n-1 RangeIndex against
# data's own labels, attaching genre flags to the wrong rows and adding
# NaN-filled rows.
genres_encoded_df = pd.DataFrame(
    genres_encoded,
    columns=multi_label_encoder.classes_,
    index=data.index,
)
data = pd.concat([data, genres_encoded_df], axis=1)

# Label-encode directors, writers and titles as integer ids.
# NOTE: the same LabelEncoder instance is refit for each column, so after
# this cell it only holds the mapping of the last column ('primaryTitle').
label_encoder = LabelEncoder()
data['directorNames'] = label_encoder.fit_transform(data['directorNames'])

# writer label encoding
data['writerNames'] = label_encoder.fit_transform(data['writerNames'])

# movie_id using label encoding
data["primaryTitle"] = label_encoder.fit_transform(data["primaryTitle"])


data.head()





Numerical columns normalized successfully!


Unnamed: 0,tconst,primaryTitle,startYear,genres,directorNames,writerNames,averageRating,numVotes,titleType_movie,isAdult_0,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
2595,tt0015414,86996,0.272727,[],74346,141434,0.488889,4e-06,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2685,tt0015724,38456,0.060606,"['Drama', 'Mystery', 'Romance']",31896,40258,0.588889,9e-06,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14035,tt0035423,81539,0.30303,"['Comedy', 'Fantasy', 'Romance']",44598,125169,0.6,0.030575,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15926,tt0038086,137074,0.060606,['Thriller'],39191,49947,0.666667,7e-06,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33106,tt0062336,167731,0.878788,['Drama'],86031,108933,0.611111,6.4e-05,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
#user_data
import ast

user_numerical_cols = ['UserRating']

# NOTE(review): fit_transform refits the shared `scaler` and
# `multi_label_encoder` objects on the user table, so afterwards they no
# longer hold the fit obtained on the movie table. Confirm nothing later
# needs the movie fit (e.g. for inverse_transform).
user_data[user_numerical_cols] = scaler.fit_transform(user_data[user_numerical_cols])

# SECURITY: 'FavoriteGenres' comes from a CSV file, so it is parsed with
# ast.literal_eval (literals only) rather than eval.
user_data['user_genre_list'] = user_data['FavoriteGenres'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
user_genres_encoded = multi_label_encoder.fit_transform(user_data['user_genre_list'])
# Reuse user_data's index so the concat below stays row-aligned even if
# user_data does not carry a default 0..n-1 index.
user_genres_encoded_df = pd.DataFrame(
    user_genres_encoded,
    columns=multi_label_encoder.classes_,
    index=user_data.index,
)
user_data = pd.concat([user_data, user_genres_encoded_df], axis=1)


# director label encoding
# NOTE: one LabelEncoder instance is refit for every column below, so it
# ends up holding only the mapping of the last column ('UserID').
label_encoder = LabelEncoder()
user_data['FavoriteDirectors'] = label_encoder.fit_transform(user_data['FavoriteDirectors'])

# actor label encoding
user_data['FavoriteActors'] = label_encoder.fit_transform(user_data['FavoriteActors'])

# movie_id label encoding
user_data["primaryTitle"] = label_encoder.fit_transform(user_data["primaryTitle"])

# user_id label encoding
user_data["UserID"] = label_encoder.fit_transform(user_data["UserID"])


user_data.head()

Unnamed: 0,UserID,tconst,primaryTitle,UserRating,FavoriteGenres,FavoriteActors,FavoriteDirectors,user_genre_list,Action,Adult,...,Mystery,News,Reality-TV,Romance,Sci-Fi,Sport,Talk-Show,Thriller,War,Western
0,0,tt0000009,149138,0.0,['Romance'],149,19868,[Romance],0,0,...,0,0,0,1,0,0,0,0,0,0
1,111111,tt0000147,224595,0.444444,"['News', 'Documentary', 'Sport']",49,46569,"[News, Documentary, Sport]",0,0,...,0,1,0,0,0,1,0,0,0,0
2,222222,tt0000502,35350,0.888889,[],299,105924,[],0,0,...,0,0,0,0,0,0,0,0,0,0
3,252417,tt0000574,247271,0.777778,['Action'],249,33864,[Action],1,0,...,0,0,0,0,0,0,0,0,0,0
4,263528,tt0000591,242575,0.888889,['Drama'],144,89998,[Drama],0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Every column of `data` that is not one of the known non-genre fields is
# treated as a one-hot genre flag.
non_genre_cols = {
    'tconst', 'primaryTitle', 'startYear', 'genres', 'directorNames',
    'writerNames', 'averageRating', 'numVotes', 'titleType_movie',
    'isAdult_0', 'isAdult_1', 'genre_list',
}
genre_list = [col for col in data.columns if col not in non_genre_cols]

# Per-movie features used downstream (independent copy of the selected columns).
movie_features = data[['tconst', 'directorNames', 'averageRating', 'numVotes']].copy()

# Collapse the genre flag columns into one list of 0/1 ints per movie.
# A flag is 1 only where the column equals 1; anything else (including
# missing values) becomes 0. Done column-wise instead of a Python-level
# apply over every row.
movie_features['genres_list'] = pd.Series(
    data[genre_list].eq(1).astype(int).to_numpy().tolist(),
    index=data.index,
)

# Per-rating user features: which user rated which title, and the rating.
user_features = user_data[['tconst', 'UserID', 'UserRating']].copy()

# Attach movie features to every user rating; the inner join keeps only
# ratings whose 'tconst' is present in the movie table.
merged_data = pd.merge(user_features, movie_features, on='tconst', how='inner')



In [None]:
# Map each UserID to a list of (movie feature vector, user rating) pairs,
# one pair per row of merged_data belonging to that user.
x_input = {}

for user_id, group in merged_data.groupby('UserID'):
    samples = []
    for _, row in group.iterrows():
        # Movie features (e_m): genre flags followed by director id,
        # average rating and vote count.
        movie_vector = list(row['genres_list'])
        movie_vector = movie_vector + [row['directorNames'], row['averageRating'], row['numVotes']]
        # User rating (r_m)
        rating = row['UserRating']
        samples.append((movie_vector, rating))
    x_input[user_id] = samples

# Show the samples of the first user as a sanity check.
userid = list(x_input)[0]
print(x_input[userid])


[([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 50418, 0.5333333333333332, 0.0013447309168752065], 0.3333333333333333)]


create vae model design
 - encoder
 - decoder
 - define loss function

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils
import torch.distributions
import torchvision
import numpy as np

class Encoder(nn.Module):
    """Map an input vector to the parameters of a diagonal Gaussian.

    A single ReLU hidden layer feeds two linear heads, one producing the
    mean and one producing the log-variance of the latent distribution.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim):
        """Create the hidden layer and the two latent heads.

        Args:
            input_dim: size of the last dimension of the input.
            hidden_dim: width of the hidden layer.
            latent_dim: size of the latent mean / log-variance vectors.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Head for the mean of the latent distribution.
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        # Head for the log-variance of the latent distribution.
        self.fc_log_var = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x):
        """Return ``(mu, log_var)`` computed from ``x``."""
        hidden = F.relu(self.fc1(x))
        return self.fc_mu(hidden), self.fc_log_var(hidden)

class Decoder(nn.Module):
    """Map a latent vector back to the input space.

    A single ReLU hidden layer is followed by a linear output layer and an
    element-wise sigmoid, so every output value lies in [0, 1].
    """

    def __init__( self, input_dim, hidden_dim, latent_dim ):
        """Create the hidden and output layers.

        Args:
            input_dim: size of the reconstructed output vector.
            hidden_dim: width of the hidden layer.
            latent_dim: size of the latent vector fed to the decoder.
        """
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(latent_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, input_dim)

    def forward(self, z):
        """Return the reconstruction of latent vector ``z``."""
        h = F.relu(self.fc1(z))  # Apply ReLU activation
        # The output layer must consume the hidden activation `h`, not `z`:
        # fc2 expects hidden_dim features, so passing `z` skips fc1 and
        # fails whenever latent_dim != hidden_dim.
        return torch.sigmoid(self.fc2(h))  # Per-feature values in [0, 1]

class PartialVAE(nn.Module):
    """Variational autoencoder that encodes a masked copy of its input."""

    def __init__(self, input_dim, hidden_dim, latent_dim):
        """Build the encoder/decoder pair from the same three dimensions."""
        super().__init__()
        self.encoder = Encoder(input_dim, hidden_dim, latent_dim)
        self.decoder = Decoder(input_dim, hidden_dim, latent_dim)

    def reparameterize(self, mu, log_var):
        """Sample ``mu + eps * std`` with ``eps`` drawn from a standard normal.

        ``std`` is ``exp(0.5 * log_var)``; the noise has the same shape as
        ``std``.
        """
        std = (0.5 * log_var).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def forward(self, x, mask):
        """Encode the masked input, sample a latent vector and decode it.

        Args:
            x: input tensor.
            mask: tensor multiplied element-wise with ``x`` before encoding;
                presumably 1 for observed entries and 0 for missing ones.

        Returns:
            Tuple ``(x_reconstructed, mu, log_var)``.
        """
        # Entries where the mask is zero are zeroed out before encoding.
        observed = x * mask
        mu, log_var = self.encoder(observed)
        latent = self.reparameterize(mu, log_var)
        reconstruction = self.decoder(latent)
        return reconstruction, mu, log_var

def loss_function(x_reconstructed, x, mu, log_var, mask):
    """Return the summed masked reconstruction loss plus the KL term.

    Args:
        x_reconstructed: decoder output.
        x: original input (reconstruction target).
        mu: latent mean from the encoder.
        log_var: latent log-variance from the encoder.
        mask: multiplied element-wise into both the reconstruction and the
            target before the binary cross-entropy is computed.

    Returns:
        Scalar tensor: binary cross-entropy (summed) + KL divergence (summed).
    """
    # Mask both sides so only the entries kept by the mask are compared.
    masked_prediction = x_reconstructed * mask
    masked_target = x * mask
    reconstruction_loss = F.binary_cross_entropy(
        masked_prediction, masked_target, reduction="sum"
    )

    # KL divergence between N(mu, exp(log_var)) and the standard normal.
    kl_terms = 1 + log_var - mu.pow(2) - log_var.exp()
    kl_divergence = -0.5 * torch.sum(kl_terms)

    return reconstruction_loss + kl_divergence


Training VAE
 - train model
  - forward pass
  - compute loss
  - backpropagation
  - run for # of epochs

Generate Latent Representations
- evaluate user preferences
- compare with other movies

Content-Based Recommendations
 - Compute Similarities
 - generate recommendations


Testing and Validations
 - Evaluate Recommendations
 - Fine Tune Model