In [3]:
import pandas as pd
import numpy as np
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

In [27]:
from transformers import AutoModel, AutoTokenizer 
model_name = "google/bert_uncased_L-4_H-256_A-4" 
bert = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at google/bert_uncased_L-4_H-256_A-4 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
df = pd.read_csv('movieLens_train.csv')
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,occupation,zipcode,movie_title,sex,title_description,agegroup
0,196,242,3,881250949,49,writer,55105,Kolya (1996),male,A confirmed bachelor is in for the surprise of...,mid-adult
1,305,242,5,886307828,23,programmer,94086,Kolya (1996),male,A confirmed bachelor is in for the surprise of...,young-adult
2,6,242,4,883268170,42,executive,98101,Kolya (1996),male,A confirmed bachelor is in for the surprise of...,mid-adult
3,234,242,4,891033261,60,retired,94702,Kolya (1996),male,A confirmed bachelor is in for the surprise of...,senior
4,63,242,3,875747190,31,marketing,75240,Kolya (1996),male,A confirmed bachelor is in for the surprise of...,adult


In [11]:
df = df[['user_id', 'movie_id', 'rating', 'title_description']]
df['title_description'].fillna('', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [12]:
df['rating_binary'] = (df['rating'] >= 4).astype(int)

In [49]:
df.head()

Unnamed: 0,user_id,movie_id,rating,title_description,rating_binary
0,196,242,3,A confirmed bachelor is in for the surprise of...,0
1,305,242,5,A confirmed bachelor is in for the surprise of...,1
2,6,242,4,A confirmed bachelor is in for the surprise of...,1
3,234,242,4,A confirmed bachelor is in for the surprise of...,1
4,63,242,3,A confirmed bachelor is in for the surprise of...,0


In [13]:
df_train, df_val = train_test_split(df, test_size=0.25)

In [14]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=20):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        self.sig = nn.Sigmoid()
        self.nonlin = nn.ReLU()
        self.drop = nn.Dropout(p = 0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        U = self.nonlin(U)
        V = self.item_emb(v)
        V = self.drop(V)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return self.sig((U*V).sum(1) +  b_u  + b_v)

In [42]:
def train_one_epoch(model, train_df, optimizer):
    model.train()
    y = torch.FloatTensor(train_df.rating_binary.values)
    u = torch.LongTensor(train_df.user_id.values)
    v = torch.LongTensor(train_df.movie_id.values)
    y_hat = model(u,v)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    train_acc = accuracy_score(output,y)
    train_loss = F.binary_cross_entropy(y_hat, y)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    return train_loss.item(), train_acc

def valid_metrics(model, valid_df):
    model.eval()
    u = torch.LongTensor(valid_df.user_id.values)
    v = torch.LongTensor(valid_df.movie_id.values)
    y = torch.FloatTensor(valid_df.rating_binary.values)
    y_hat = model(u,v)
    valid_loss = F.binary_cross_entropy(y_hat, y)
    output = torch.as_tensor(y_hat > 0.5, dtype = torch.int8)
    valid_acc = accuracy_score(output,y)
    return valid_loss.item(), valid_acc



def MF_training(model, df_train, df_val, epochs=10, lr=0.01, wd=0.0, evaluate=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for i in tqdm(range(epochs)):
        train_loss, train_acc = train_one_epoch(model, df_train, optimizer)
        valid_loss, valid_acc = valid_metrics(model, df_val) 
        if i%10== 0:
            print("train loss %.3f train acc %.3f valid loss %.3f valid acc %.3f" % (train_loss,train_acc,valid_loss, valid_acc))

def training(model, train_loader, valid_loader, epochs=10, lr=0.01, wd=0.0, num_points_val=len(df_val)):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    lossFun = nn.BCEWithLogitsLoss()
    train_losses = []
    test_losses = []
    test_accs = []
    for i in range(epochs):
        train_loss = one_pass(model, train_loader, optimizer, lossFun, backwards=False)
        train_losses.append(train_loss)
        test_acc, test_loss = one_pass_acc(model, valid_loader, lossFun, num_points_val)
        test_losses.append(test_loss)
        test_accs.append(test_acc)
        if i%10== 0:
            print("train loss %.3f test loss %.3f test acc %.3f" % (train_loss,test_loss, test_acc))
    return train_losses, test_losses, test_accs


def one_pass(model, dataloader, optimizer, lossFun, backwards=True):
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0
    for inputs in dataloader:
        y = inputs[-1]
        y_pred = model(*inputs[:-1])
        loss = lossFun(y_pred, y)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    return avg_loss

def one_pass_acc(model, dataloader, lossFun, num_points):
    model.eval()
    total_incorrect = 0
    losses = []
    
    for inputs in dataloader:
        y = inputs[-1]
        y_pred = model(*inputs[:-1])
        valid_loss = lossFun(y_pred, y).item()
        losses.append(valid_loss)
        y_pred = (y_pred > 0.5).float()
        
        total_incorrect += torch.count_nonzero(y - y_pred).item()
    
    percent_wrong = total_incorrect / num_points
    
    acc = 1 - percent_wrong
    return acc, np.mean(losses)

def make_tokens_list(id_to_text, model, tokenizer):
    tokens = {}
    for k in tqdm(id_to_text):
        v = id_to_text[k]
        inputs = tokenizer(v, return_tensors="pt")
        outputs = model(**inputs, output_hidden_states=True)
        tokens[k] = outputs.hidden_states[-1][0,0,:]
    return tokens

In [26]:
MF_model = MF(df.user_id.max()+1, df.movie_id.max()+1, emb_size=100)
MF_training(MF_model, df_train, df_val, epochs=50, lr=.01, wd=0)

  0%|          | 0/50 [00:00<?, ?it/s]

train loss 0.690 train acc 0.556 valid loss 0.681 valid acc 0.572
train loss 0.588 train acc 0.713 valid loss 0.615 valid acc 0.688
train loss 0.550 train acc 0.742 valid loss 0.601 valid acc 0.695
train loss 0.505 train acc 0.777 valid loss 0.593 valid acc 0.699
train loss 0.439 train acc 0.822 valid loss 0.592 valid acc 0.703


In [18]:
movie_title_df = pd.DataFrame(df.groupby('movie_id')['title_description'].apply(list).apply(lambda x: x[0])\
                                                                                           .fillna('')\
                                                                                           .apply(lambda x: x[:256]))

In [19]:
idx_text = zip(movie_title_df.index.to_list(), movie_title_df['title_description'].fillna('').to_list())

In [20]:
id_to_text = {x[0]:x[1] for x in idx_text}

In [47]:
id_to_text[1]

'A cowboy doll is profoundly threatened and jealous when a new spaceman figure supplants him as top toy in a boys room.::Kenneth Chisholm A little boy named Andy loves to be in his room, playing with his toys, especially his doll named "Woody". But, what do'

In [35]:
class NN(nn.Module):
    def __init__(self, num_users, num_items, emb_user_size=20, 
                 emb_item_size=20, bert_emb_size=256, hidden_layer_size=64):
        super(NN, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_user_size)
        self.item_emb = nn.Embedding(num_items, emb_item_size)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.sig = nn.Sigmoid()
        self.linear1 = nn.Linear(emb_user_size+emb_item_size+bert_emb_size, hidden_layer_size)
        self.linear2 = nn.Linear(emb_user_size+emb_item_size+bert_emb_size, hidden_layer_size)
        self.linear3 = nn.Linear(64, 16)
        self.linear4 = nn.Linear(16,1)
        self.rel = nn.ReLU()

    def forward(self, u, v, cls):
        U = self.user_emb(u)
        V = self.item_emb(v)
        ensemble = torch.cat((U,V,cls),dim=1)
        pred = self.linear1(ensemble)
        pred = self.rel(pred)
        pred = self.linear2(ensemble)
        pred = self.rel(pred)
        pred = self.linear3(pred)
        pred = self.rel(pred)
        pred = self.linear4(pred)
        return self.sig(pred.squeeze())

In [28]:
token_dict = make_tokens_list(id_to_text, bert, tokenizer)

  0%|          | 0/1682 [00:00<?, ?it/s]

In [48]:
token_dict[1]

tensor([-1.1425, -0.6593,  1.5027,  0.8974, -0.7918, -0.1748, -1.7075, -1.5581,
        -2.9510, -0.7463, -0.2770, -0.0983,  1.9574,  0.1135, -0.1602, -0.6071,
         0.1358, -0.2777, -0.5894,  0.7119,  1.0803,  0.4091, -0.9735,  1.2420,
         0.5479,  0.1506,  1.6044, -0.8378,  0.2299,  0.8203, -0.1484,  0.4122,
         1.3269,  0.3511,  0.4023, -1.3135, -0.1423, -0.0306,  0.1164, -1.1530,
         0.1273,  0.4547,  1.7098, -0.7452,  0.1013,  0.3143, -0.8149, -0.6747,
         0.2637,  1.0025,  0.9711,  1.5056,  0.0115,  1.1679, -0.9042, -0.8679,
        -0.6701,  1.2456,  0.0174, -0.1229,  0.9673, -0.8733, -0.0580, -1.0569,
        -0.0248,  0.3868,  0.1275, -1.3822, -0.7211,  0.8551, -0.0082, -0.0286,
         1.2046,  0.5487,  1.0417, -0.3140, -0.0652, -0.5354, -0.1423,  1.3656,
         0.6835,  0.2675,  0.3130, -1.4495, -0.0594,  0.4048,  1.7734,  0.1812,
        -0.7780,  0.5523, -0.6006, -0.2637,  0.3812,  0.6064, -0.7656,  0.4696,
         1.5826, -0.6075, -0.0207,  0.91

In [29]:
class MovieDataset(Dataset):
    def __init__(self, df, token_dict):
        self.df = df
        self.token_dict = token_dict
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        product_idx = row['movie_id']
        user_idx = row['user_id']
        cls_token = self.token_dict[product_idx]
        
        u = torch.tensor(user_idx)
        v = torch.tensor(product_idx)
                
        y = torch.tensor(row['rating_binary']).float()
        

        return u, v, cls_token, y

In [30]:
train_ds = MovieDataset(df_train, token_dict)
val_ds = MovieDataset(df_val, token_dict)
df_train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)
df_val_dl = DataLoader(val_ds, batch_size=256, shuffle=False)

In [36]:
NNh = NN(df.user_id.max()+1, df.movie_id.max()+1, 50, 50, 256)

In [46]:
trainL, testL, testA = training(NNh, df_train_dl, df_val_dl, epochs = 51)

train loss 0.664 test loss 0.721 test acc 0.621
train loss 0.580 test loss 0.681 test acc 0.696
train loss 0.564 test loss 0.622 test acc 0.722
train loss 0.491 test loss 0.572 test acc 0.723
train loss 0.422 test loss 0.523 test acc 0.738
train loss 0.380 test loss 0.525 test acc 0.722
