In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os

from fastai.learner import *
from fastai.column_data import *

  from numpy.core.umath_tests import inner1d


In [2]:
# Directory holding the ml-20m files. Kept as a plain string because the same
# `path` value is handed to ColumnarModelData.from_data_frame further down.
path = "../data/ml-20m/"
ratings = pd.read_csv(f"{path}ratings.csv")

In [3]:
# Count the missing values in each column of the ratings table.
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [4]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
# (number of rows, number of columns) of the ratings table.
ratings.shape

(20000263, 4)

In [6]:
# Report how many distinct users and movies appear in the full ratings table.
print(
    "full dataset:\n"
    f"number users {ratings['userId'].nunique()}, "
    f"number movies {ratings['movieId'].nunique()}"
)

full dataset:
number users 138493, number movies 26744


In [10]:
# Hold out 5% of the rows for validation: with ~20M ratings that is ~1M validation rows.
val_idxs = get_cv_idxs(len(ratings), val_pct=0.05)

In [11]:
# Re-encode the raw user and movie ids as contiguous 0-based indices so they can be
# used directly as row indices into the embedding matrices. Indices are assigned in
# order of first appearance in `ratings`.
#
# Series.map with a dict performs the lookup without invoking a Python lambda once
# per row, which matters on a table with ~20M rows. Every id is a key of its own
# dict, so the result is the same integer column that apply(lambda ...) produced.
u_uniq = ratings.userId.unique()
user2idx = {o:i for i,o in enumerate(u_uniq)}
ratings.userId = ratings.userId.map(user2idx)

m_uniq = ratings.movieId.unique()
movie2idx = {o:i for i,o in enumerate(m_uniq)}
ratings.movieId = ratings.movieId.map(movie2idx)

In [12]:
# Sizes used to build the model: one embedding row per distinct user / movie index,
# and `n_factors` columns per embedding vector.
n_users = int(ratings["userId"].nunique())
n_movies = int(ratings["movieId"].nunique())
n_factors = 100
print(f"number users: {n_users}, number movies {n_movies}")

number users: 138493, number movies 26744


In [13]:
# Observed rating range; EmbeddingNet uses it to scale its sigmoid output.
min_rating = ratings.rating.min()
max_rating = ratings.rating.max()

In [14]:
# Inputs: everything except the target and the timestamp. Target: rating as float32.
x = ratings.drop(columns=['rating', 'timestamp'])
y = ratings.rating.astype(np.float32)

In [15]:
# Wrap the frame in a fastai ColumnarModelData: both id columns are treated as
# categorical fields and batches hold 64 rows.
data = ColumnarModelData.from_data_frame(
    path, val_idxs, x, y,
    cat_flds=['userId', 'movieId'],
    bs=64,
)

In [16]:
def get_emb(ni,nf):
    """Create an `nn.Embedding` with `ni` rows of width `nf`.

    The weights are re-initialised in place with values drawn uniformly
    from [-0.01, 0.01] before the layer is returned.
    """
    bound = 0.01
    emb = nn.Embedding(num_embeddings=ni, embedding_dim=nf)
    emb.weight.data.uniform_(-bound, bound)
    return emb

In [17]:
class EmbeddingNet(nn.Module):
    """Collaborative-filtering network: user and movie embeddings fed through a small MLP.

    The user and movie embedding vectors are concatenated, passed through two
    ReLU hidden layers (150 and 50 units) and a single-unit output layer. The
    output is squashed with a sigmoid and rescaled to the interval
    [min_rating - 0.5, max_rating + 0.5].

    Args:
        n_users: number of rows in the user embedding matrix.
        n_movies: number of rows in the movie embedding matrix.
        n_factors: width of each embedding vector. Defaults to the
            notebook-level `n_factors`.
        min_rating, max_rating: rating range used to scale the output. Default
            to the notebook-level `min_rating` / `max_rating`. The values are
            captured at construction time, so reassigning the globals later
            does not change an existing model's output range.
        dropouts: dropout probabilities applied to (embeddings, hidden layer 1,
            hidden layer 2). Defaults to no dropout.
    """
    def __init__(self, n_users, n_movies, n_factors=None, min_rating=None,
                 max_rating=None, dropouts=(0, 0, 0)):
        super().__init__()

        # Fall back to the notebook globals so that the existing call
        # `EmbeddingNet(n_users, n_movies)` keeps working unchanged.
        g = globals()
        if n_factors is None: n_factors = g['n_factors']
        if min_rating is None: min_rating = g['min_rating']
        if max_rating is None: max_rating = g['max_rating']
        self.n_factors = n_factors
        self.min_rating, self.max_rating = float(min_rating), float(max_rating)

        (self.u, self.m) = [get_emb(*o) for o in [
            (n_users, n_factors), (n_movies, n_factors)]]

        # layer 1 fully connected 150 units
        self.lin1 = nn.Linear(n_factors*2, 150)

        # layer 2 fully connected 50 units
        self.lin2 = nn.Linear(150, 50)

        # layer 3 fully connected 1 unit (output)
        self.lin3 = nn.Linear(50, 1)

        # dropouts
        p1, p2, p3 = dropouts
        self.drop1 = nn.Dropout(p1)
        self.drop2 = nn.Dropout(p2)
        self.drop3 = nn.Dropout(p3)

    def forward(self, cats, conts):
        """Predict a rating for each row of `cats`; `conts` is accepted but not used."""
        # column 0 holds the user index, column 1 the movie index
        users,movies = cats[:,0],cats[:,1]

        # look up both embeddings and concatenate them into one input vector
        x = torch.cat([self.u(users),self.m(movies)], dim=1)

        # dropout on the concatenated embeddings
        x = self.drop1(x)

        # first hidden layer
        x = self.drop2(F.relu(self.lin1(x)))

        # second hidden layer
        x = self.drop3(F.relu(self.lin2(x)))

        # output: torch.sigmoid replaces the deprecated F.sigmoid; the result is
        # stretched to [min_rating - 0.5, max_rating + 0.5]
        span = self.max_rating - self.min_rating + 1
        return torch.sigmoid(self.lin3(x)) * span + self.min_rating - 0.5

In [18]:
# NOTE(review): `wd` is assigned here but the optimizer below is created with
# weight_decay=0, so no weight decay is applied — confirm whether
# weight_decay=wd was intended.
wd=1e-5
# Build the network, move it to the GPU, and optimise all of its parameters with Adam.
model = EmbeddingNet(n_users, n_movies).cuda()
opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0)

In [19]:
# Presumably (re)sets the optimizer's learning rate to 1e-3 — verify against fastai's
# set_lrs. The optimizer was already created with lr=1e-3.
set_lrs(opt, 1e-3)

In [20]:
# Train for 3 epochs with mean-squared-error loss; the per-epoch training and
# validation loss are printed below.
fit(model, data, 3, opt, F.mse_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                       
    0      0.672815   0.672685  
    1      0.640712   0.647267                                       
    2      0.610662   0.63239                                        



[array([0.63239])]

In [33]:
# Export the learned user embedding matrix as a pickled DataFrame.
# Rows follow the 0-based user indices the model was trained on (see `user2idx`).
user_emb_path = "../data/collaborative_embeddings/user_embeddings_1.pkl"
# to_pickle does not create missing directories; make sure the target exists so a
# finished training run is not lost to a FileNotFoundError at save time.
os.makedirs(os.path.dirname(user_emb_path), exist_ok=True)
user_embeddings = pd.DataFrame(model.u.weight.data.cpu().numpy())
user_embeddings.to_pickle(user_emb_path)

In [34]:
# Export the learned movie embedding matrix as a pickled DataFrame.
# Rows follow the 0-based movie indices the model was trained on (see `movie2idx`).
movie_emb_path = "../data/collaborative_embeddings/movie_embeddings_1.pkl"
# to_pickle does not create missing directories; make sure the target exists so a
# finished training run is not lost to a FileNotFoundError at save time.
os.makedirs(os.path.dirname(movie_emb_path), exist_ok=True)
movie_embeddings = pd.DataFrame(model.m.weight.data.cpu().numpy())
movie_embeddings.to_pickle(movie_emb_path)