In [2]:
from google.colab import drive
import os

drive.mount('/content/gdrive')
%cd gdrive/MyDrive/wb_competition/

Mounted at /content/gdrive
/content/gdrive/MyDrive/wb_competition


## Metrics

In [29]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [20]:
os.environ['OPENBLAS_NUM_THREADS'] ='1'
!pip install implicit==0.5.1

Collecting implicit==0.5.1
  Downloading implicit-0.5.1.tar.gz (71 kB)
[?25l[K     |████▋                           | 10 kB 33.9 MB/s eta 0:00:01[K     |█████████▏                      | 20 kB 39.8 MB/s eta 0:00:01[K     |█████████████▊                  | 30 kB 42.9 MB/s eta 0:00:01[K     |██████████████████▎             | 40 kB 27.0 MB/s eta 0:00:01[K     |███████████████████████         | 51 kB 11.4 MB/s eta 0:00:01[K     |███████████████████████████▌    | 61 kB 10.6 MB/s eta 0:00:01[K     |████████████████████████████████| 71 kB 6.4 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: implicit
  Building wheel for implicit (PEP 517) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.5.1-cp37-cp37m-linux_x86_64.whl size=868482 sha256=724

In [23]:
import json
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares

# DIR = '/kaggle/input'

In [4]:
def split_str(row):
    return row.split()


def get_ids_dict(data_train, data_test):
    features_df = list(data_train['Data'].apply(split_str).explode().unique())
    targets_df = list(data_train['Target'].apply(split_str).explode().unique())
    test_voc_df = list(data_test['Data'].apply(split_str).explode().unique())
    train_vocab = set(
        features_df
        + targets_df
        + test_voc_df
    )
    num_to_id = dict(enumerate(train_vocab))
    id_to_num = dict(zip(num_to_id.values(), num_to_id.keys()))
    return num_to_id, id_to_num


DIR = 'data'


for dirname, _, filenames in os.walk(DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data/test.csv
data/train.csv


In [11]:
data_train = pd.read_csv(os.path.join(dirname, 'train.csv'), index_col=0)
data_test = pd.read_csv(os.path.join(dirname, 'test.csv'), index_col=0)


In [15]:
num_to_id, id_to_num = get_ids_dict(data_train, data_test)

user_c = {id:i for i, id in enumerate(list(data_train.index.unique()))}
data_train.index = data_train.index.map(user_c)
user_c_test = {id:i for i, id in enumerate(list(data_test.index.unique()))}
data_test.index = data_test.index.map(user_c_test)

In [17]:
y = data_train['Target'].apply(split_str).explode()
y = y.map(id_to_num)
y = y.reset_index(level=0)

data_train = data_train['Data'].apply(split_str).explode()
data_train = data_train.map(id_to_num)
data_train = data_train.reset_index(level=0)
data_train['Value'] = 1

data_test = data_test['Data'].apply(split_str).explode()
data_test = data_test.map(id_to_num)
data_test = data_test.reset_index(level=0)
data_test['Value'] = 1

In [24]:
def create_user_item_matrix(intersect_df, len_items, len_user=200000):
    row = intersect_df['Id']
    col = intersect_df['Data']
    sparse_matrix = csr_matrix(
        (intersect_df['Value'].astype(np.float16), (row, col)), 
        shape=(len_user, len_items), 
        dtype=np.float16)
    return sparse_matrix

train_matrix = create_user_item_matrix(data_train, len(list(id_to_num.values())))
test_matrix = create_user_item_matrix(data_test, len(list(id_to_num.values())))

# Implicit

In [25]:
train_matrix = bm25_weight(train_matrix, K1=100, B=0.8)
train_matrix = train_matrix.tocsr()

## ALS

In [26]:
model = AlternatingLeastSquares(factors=96, regularization=0.05)
model.fit(train_matrix)

  0%|          | 0/15 [00:00<?, ?it/s]

In [132]:
userid = 0
ids, scores = model.recommend(userid, train_matrix[userid], N=10, filter_already_liked_items=True, recalculate_user=True)
for i, id in enumerate(ids):
    print(id, scores[i])

151191 0.085972846
43095 0.05312638
392598 0.047015306
556172 0.04327568
546946 0.043003827
903489 0.04284439
989678 0.04257977
631128 0.038731944
172486 0.038197756
613039 0.037745982


In [153]:
id = list(user_c.keys())[0]
list(y.loc[y['Id'] == id, 'Target'])

[418352, 23289, 1082643, 753128, 1040991, 959350]

In [27]:
userids = np.arange(200000)
ids, scores = model.recommend(userids, train_matrix[userids], N=10, filter_already_liked_items=True)
ids = ids.tolist()
y_all = y.loc[y['Id'].isin(userids)].groupby('Id')['Target'].apply(list).tolist()

In [30]:
mapk(y_all, ids)

0.12294867071365582

In [34]:
test_matrix = bm25_weight(train_matrix, K1=100, B=0.8)
test_matrix = test_matrix.tocsr()

In [35]:
userids = np.arange(200000)
ids, scores = model.recommend(userids, test_matrix[userids], N=10, filter_already_liked_items=True)

## LMF

In [43]:
train_matrix = train_matrix.astype(np.float16)

In [44]:
from implicit.lmf import LogisticMatrixFactorization

model_lmf = LogisticMatrixFactorization(factors=64, regularization=0.5)
model_lmf.fit(train_matrix)

  0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
userids = np.arange(200000)
ids, scores = model.recommend(userids, train_matrix[userids], N=10, filter_already_liked_items=True)
ids = ids.tolist()
y_all = y.loc[y['Id'].isin(userids)].groupby('Id')['Target'].apply(list).tolist()

## Save results

In [38]:
# ids = ids.tolist()
ids = [' '.join([str(id) for id in row]) for row in ids]

In [39]:
data_test = pd.read_csv(os.path.join(dirname, 'test.csv'), index_col=0)
data_test = data_test.reset_index(level=0)
submission = pd.DataFrame({'Id':data_test['Id'], 'Predicted':ids})
submission.to_csv('data/' + "submission.csv", index=False)

In [40]:
submission

Unnamed: 0,Id,Predicted
0,1,115766 583212 300602 618281 117292 1101952 611...
1,3,473211 182526 832195 380985 1132601 900260 998...
2,25,123599 396039 310984 36608 1048069 1120487 196...
3,28,300602 1050500 117292 1022685 583212 804127 73...
4,32,393166 736129 344162 973908 996043 555292 3265...
...,...,...
199995,2024291,234743 349780 1056403 1098609 907471 81954 814...
199996,2024294,174939 836129 903378 607095 393287 104197 9825...
199997,2024305,930718 736129 266636 632279 163592 82475 91560...
199998,2024326,172930 786151 409402 930763 29423 187489 70438...


# MXNET

In [None]:
from mxnet import gluon, init, np, npx
from mxnet.gluon import nn
from d2l import mxnet as d2l

In [71]:
batch_size = 2000
train_iter = gluon.data.DataLoader(
    data_train, shuffle=True, last_batch='rollover', batch_size=batch_size,
    num_workers=d2l.get_dataloader_workers())
test_iter = gluon.data.DataLoader(
    data_test, shuffle=False, last_batch='rollover', batch_size=batch_size,
    num_workers=d2l.get_dataloader_workers())

In [21]:
class FM(nn.Block):
    def __init__(self, field_dims, num_factors):
        super(FM, self).__init__()
        num_inputs = int(sum(field_dims))
        self.embedding = nn.Embedding(num_inputs, num_factors)
        self.fc = nn.Embedding(num_inputs, 1)
        self.linear_layer = nn.Dense(1, use_bias=True)

    def forward(self, x):
        square_of_sum = np.sum(self.embedding(x), axis=1) ** 2
        sum_of_square = np.sum(self.embedding(x) ** 2, axis=1)
        x = self.linear_layer(self.fc(x).sum(1)) \
            + 0.5 * (square_of_sum - sum_of_square).sum(1, keepdims=True)
        x = npx.sigmoid(x)
        return x

In [76]:
np.zeros(1137138, dtype=np.int64)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [73]:
data_train.field_dims = 1137138

In [None]:
devices = d2l.try_all_gpus()
net = FM(data_train.field_dims, num_factors=50)
net.initialize(init.Xavier(), ctx=devices)
lr, num_epochs, optimizer = 0.02, 30, 'adam'
trainer = gluon.Trainer(net.collect_params(), optimizer,
                        {'learning_rate': lr})
loss = gluon.loss.SigmoidBinaryCrossEntropyLoss()
d2l.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)