In [6]:
import numpy as np
import pandas as pd
import torch

from data_reader import read_interaction_matrix as interactions, read_df

from datasets import Dataset


if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


R = interactions()
df = read_df('train')
df.head()

ds = Dataset.from_pandas(df)


In [1]:
# !huggingface-cli login

# ds.push_to_hub('aeirya/irhw3', private=True)

# from transformers import HfApi
# api = HfApi()

In [186]:
users = list(set(df['user_id']))
items = list(set(df['item_id']))

n_users = len(users)
n_items = len(items)

user2idx = { x:i for i,x in enumerate(users) }
item2idx = { x:i for i,x in enumerate(items) }

In [9]:
# for item in items:
#     reviews = df[df['item_id'] == item]['review_text']

In [187]:
import torch
from transformers import AutoTokenizer, AutoModel

model_name = 'bert-base-uncased'
# model_name = 'distilbert-base-uncased'
# model_name = 'distilroberta-base'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [49]:
df = df.dropna()

tok_no_pad = tokenizer(df['review_text'].tolist(), padding=False, truncation=True)
n = [len(e.ids) for e in tok_no_pad[:]]
pd.Series(n).describe(percentiles=[0.5, 0.6, 0.7, 0.8, 0.9])

count    23037.000000
mean       163.899727
std        112.273022
min          5.000000
50%        134.000000
60%        159.000000
70%        191.000000
80%        237.000000
90%        323.000000
max        512.000000
dtype: float64

In [51]:
def tok(sentences, max_length=200):
    return tokenizer(
        sentences,
        add_special_tokens=True,
        return_tensors="pt",
        return_attention_mask=True,
        padding=True,
        truncation=True,
        max_length=max_length,
        # pad_to_max_length=True
    )

model.eval()
if torch.device
model.to('')
def bert(texts, tokenized=True):
    tok_input = tok(texts) if not tokenized else texts
    
    with torch.no_grad():
        output = model(**tok_input).last_hidden_state
    
    return output


def sent2vec(output, alpha=0.2):
    '''
    aggregate tokens in every sentence together
    
    @param output: (n_sentences x n_tokens x n_hidden_size)
    '''
    
    M = output.max(axis=1).values
    mean = output.mean(axis=1)
    return alpha * M + (1-alpha) * mean

In [57]:
from datasets import Dataset

DF = df
DF['n_tok'] = n
DF = DF.dropna().sort_values(by='n_tok')[['item_id', 'review_text', 'n_tok']].iloc[:2000]
ds = Dataset.from_pandas(DF.reset_index())

In [72]:
def tokenize_ds(ds):
    # return tok(ds)
    return tok(ds['review_text'])

# tok_ds = ds.map(
#     tokenize_ds, 
#     batched=True, 
#     batch_size=10000, 
#     input_columns='review_text',
#     remove_columns='review_text',
#     keep_in_memory=True,
#     desc='Tokenizing'
#     ).sort('item_id')

# tok_ds.set_format('torch')

In [65]:
from torch.utils.data import DataLoader, RandomSampler

# sampler = RandomSampler(ds, num_samples=1000)
dataloader = DataLoader(ds, batch_size=64, shuffle=False, 
num_workers=0
# , sampler=sampler
)

from tqdm import tqdm

bert_keys = [
    'input_ids', 'attention_mask'
            #  , 'token_type_ids'
             ]

iterator = tqdm(dataloader)

tok_sen = {k:[] for k in items}

for batch in iterator:
    # input = { key: batch[key] for key in bert_keys }
    input = tokenize_ds(batch)
    out = bert(input)

100%|██████████| 32/32 [00:25<00:00,  1.27it/s]


In [117]:
def be(ds):
    input = tokenize_ds(ds)
    out = bert(input)
    return {'out': out}

enc_batch_size = 20
enc_ds = ds.map(be, batched=True, batch_size=enc_batch_size)
enc_ds.set_format('torch')
# sed = enc_ds.sort(['item_id', 'n_tok'])

In [124]:
groups = enc_ds.to_pandas().groupby('item_id')

for item, group in groups:
    out = torch.tensor(group['out'])
    print(out.shape)
    break

ValueError: could not determine the shape of object type 'Series'

In [180]:
from datasets import Dataset

enc = torch.zeros((len(items), 768))

DF = df.dropna()[['item_id', 'review_text']].sort_values('item_id').iloc[0:2000].reset_index()
input_ds = Dataset.from_pandas(DF[['review_text']]).map(tok, batched=True, input_columns='review_text', remove_columns=['review_text'])
input_ds.set_format('torch')

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [181]:
input_ds

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2000
})

In [182]:
for item, group in tqdm(DF.groupby(by='item_id', dropna=True)):
    # out = bert(group['review_text'].tolist(), tokenized=False)
    out = bert(input_ds[group.index])
    v = sent2vec(out).mean(axis=0)
    print(out.shape, v.shape)
    enc[item2idx[item], :] = v


  2%|▏         | 1/62 [00:00<00:49,  1.23it/s]

torch.Size([8, 200, 768]) torch.Size([768])


  3%|▎         | 2/62 [00:01<00:44,  1.34it/s]

torch.Size([9, 200, 768]) torch.Size([768])


  5%|▍         | 3/62 [00:02<01:01,  1.05s/it]

torch.Size([21, 200, 768]) torch.Size([768])


  6%|▋         | 4/62 [00:10<03:22,  3.48s/it]

torch.Size([95, 200, 768]) torch.Size([768])


  8%|▊         | 5/62 [00:15<03:48,  4.01s/it]

torch.Size([65, 200, 768]) torch.Size([768])


 10%|▉         | 6/62 [00:17<03:06,  3.33s/it]

torch.Size([30, 200, 768]) torch.Size([768])


 11%|█▏        | 7/62 [00:20<03:06,  3.40s/it]

torch.Size([52, 200, 768]) torch.Size([768])


 13%|█▎        | 8/62 [00:22<02:36,  2.90s/it]

torch.Size([26, 200, 768]) torch.Size([768])


 15%|█▍        | 9/62 [00:24<02:17,  2.59s/it]

torch.Size([29, 200, 768]) torch.Size([768])


 15%|█▍        | 9/62 [00:25<02:27,  2.78s/it]


KeyboardInterrupt: 

In [171]:
input_ds[group.index]

{'input_ids': tensor([[  101,  2383,  2179,  ...,     0,     0,     0],
         [  101,  2004,  1037,  ...,     0,     0,     0],
         [  101,  2054,  1037,  ...,     0,     0,     0],
         ...,
         [  101,  1996, 15212,  ...,  2553,  1012,   102],
         [  101,  1045,  2288,  ...,  2034,  2154,   102],
         [  101,  1045,  2031,  ...,  2075,  2007,   102]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]])}

In [47]:
h_size = 768
review_freq = np.zeros(n_items)
token_avg = torch.zeros((n_items, h_size))

In [49]:
from tqdm import tqdm

for batch in tqdm(dataloader):
    ids = batch['item_id']
    print(ids)
    break

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  0%|          | 0/3 [00:00<?, ?it/s]


KeyError: 0

In [200]:
enc = torch.zeros((len(items), 768))

for (item, data) in DF.groupby('item_id')['review_text']:
    inputs = [tok_reviews[i] for i in data.index]
    for input in inputs:
        out = bert(**input)
        break
    i = item2idx[item]
    enc[i, :] = sent2vec(out).mean(axis=0)

TypeError: __main__.bert() argument after ** must be a mapping, not tokenizers.Encoding

In [None]:
from sklearn.decomposition import PCA

res = PCA(64).fit_transform(a.T)

ValueError: n_components=64 must be between 0 and min(n_samples, n_features)=30 with svd_solver='full'