In [1]:
import os
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import pyarrow.parquet as pq
import pyarrow as pa
import scipy
import implicit
from sklearn.decomposition import TruncatedSVD
from gensim import corpora
from gensim.models import LdaMulticore

warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide configuration.
DATA_FOLDER = "competition_data_final_pqt"  # directory listed and read file-by-file as parquet
RANDOM_SEED = 42  # seed constant for the stochastic steps

### ALS

In [6]:
# Sum request_cnt per (user_id, url_host) one file at a time and concatenate
# the per-file aggregates.
# os.listdir returns entries in arbitrary order and also lists sub-directories,
# so sort the names and keep regular files only.
data_files = sorted(
    name for name in os.listdir(DATA_FOLDER)
    if os.path.isfile(os.path.join(DATA_FOLDER, name))
)

per_file_agg = []
for file in tqdm(data_files):
    data = pq.read_table(os.path.join(DATA_FOLDER, file))
    data_agg = (
        data.select(['user_id', 'url_host', 'request_cnt'])
        .group_by(['user_id', 'url_host'])
        .aggregate([('request_cnt', "sum")])
    )
    per_file_agg.append(data_agg)

# Aggregation happens within each file only: the same (user_id, url_host) pair
# can still appear once per file in the concatenated table.
all_data_agg = pa.concat_tables(per_file_agg)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:29<00:00,  2.96s/it]


In [7]:
# Give every distinct url_host a contiguous integer id.
url_set = set(all_data_agg.column('url_host').to_pandas())
print(f'{len(url_set)} urls')
url_dict = {url: idurl for idurl, url in enumerate(url_set)}

# Do the same for users and keep the reverse lookup (integer id -> user_id).
usr_set = set(all_data_agg.column('user_id').to_pandas())
print(f'{len(usr_set)} users')
usr_dict = {usr: user_id for user_id, usr in enumerate(usr_set)}
inv_usr_map = {user_id: usr for usr, user_id in usr_dict.items()}

199683 urls
415317 users


In [8]:
# Translate raw ids into matrix coordinates: users index the rows, url hosts
# index the columns, and the summed request counts are the stored values.
values = np.array(all_data_agg.column('request_cnt_sum').to_pandas())
rows = np.array(all_data_agg.column('user_id').to_pandas().map(usr_dict))
cols = np.array(all_data_agg.column('url_host').to_pandas().map(url_dict))

# Size the matrix from the largest row / column index that is present.
mat_shape = (rows.max() + 1, cols.max() + 1)
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=mat_shape)

In [None]:
scr_mat = mat.tocsr()

# For every user row, list the url columns that hold a non-zero count.
# The CSR index arrays are read directly (row i occupies
# indices[indptr[i]:indptr[i + 1]]) instead of materialising a one-row sparse
# matrix per user; explicitly stored zeros are filtered out, as nonzero() does.
url_data = [
    scr_mat.indices[start:stop][scr_mat.data[start:stop] != 0].tolist()
    for start, stop in zip(scr_mat.indptr[:-1], scr_mat.indptr[1:])
]

In [None]:
# Factorise the user x url count matrix with implicit-feedback ALS.
# random_state is set from RANDOM_SEED so the fit is seeded rather than
# starting from an unseeded random initialisation.
als = implicit.approximate_als.FaissAlternatingLeastSquares(
    factors=100,
    iterations=30,
    use_gpu=False,
    calculate_training_loss=False,
    regularization=0.1,
    random_state=RANDOM_SEED,
)

# Hand over CSR explicitly (rows = users, columns = urls); `mat` is built as
# COO, and tocsr() returns the matrix unchanged if it is CSR already.
als.fit(mat.tocsr())

In [None]:
# Read the learned factor matrices off the wrapped ALS model:
# u_factors <- user factors, w_factors <- item factors.
u_factors, w_factors = als.model.user_factors, als.model.item_factors

In [None]:
# One embedding per user: the unweighted mean of the item-factor rows selected
# by that user's url index list.
als_usr_emb = [np.mean(w_factors[urls], axis=0) for urls in url_data]

In [None]:
# One row per user embedding; the positional row index is translated back to
# the original user_id through inv_usr_map.
als_embs_df = pd.DataFrame(als_usr_emb)
als_embs_df['user_id'] = als_embs_df.index.map(inv_usr_map)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('embeddings', exist_ok=True)
als_embs_df.to_csv('embeddings/als_embeddings.csv', index=False)

### SVD

In [6]:
# Cast the stored values to float64 and switch the matrix to CSR storage.
mat = mat.astype(np.float64).tocsr()

In [7]:
# Embed urls with a truncated SVD of the transposed matrix (mat.T), so each
# row of url_embeddings corresponds to one column of `mat`.
# random_state is set from RANDOM_SEED to fix the solver's random initialisation.
svd = TruncatedSVD(n_components=600, algorithm='arpack', random_state=RANDOM_SEED)
url_embeddings = svd.fit_transform(mat.T)

In [8]:
# For every row of `mat`, list the column indices that hold a non-zero value.
# The CSR index arrays are read directly (row i occupies
# indices[indptr[i]:indptr[i + 1]]) instead of slicing out a one-row sparse
# matrix per row; explicitly stored zeros are filtered out, as nonzero() does.
mat_csr = mat.tocsr()  # returns `mat` itself when it is already CSR
url_data = [
    mat_csr.indices[start:stop][mat_csr.data[start:stop] != 0].tolist()
    for start, stop in zip(mat_csr.indptr[:-1], mat_csr.indptr[1:])
]

In [9]:
# One embedding per user: the unweighted mean of the SVD url vectors selected
# by that user's url index list.
all_usr_emb = [np.mean(url_embeddings[urls], axis=0) for urls in url_data]

# Tabulate the embeddings and attach the original user_id, looked up from the
# positional row index through inv_usr_map.
svd_embs = pd.DataFrame(all_usr_emb).assign(
    user_id=lambda frame: frame.index.map(inv_usr_map)
)

In [11]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('embeddings', exist_ok=True)
svd_embs.to_csv('embeddings/svd_embeddings.csv', index=False)

### LDA

In [11]:
# Convert the aggregated table to pandas. The guard keeps this cell
# re-runnable: after the first run all_data_agg is already a DataFrame, which
# has no .to_pandas() method.
if not isinstance(all_data_agg, pd.DataFrame):
    all_data_agg = all_data_agg.to_pandas()

# Build one "document" per user: every url_host repeated request_cnt_sum times.
# Iterating the three columns in lockstep avoids constructing a namedtuple for
# each row, and passing `total` lets tqdm show real progress.
user2urls = defaultdict(list)
agg_rows = zip(
    all_data_agg['user_id'],
    all_data_agg['url_host'],
    all_data_agg['request_cnt_sum'],
)
for user_id, url_host, request_cnt in tqdm(agg_rows, total=len(all_data_agg)):
    user2urls[user_id].extend([url_host] * request_cnt)

# Documents in first-seen user order; list(user2urls.keys()) has the same order.
user_urls = list(user2urls.values())

32277669it [02:50, 189809.65it/s]


In [13]:
# Build the token vocabulary from the per-user url lists, then convert every
# list into its bag-of-words representation.
dictionary = corpora.Dictionary(user_urls)
corpus = list(map(dictionary.doc2bow, user_urls))

In [14]:
# Fit a 50-topic LDA model over the per-user url documents.
# random_state is set from RANDOM_SEED so the model is seeded.
# NOTE(review): with several workers, runs may still differ slightly — confirm
# if exact reproducibility is required.
lda = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=50,
    workers=8,
    passes=20,
    eval_every=None,
    random_state=RANDOM_SEED,
)

In [21]:
# Topic-term matrix, indexed as topics[topic, term_id].
topics = lda.get_topics()
num_topics = topics.shape[0]

# Transposed, contiguous copy so a user's term vectors can be gathered with a
# single fancy-index instead of one Python-level column slice per url.
term_topics = np.ascontiguousarray(topics.T)

all_user_emb = []
for sites in user_urls:
    if not sites:
        # A user with no urls gets a flat zero vector of the same length as
        # every other embedding (the previous fallback had shape (1, 50),
        # which does not line up with the 1-D rows when tabulated).
        all_user_emb.append(np.zeros(num_topics, dtype=topics.dtype))
        continue
    token_ids = [dictionary.token2id[url] for url in sites]
    # Urls are repeated once per request in `sites`, so this mean is weighted
    # by request count.
    all_user_emb.append(term_topics[token_ids].mean(axis=0))

In [25]:
# One row per user; user2urls keys are in the same order as the documents the
# embeddings were computed from.
lda_embs = pd.DataFrame(all_user_emb)
lda_embs['user_id'] = list(user2urls.keys())

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('embeddings', exist_ok=True)
lda_embs.to_csv('embeddings/lda_embeddings.csv', index=False)