In [1]:
import os,sys,gc

from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from glob import glob
import pickle

from datetime import timedelta

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

#Logging
import logging


def replace_handlers(target, handlers):
    """Swap the handlers attached to ``target`` for ``handlers``.

    Every handler currently attached is detached and closed before the new
    ones are added, so calling this repeatedly never accumulates handlers
    or leaves old log files open.

    Parameters
    ----------
    target : logging.Logger
        Logger whose handler list is replaced.
    handlers : iterable of logging.Handler
        Handlers to attach, in order.
    """
    # Iterate over a copy: removeHandler mutates target.handlers.
    for stale in list(target.handlers):
        target.removeHandler(stale)
        stale.close()
    for handler in handlers:
        target.addHandler(handler)


logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')

# Messages go both to logs.log and to the notebook output.
file_handler = logging.FileHandler('logs.log')
file_handler.setFormatter(formatter)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

# getLogger returns the same logger object every time this cell runs, so
# plain addHandler calls would stack a new pair of handlers per re-run and
# print each message several times. Replace instead of append.
replace_handlers(logger, [file_handler, stream_handler])



# item2vec from History: train article embeddings on the `article_id_fixed` sequences in history.parquet

In [3]:
#PATH
# Root directories of the train/validation data and of the test set.
TRAIN_DIR = Path('/home/data/train_large')
TEST_DIR = Path('/home/data/ebnerd_testset')

# Read history.parquet for each split, in train -> validation -> test order.
trn_history, val_history, test_history = (
    pl.read_parquet(split_dir / 'history.parquet')
    for split_dir in (TRAIN_DIR / 'train', TRAIN_DIR / 'validation', TEST_DIR / 'test')
)

In [4]:
from gensim.models import Word2Vec

In [5]:
# Training-split `article_id_fixed` column converted to a plain Python list.
# NOTE(review): the cells visible here never read `item_history` (they call
# .to_list() again themselves) -- confirm nothing later needs it before removing.
item_history = trn_history.get_column('article_id_fixed').to_list()

In [6]:
def create_item2vec(item_history, vector_size, *, window=5, min_count=1,
                    epochs=10, workers=12, seed=1):
    """Train a Word2Vec model on item sequences and return its embeddings.

    Parameters
    ----------
    item_history : iterable of sequences
        One sequence of item ids per entry; each sequence is handed to
        Word2Vec as a "sentence". The ids must be convertible to int32.
    vector_size : int
        Dimensionality of the learned embeddings.
    window, min_count, epochs, workers, seed
        Forwarded unchanged to ``Word2Vec``. The defaults reproduce the
        values that used to be hard-coded here (``seed=1`` is gensim's own
        default). Note that gensim only guarantees bit-identical reruns with
        a single worker thread.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary item, in ``model.wv.index_to_key`` order, with
        float32 columns ``vector_0`` .. ``vector_{vector_size - 1}`` followed
        by an int32 ``article_id`` column.
    """
    model = Word2Vec(sentences=item_history,
                     vector_size=vector_size,
                     window=window,
                     min_count=min_count,
                     sg=0,  # CBOW rather than skip-gram
                     hs=0,  # no hierarchical softmax
                     epochs=epochs,
                     workers=workers,
                     seed=seed)

    # model.wv.vectors is already the (n_items, vector_size) embedding matrix,
    # aligned row-for-row with index_to_key, so no per-item copy is needed.
    item_emb_matrix = np.asarray(model.wv.vectors, dtype=np.float32)

    # Name the columns vector_0, vector_1, ...
    vec_df = pd.DataFrame(
        item_emb_matrix,
        columns=[f'vector_{i}' for i in range(vector_size)],
    )

    # Store the item ids as int32, after the vector columns.
    vec_df['article_id'] = pd.Series(
        model.wv.index_to_key, index=vec_df.index
    ).astype('int32')

    return vec_df

In [7]:
#item2vec_1
# Train one 64-dimensional item2vec model per split and save each as parquet.
ITEM2VEC_DIR = Path('/home/data/item2vec_1')
ITEM2VEC_DIM = 64

# Create the output directory before training: otherwise a missing directory
# is only discovered at to_parquet time, after the model has been trained.
ITEM2VEC_DIR.mkdir(parents=True, exist_ok=True)

for split_label, split_history, out_name in (
    ('trn', trn_history, 'train_item2vec.parquet'),
    ('val', val_history, 'valid_item2vec.parquet'),
    ('test', test_history, 'test_item2vec.parquet'),
):
    logger.info(f'Creating item2vec_1 for {split_label}')

    vec_df = create_item2vec(split_history['article_id_fixed'].to_list(), ITEM2VEC_DIM)

    logger.info(f'shape of vec_df: {vec_df.shape}')

    #save as parquet
    vec_df.to_parquet(ITEM2VEC_DIR / out_name)

[2024-04-12 08:46:33,678][INFO] Creating item2vec_1 for trn
[2024-04-12 08:58:42,366][INFO] shape of vec_df: (69140, 65)
[2024-04-12 08:58:42,848][INFO] Creating item2vec_1 for val
[2024-04-12 09:09:50,585][INFO] shape of vec_df: (67719, 65)
[2024-04-12 09:09:50,887][INFO] Creating item2vec_1 for test
[2024-04-12 09:21:19,908][INFO] shape of vec_df: (68497, 65)
