In [1]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import math
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from tqdm import tqdm
from copy import deepcopy
import warnings
from gensim.models import Word2Vec
warnings.filterwarnings("ignore")

# Root directory of the competition data; the train and test click logs are
# read from the two sub-directories below.
path = '/root/kdd_cup_2020'
train_path = f'{path}/underexpose_train/local_2'
test_path = f'{path}/underexpose_test/local_2'


def get_all_click(now_phase=6):
    """Load the train and test click logs of phases ``0..now_phase`` as one frame.

    For every phase the train file is read from ``train_path`` and the test
    file from ``test_path``; the files have no header row and are given the
    columns ``user_id``, ``item_id`` and ``time``.

    Parameters
    ----------
    now_phase : int, default 6
        Last phase (inclusive) whose click files are loaded.

    Returns
    -------
    pandas.DataFrame
        All clicks in file order (train then test, phase by phase) with exact
        duplicate rows removed, keeping the last occurrence. The original
        per-file row index is kept, so index labels are not unique. An empty
        frame is returned when no phase is requested.
    """
    click_columns = ['user_id', 'item_id', 'time']

    # Collect the frames and concatenate once: growing a DataFrame inside the
    # loop copies all accumulated rows on every iteration, and
    # ``DataFrame.append`` no longer exists in pandas >= 2.0.
    frames = []
    for phase in range(now_phase + 1):
        frames.append(pd.read_csv(train_path + '/underexpose_train_click-{}.csv'.format(phase),
                                  names=click_columns))
        frames.append(pd.read_csv(test_path + '/underexpose_test_click-{}.csv'.format(phase),
                                  names=click_columns))

    if not frames:
        return pd.DataFrame()

    # De-duplicating once with keep='last' keeps the same rows, in the same
    # order, as de-duplicating after every phase: the last occurrence of a
    # row always survives each intermediate pass.
    whole_all_click = pd.concat(frames)
    return whole_all_click.drop_duplicates(subset=click_columns, keep='last')



def local_constr(phase, now_phase):
    """Load the item features and the cleaned click log used for the test recall.

    ``phase`` is accepted but not used by this function; clicks of all phases
    up to ``now_phase`` are loaded through ``get_all_click``.

    Returns a tuple ``(item_feat, item_feat_dict, all_click)`` where
    ``all_click`` is de-duplicated, sorted by ``user_id`` then ``time`` and
    re-indexed from 0. The shape of the click log is printed before and after
    the clean-up.
    """
    item_feat = pd.read_csv('/root/kdd_cup_2020/KDD_online_LB/data/item_feat_normalize.csv')
    item_feat_dict = item_embedding_dict(item_feat)

    raw_click = get_all_click(now_phase=now_phase)
    print(raw_click.shape)

    click_key = ['user_id', 'item_id', 'time']
    all_click = (
        raw_click
        .drop_duplicates(subset=click_key, keep='last')
        .sort_values(['user_id', 'time'])
        .reset_index(drop=True)
    )
    print(all_click.shape)

    return item_feat, item_feat_dict, all_click

def item_embedding_dict(df):
    """Map every ``item_id`` of ``df`` to its feature row as a numpy array.

    Every column except the first one is treated as a feature column, so the
    identifier is expected to be the first column of ``df``.
    """
    feature_columns = list(df.columns)[1:]
    feature_matrix = df[feature_columns].values
    return {item: vector for item, vector in zip(df['item_id'], feature_matrix)}


# Pad every user's candidate list with fallback items, then keep the top 50.
def get_predict(df, pred_col, top_fill):
    """Build the top-50 recommendation table, padding users with fallback items.

    Parameters
    ----------
    df : pandas.DataFrame
        Scored candidates with at least ``user_id``, ``item_id`` and
        ``pred_col`` columns.
    pred_col : str
        Name of the score column; higher scores rank first.
    top_fill : str
        Comma-separated fallback item ids. The k-th fallback item gets the
        score ``-k`` for every user, so earlier entries rank before later
        ones. Fallbacks only outrank real candidates whose score is below
        these negative values.

    Returns
    -------
    pandas.DataFrame
        One row per user: ``user_id`` followed by columns ``0, 1, ...`` that
        hold the item ids (as strings) in rank order, at most 50 per user.
        Users with fewer items are padded with missing values.
    """
    fill_items = [int(t) for t in top_fill.split(',')]
    fill_scores = [-1 * i for i in range(1, len(fill_items) + 1)]
    users = list(df['user_id'].unique())

    # After sorting, each user occupies len(fill_items) consecutive rows, so
    # the repeated item/score lists line up positionally with the users.
    fill_df = pd.DataFrame(users * len(fill_items), columns=['user_id']).sort_values('user_id')
    fill_df['item_id'] = fill_items * len(users)
    fill_df[pred_col] = fill_scores * len(users)

    # pd.concat replaces DataFrame.append (removed in pandas 2.0);
    # ignore_index avoids duplicate index labels in the combined frame.
    ranked = pd.concat([df, fill_df], ignore_index=True)
    ranked = ranked.sort_values(pred_col, ascending=False)
    # A real candidate and a fallback may be the same item: keep the
    # higher-scored row.
    ranked = ranked.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
    ranked = ranked.assign(
        rank=ranked.groupby('user_id')[pred_col].rank(method='first', ascending=False)
    )
    top50 = ranked[ranked['rank'] <= 50]

    joined = top50.groupby('user_id')['item_id'].apply(lambda x: ','.join([str(i) for i in x]))
    return joined.str.split(',', expand=True).reset_index()





In [2]:
# coding=utf-8
from __future__ import division
from __future__ import print_function

import datetime
import json
import sys
import time
from collections import defaultdict

import numpy as np

# Higher scores mean better performance.
def evaluate_each_phase(predictions, answers):
    """Compute NDCG@50 and hit-rate@50 on all answers and on the "half" subset.

    Parameters
    ----------
    predictions : dict
        ``user_id -> ranked sequence of predicted item ids``. Only the first
        50 entries are inspected; shorter sequences are allowed and simply
        count as a miss when the answer is not among them.
    answers : dict
        ``user_id -> (item_id, item_degree)``.

    Returns
    -------
    numpy.ndarray (float32)
        ``[ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half]``.
        The "half" metrics are averaged over the answers whose item degree is
        less than or equal to the median item degree of all answers.

    Raises
    ------
    IndexError
        If ``answers`` is empty (no median can be taken).
    KeyError
        If a user in ``answers`` has no entry in ``predictions``.
    """
    list_item_degress = sorted(item_degree for _, item_degree in answers.values())
    median_item_degree = list_item_degress[len(list_item_degress) // 2]

    num_cases_full = 0.0
    ndcg_50_full = 0.0
    ndcg_50_half = 0.0
    num_cases_half = 0.0
    hitrate_50_full = 0.0
    hitrate_50_half = 0.0
    for user_id, (item_id, item_degree) in answers.items():
        user_predictions = predictions[user_id]
        # Never read past the end of a prediction list shorter than 50.
        cutoff = min(50, len(user_predictions))
        rank = 0
        while rank < cutoff and user_predictions[rank] != item_id:
            rank += 1
        hit = rank < cutoff
        gain = 1.0 / np.log2(rank + 2.0) if hit else 0.0

        num_cases_full += 1.0
        if hit:
            ndcg_50_full += gain
            hitrate_50_full += 1.0
        if item_degree <= median_item_degree:
            num_cases_half += 1.0
            if hit:
                ndcg_50_half += gain
                hitrate_50_half += 1.0

    ndcg_50_full /= num_cases_full
    hitrate_50_full /= num_cases_full
    ndcg_50_half /= num_cases_half
    hitrate_50_half /= num_cases_half
    return np.array([ndcg_50_full, ndcg_50_half,
                     hitrate_50_full, hitrate_50_half], dtype=np.float32)

def get_score_LB(test_label, test_pred, train_click, test_click):
    """Score a prediction table against the labels and print the four metrics.

    Parameters
    ----------
    test_label : pandas.DataFrame
        Ground truth with ``user_id`` and ``item_id`` columns.
    test_pred : pandas.DataFrame
        One row per user: first column is the user id, the remaining columns
        are the ranked predicted items. Must have as many rows as
        ``test_label``.
    train_click, test_click : pandas.DataFrame
        Click logs; the number of occurrences of an item in their ``item_id``
        columns is used as that item's degree (0 when the item never occurs).

    Returns
    -------
    numpy.ndarray
        ``[ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half]`` as
        returned by ``evaluate_each_phase``.

    NOTE(review): predicted items are compared to label items with ``!=``, so
    both must use the same type (e.g. not strings vs. integers) - confirm at
    the call site.
    """
    assert len(test_label) == len(test_pred)
    d = dict(pd.concat([train_click['item_id'], test_click['item_id']]).value_counts())
    n = len(test_label)

    y_pred = {test_pred.iloc[i, 0]: list(test_pred.iloc[i, 1:]) for i in range(n)}
    # Read the labels positionally as well: indexing a column with
    # ``series[i]`` is label-based and fails (or picks the wrong row) when the
    # frame's index is not 0..n-1, e.g. after filtering.
    y_true = {
        user: (item, d.get(item, 0))
        for user, item in zip(test_label['user_id'].values, test_label['item_id'].values)
    }

    # ndcg_50_full, ndcg_50_half, hitrate_50_full, hitrate_50_half
    LB = evaluate_each_phase(y_pred, y_true)
    print('[+] NDCG@FULL = {:.6f}, NDCG@HALF = {:.6f}\n[+]  HIT@FULL = {:.6f},  HIT@HALF = {:.6f}'.\
          format(LB[0], LB[1], LB[2], LB[3]))
    return LB





In [3]:
import faiss
def item_df_init(df, vals):
    """Build the lookup tables and the similarity index for one embedding set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``item_id`` column.
    vals : pandas.Series or sequence
        One embedding vector per row of ``df``. A Series is aligned to ``df``
        by index label (pandas column assignment).

    Returns
    -------
    tuple
        ``(df_item_emb, id2item_dic, item2id_vec, index)`` where
        ``df_item_emb`` has the columns ``item_id``, ``item_emb`` and
        ``hash_item_id`` (the row position), ``id2item_dic`` maps row position
        to item id, ``item2id_vec`` maps item id to its un-normalised vector,
        and ``index`` is a ``faiss.IndexFlatIP`` holding the L2-normalised
        vectors (inner product on normalised vectors).

    The embedding dimension is printed as a side effect.
    """
    df_item_emb = pd.DataFrame([])
    df_item_emb['item_id'] = df['item_id']
    df_item_emb['item_emb'] = vals
    # The vectors are added to the index in row order, so the ids it reports
    # are matched by row position (presumably sequential, as for Faiss flat
    # indexes) rather than by whatever index labels `df` happens to carry.
    df_item_emb['hash_item_id'] = np.arange(len(df_item_emb))
    id2item_dic = dict(zip(df_item_emb['hash_item_id'], df_item_emb['item_id']))
    item2id_vec = dict(zip(df_item_emb['item_id'], df_item_emb['item_emb']))

    # Select the column by name: `row[1]` on a label-indexed Series relies on
    # a positional fallback that pandas has deprecated.
    cur_dim = len(df_item_emb['item_emb'].iloc[0])
    print('current dimension: {}'.format(cur_dim))

    index = faiss.IndexFlatIP(cur_dim) # inner product

    # np.array(list(...)) builds a fresh matrix, so normalising it in place
    # leaves the vectors stored in item2id_vec untouched.
    item_embs = np.array(list(df_item_emb['item_emb'].values)).astype('float32')
    faiss.normalize_L2(item_embs)
    index.add(item_embs)
    return df_item_emb, id2item_dic, item2id_vec, index


# Module-level registries: every *_init helper in this notebook appends the
# four artifacts returned by item_df_init to these lists, one entry per
# embedding source.
df_item_emb_list, id2item_dic_list, item2id_vec_list, index_list = [], [], [], []

In [4]:
from glove import Corpus, Glove
def glove_init(all_click):
    """Train GloVe item embeddings on per-user click sequences and index them.

    Clicks are ordered by ``time`` and grouped per user into lists of item ids
    (as strings); a co-occurrence corpus with window 20 is fitted on those
    lists and a 300-component GloVe model is trained on it.

    The four artifacts produced by ``item_df_init`` are appended to the
    module-level registries and also returned as
    ``(df_item_emb, id2item_dic, item2id_vec, index)``.
    """
    clicks = all_click.sort_values('time').reset_index(drop = True)
    clicks['item_id'] = clicks['item_id'].astype(str)
    sequences = clicks.groupby('user_id')['item_id'].agg(list).reset_index()

    corpus = Corpus()
    corpus.fit(sequences['item_id'].values, window=20)

    glove_model = Glove(no_components = 300, learning_rate=0.1,random_state = 10)
    glove_model.fit(corpus.matrix, epochs=100, no_threads=12, verbose=False)
    glove_model.add_dictionary(corpus.dictionary)

    # NOTE(review): assumes the dictionary keys iterate in the same order as
    # the rows of `word_vectors` - confirm against the glove package.
    item_ids = pd.DataFrame(corpus.dictionary.keys(), columns = ['item_id']).astype(int)
    vectors = pd.DataFrame(glove_model.word_vectors).apply(lambda x: np.array(x).astype('float32'), axis = 1)

    df_item_emb, id2item_dic, item2id_vec, index = item_df_init(item_ids, vectors)
    registries = (df_item_emb_list, id2item_dic_list, item2id_vec_list, index_list)
    for registry, artifact in zip(registries, (df_item_emb, id2item_dic, item2id_vec, index)):
        registry.append(artifact)
    return df_item_emb, id2item_dic, item2id_vec, index

def item_feat_full_init(item_feat):
    """Index the items using every feature column of ``item_feat`` as the vector.

    All columns after the first one form the embedding of a row. The artifacts
    from ``item_df_init`` are appended to the module-level registries and
    returned as ``(df_item_emb, id2item_dic, item2id_vec, index)``.
    """
    item_ids = pd.DataFrame(item_feat['item_id']).astype(int)
    feature_block = item_feat.iloc[:, 1:]
    vectors = feature_block.apply(lambda x: np.array(x).astype('float32'), axis = 1)

    df_item_emb, id2item_dic, item2id_vec, index = item_df_init(item_ids, vectors)
    registries = (df_item_emb_list, id2item_dic_list, item2id_vec_list, index_list)
    for registry, artifact in zip(registries, (df_item_emb, id2item_dic, item2id_vec, index)):
        registry.append(artifact)
    return df_item_emb, id2item_dic, item2id_vec, index

def item_feat_text_init(item_feat):
    """Index the items using only the feature columns whose name contains 'txt'.

    The artifacts from ``item_df_init`` are appended to the module-level
    registries and returned as ``(df_item_emb, id2item_dic, item2id_vec, index)``.
    """
    item_ids = pd.DataFrame(item_feat['item_id']).astype(int)
    text_columns = [col for col in item_feat.columns if 'txt' in col]
    vectors = item_feat.loc[:, text_columns].apply(lambda x: np.array(x).astype('float32'), axis = 1)

    df_item_emb, id2item_dic, item2id_vec, index = item_df_init(item_ids, vectors)
    registries = (df_item_emb_list, id2item_dic_list, item2id_vec_list, index_list)
    for registry, artifact in zip(registries, (df_item_emb, id2item_dic, item2id_vec, index)):
        registry.append(artifact)
    return df_item_emb, id2item_dic, item2id_vec, index

from gensim.models import Word2Vec
from gensim.models import FastText


def word2vec_init(all_click):
    """Train skip-gram Word2Vec item embeddings on per-user click sequences.

    Clicks are ordered by ``time`` and grouped per user into lists of item ids
    (as strings), which serve as the training sentences. The artifacts from
    ``item_df_init`` are appended to the module-level registries and returned
    as ``(df_item_emb, id2item_dic, item2id_vec, index)``.

    NOTE(review): the ``size``/``iter`` arguments and ``model.wv.vocab`` look
    like the gensim 3.x API - confirm the installed gensim version.
    """
    clicks = all_click.sort_values('time').reset_index(drop = True)
    clicks['item_id'] = clicks['item_id'].astype(str)
    sequences = clicks.groupby('user_id')['item_id'].agg(list).reset_index()

    model = Word2Vec(sequences['item_id'].values, size=100, window=10, min_count=1, workers=12,
                     seed=1997, iter=50, sg=1, hs=1, compute_loss=True,
                     )

    # One entry per vocabulary word: item id (string) -> learned vector.
    my_dict = {key: model.wv[key] for key in model.wv.vocab}

    item_ids = pd.DataFrame(my_dict.keys(), columns = ['item_id']).astype(int)
    vectors = pd.DataFrame(my_dict.values()).apply(lambda x: np.array(x).astype('float32'), axis = 1)

    df_item_emb, id2item_dic, item2id_vec, index = item_df_init(item_ids, vectors)
    registries = (df_item_emb_list, id2item_dic_list, item2id_vec_list, index_list)
    for registry, artifact in zip(registries, (df_item_emb, id2item_dic, item2id_vec, index)):
        registry.append(artifact)
    return df_item_emb, id2item_dic, item2id_vec, index

def word2vec_w10_init(all_click):
    """Train the Word2Vec item embedding on ``all_click`` and register it.

    This function used to be a line-for-line copy of ``word2vec_init`` (same
    sequence construction and same hyper-parameters, including the window),
    so it now delegates to it instead of duplicating the code. It is kept as
    a separate entry point because callers use it to train on a different
    click set than the one passed to ``word2vec_init``.

    NOTE(review): if this variant is meant to use different hyper-parameters
    than ``word2vec_init``, they have to be introduced here explicitly - the
    two functions were identical.

    Returns ``(df_item_emb, id2item_dic, item2id_vec, index)``; the same four
    artifacts are appended to the module-level registries by
    ``word2vec_init``.
    """
    return word2vec_init(all_click)


def word2vec_split_init(all_click):
    """Train Word2Vec item embeddings on click sequences split by time bucket.

    Works like ``word2vec_init`` except that a user's clicks are first split
    into buckets given by ``floor(time * 10000 - 9800)``; every
    ``(user_id, bucket)`` pair yields its own training sentence.

    The artifacts from ``item_df_init`` are appended to the module-level
    registries and returned as ``(df_item_emb, id2item_dic, item2id_vec, index)``.
    """
    clicks = all_click.sort_values('time').reset_index(drop = True)
    clicks['item_id'] = clicks['item_id'].astype(str)
    clicks['group'] = np.floor(clicks['time'] * 10000 - 9800)
    sequences = clicks.groupby(['user_id','group'])['item_id'].agg(list).reset_index()

    model = Word2Vec(sequences['item_id'].values, size=100, window=10, min_count=1, workers=12,
                     seed=1997, iter=50, sg=1, hs=1, compute_loss=True,
                     )

    # One entry per vocabulary word: item id (string) -> learned vector.
    my_dict = {key: model.wv[key] for key in model.wv.vocab}

    item_ids = pd.DataFrame(my_dict.keys(), columns = ['item_id']).astype(int)
    vectors = pd.DataFrame(my_dict.values()).apply(lambda x: np.array(x).astype('float32'), axis = 1)

    df_item_emb, id2item_dic, item2id_vec, index = item_df_init(item_ids, vectors)
    registries = (df_item_emb_list, id2item_dic_list, item2id_vec_list, index_list)
    for registry, artifact in zip(registries, (df_item_emb, id2item_dic, item2id_vec, index)):
        registry.append(artifact)
    return df_item_emb, id2item_dic, item2id_vec, index

In [5]:
def partial_res(history_click, item2id_vec, id2item_dic, index, query_user, item_set):
    """Recall candidate items for one user from one embedding index.

    Parameters
    ----------
    history_click : pandas.DataFrame
        The user's clicks (``item_id`` and ``time`` columns) in the order in
        which they should be weighted: the i-th row gets weight ``1 / (i + 1)``.
    item2id_vec : dict
        ``item_id -> embedding vector``; history items without a vector are
        ignored.
    id2item_dic : dict
        ``index id -> item_id`` for the ids returned by ``index.search``.
    index : object
        Index exposing ``search(vectors, k)`` and returning ``(D, I)``
        similarity / id matrices.
    query_user : scalar
        Written to the ``user_id`` column of the result.
    item_set : set
        Items that must not be recommended.

    Returns
    -------
    pandas.DataFrame or None
        Up to 500 rows with the columns ``item_id``, ``sim`` and ``user_id``,
        sorted by descending accumulated score; ``None`` when no history item
        has an embedding.
    """
    ratio_up, ratio_base = 1, 1
    pos_up = -0.5
    topk = 100
    recall_num = 500

    known_history = [
        [item, click_time]
        for item, click_time in history_click[['item_id', 'time']].values.tolist()
        if item in item2id_vec
    ]
    if not known_history:
        return None

    vecs = np.stack([item2id_vec[item] for item, _ in known_history], axis=0).astype('float32')
    faiss.normalize_L2(vecs)
    D, I = index.search(vecs, topk)

    cur_dic = {}
    for i in range(len(known_history)):
        # Earlier rows of the history weigh more.
        ratio = ratio_up * 1.0 / (i + ratio_base)
        for k, idx in enumerate(I[i, :]):
            # When the index holds fewer than `topk` vectors the result is
            # padded with -1 ids; those are not real neighbours.
            if idx < 0:
                continue
            cur_item = id2item_dic[idx]
            if cur_item in item_set:
                continue
            # Damp neighbours by their position in the result list.
            pos_score = ratio * ((k + 1) ** pos_up)
            if cur_item in cur_dic:
                cur_dic[cur_item] += D[i, k] * pos_score
            else:
                cur_dic[cur_item] = D[i, k] * pos_score

    cur_sorted = sorted(cur_dic.items(), key = lambda x: x[1], reverse = True)
    rec_df = pd.DataFrame(cur_sorted[:recall_num],columns = ['item_id', 'sim'])
    rec_df['user_id'] = query_user
    return rec_df

In [6]:
# Test-set recall: for every phase, build the six item embeddings, recall
# candidates for each query user from every embedding, blend the per-source
# scores and write one CSV per phase.
now_phase = 9
file_save_path = path + '/local_result/'

for phase in range(7, now_phase + 1):
    # The *_init helpers append to these module-level registries, so they are
    # reset at the start of every phase.
    df_item_emb_list = []
    id2item_dic_list = []
    item2id_vec_list = []
    index_list = []
    print(f"phase:{phase}")
    item_feat, item_feat_dict, all_click = local_constr(phase, now_phase)
    query = pd.read_csv(test_path + '/underexpose_test_qtime-{}.csv'.format(phase), names=['user_id', 'time'])
    train_click = pd.read_csv(train_path + '/underexpose_train_click-{}.csv'.format(phase), names=['user_id', 'item_id', 'time'])
    test_click = pd.read_csv(test_path + '/underexpose_test_click-{}.csv'.format(phase), names=['user_id', 'item_id', 'time'])
    phase_click = pd.concat([train_click, test_click], sort = False, axis = 0).reset_index(drop = True)

    # The order of these calls fixes the positions in the registries and
    # therefore the meaning of df_list[0..5] below.
    print("1. glove.")
    df_item_emb, id2item_dic, item2id_vec, index = glove_init(all_click)
    print("2. item all.")
    df_item_emb, id2item_dic, item2id_vec, index = item_feat_full_init(item_feat)
    print("3. text all.")
    df_item_emb, id2item_dic, item2id_vec, index = item_feat_text_init(item_feat)
    print('4. word2vec.')
    df_item_emb, id2item_dic, item2id_vec, index = word2vec_init(phase_click)
    print('5. word2vec 10w.')
    df_item_emb, id2item_dic, item2id_vec, index = word2vec_w10_init(all_click)
    print('6. word2vec split.')
    df_item_emb, id2item_dic, item2id_vec, index = word2vec_split_init(all_click)

    # Split the click log by user once. Filtering the whole log with a boolean
    # mask for every query user rescans all rows each time.
    clicks_by_user = {user: clicks for user, clicks in all_click.groupby('user_id')}
    no_clicks = all_click.iloc[0:0]

    # Collect the per-user results in lists and concatenate once per source
    # (DataFrame.append in a loop is quadratic and was removed in pandas 2.0).
    recall_parts = [[] for _ in index_list]
    for query_user, query_time in tqdm(query.values.tolist()):
        user_clicks = clicks_by_user.get(query_user, no_clicks)
        # Most recent click first, restricted to clicks up to the query time.
        history_click = user_clicks.loc[user_clicks['time'] <= query_time]
        history_click = history_click.sort_values('time', ascending=False)
        # Every item the user ever clicked is excluded from the candidates.
        item_set = set(user_clicks['item_id'])
        for i, parts in enumerate(recall_parts):
            tmp = partial_res(history_click, item2id_vec_list[i], id2item_dic_list[i],
                              index_list[i], query_user, item_set)
            # partial_res returns None when no history item has an embedding.
            if tmp is not None:
                parts.append(tmp)
    df_list = [pd.concat(parts) if parts else pd.DataFrame([]) for parts in recall_parts]

    res_glove = df_list[0]
    res_item_all = df_list[1]
    res_item_txt = df_list[2]
    res_w2v = df_list[3]
    res_w2v_10w = df_list[4]
    res_w2v_gr = df_list[5]

    # Outer-join the six candidate lists on (user, item); a candidate missing
    # from a source gets NaN for that source's score.
    res1 = pd.merge(res_glove.rename(columns={"sim":"sim_glove"}),
                res_item_all.rename(columns={"sim":"sim_all"}), on = ['user_id', 'item_id'], how = 'outer')
    res1 = pd.merge(res1, res_item_txt.rename(columns={"sim":"sim_txt"}), on = ['user_id', 'item_id'], how = 'outer')
    res1 = pd.merge(res1, res_w2v.rename(columns={"sim":"sim_w2v"}), on = ['user_id', 'item_id'], how = 'outer')
    res1 = pd.merge(res1, res_w2v_10w.rename(columns={"sim":"sim_w2v_10w"}), on = ['user_id', 'item_id'], how = 'outer')
    res1 = pd.merge(res1, res_w2v_gr.rename(columns={"sim":"sim_w2v_gr"}), on = ['user_id', 'item_id'], how = 'outer')

    # Min-max scale every score per user. A user whose scores are all equal
    # yields 0/0 = NaN, which the fillna below turns into 0 together with the
    # scores missing from the outer join.
    for col in ['sim_glove', 'sim_all', 'sim_txt', 'sim_w2v', 'sim_w2v_10w', 'sim_w2v_gr']:
        min_score = res1.groupby(['user_id'])[col].transform('min')
        max_score = res1.groupby(['user_id'])[col].transform('max')
        res1[col] = \
        (res1[col] - min_score)/(max_score - min_score)
    res1 = res1.fillna(0)
    res1['score'] = \
            (res1['sim_glove'] + res1['sim_all'] + res1['sim_txt']) * 0.3 + \
            (res1['sim_w2v'] + res1['sim_w2v_10w']) * 0.7 + res1['sim_w2v_gr'] * 0.2
    print("save result:")
    # makedirs also creates missing parent directories and does not fail when
    # the directory already exists.
    os.makedirs(file_save_path, exist_ok=True)
    res1.to_csv(file_save_path + f'/hybrid_emb_update_v3_test_phase{phase}.csv', index = None)

phase:7
(1243109, 3)
(1243109, 3)
1. glove.
current dimension: 300
2. item all.
current dimension: 256
3. text all.
current dimension: 128
4. word2vec.
current dimension: 300
5. word2vec 10w.
current dimension: 300
6. word2vec split.
current dimension: 300


100%|██████████| 1701/1701 [08:15<00:00,  3.54it/s]


save result:
phase:8
(1243109, 3)
(1243109, 3)
1. glove.
current dimension: 300
2. item all.
current dimension: 256
3. text all.
current dimension: 128
4. word2vec.
current dimension: 300
5. word2vec 10w.
current dimension: 300
6. word2vec split.
current dimension: 300


100%|██████████| 1716/1716 [08:39<00:00,  3.74it/s]


save result:
phase:9
(1243109, 3)
(1243109, 3)
1. glove.
current dimension: 300
2. item all.
current dimension: 256
3. text all.
current dimension: 128
4. word2vec.
current dimension: 300
5. word2vec 10w.
current dimension: 300
6. word2vec split.
current dimension: 300


100%|██████████| 1651/1651 [08:28<00:00,  4.14it/s]


save result:


In [8]:
def local_train_constr(phase, now_phase):
    """Build the offline (hold-out) training split for ``phase``.

    The last click of every user in ``phase`` (after sorting by time) is held
    out as the answer. Every click of a held-out ``(user_id, item_id)`` pair is
    then removed both from the phase's clicks and from the clicks of all
    phases ``0..now_phase``.

    Parameters
    ----------
    phase : int
        Phase whose users' last clicks become the answers.
    now_phase : int
        Last phase (inclusive) loaded into the full click log.

    Returns
    -------
    tuple
        ``(item_feat, item_feat_dict, whole_click, all_click, train_answer)``:
        the item feature table and its dict form, the full click log without
        the answer pairs (sorted by user and time), the phase click log
        without the answer pairs, and the held-out answers.

    Shapes of the click logs before and after removing the answers are
    printed as a side effect.
    """
    click_columns = ['user_id', 'item_id', 'time']
    pair_key = ['user_id', 'item_id']

    def _read_phase_clicks(phase_id):
        # Train and test click logs of one phase, stacked in that order.
        click_train = pd.read_csv(train_path + '/underexpose_train_click-{}.csv'.format(phase_id), header=None,  names=click_columns)
        click_test = pd.read_csv(test_path + '/underexpose_test_click-{}.csv'.format(phase_id), header=None,  names=click_columns)
        return pd.concat([click_train, click_test])

    def _drop_pairs(clicks, pairs):
        # Remove every row whose (user_id, item_id) is one of `pairs`.
        indexed = clicks.set_index(pair_key)
        return indexed[~indexed.index.isin(pairs)].reset_index()

    # Read all phases first and concatenate once (DataFrame.append in a loop
    # is quadratic and was removed in pandas 2.0).
    phase_frames = [
        _read_phase_clicks(c).sort_values(['user_id', 'time']).reset_index(drop = True)
        for c in range(0, now_phase + 1)
    ]
    whole_click = pd.concat(phase_frames) if phase_frames else pd.DataFrame()

    all_click = _read_phase_clicks(phase).reset_index(drop = True)
    print("phase series:", all_click.shape)
    all_click = all_click.sort_values(['user_id', 'time']).reset_index(drop = True)
    train_answer = all_click.groupby(['user_id']).last().reset_index()
    answer_pairs = train_answer.set_index(pair_key).index
    all_click = _drop_pairs(all_click, answer_pairs)
    print("phase series:", all_click.shape)

    whole_click = whole_click.drop_duplicates(click_columns).reset_index(drop = True)
    print("all click:",whole_click.shape)
    whole_click = _drop_pairs(whole_click, answer_pairs)
    whole_click = whole_click.sort_values(['user_id', 'time']).reset_index(drop = True)
    print("all click:",whole_click.shape)

    item_feat = pd.read_csv('/root/kdd_cup_2020/KDD_online_LB/data/item_feat_normalize.csv')
    item_feat_dict = item_embedding_dict(item_feat)
    return item_feat, item_feat_dict, whole_click, all_click, train_answer

In [9]:
# Offline training recall: same pipeline as the test cell, but the query users
# and times come from the held-out last click of every user in the phase.
# NOTE: `file_save_path` is defined in the test-recall cell above.
now_phase = 9
for phase in range(7, now_phase + 1):
    # The *_init helpers append to these module-level registries, so they are
    # reset at the start of every phase.
    df_item_emb_list = []
    id2item_dic_list = []
    item2id_vec_list = []
    index_list = []
    print(f"train phase:{phase}")
    item_feat, item_feat_dict, all_click, phase_click, train_answer = local_train_constr(phase, now_phase)
    query = train_answer[['user_id', 'time']]

    # The order of these calls fixes the positions in the registries and
    # therefore the meaning of df_list[0..5] below.
    print("1. glove.")
    df_item_emb, id2item_dic, item2id_vec, index = glove_init(all_click)
    print("2. item all.")
    df_item_emb, id2item_dic, item2id_vec, index = item_feat_full_init(item_feat)
    print("3. text all.")
    df_item_emb, id2item_dic, item2id_vec, index = item_feat_text_init(item_feat)
    print('4. word2vec.')
    df_item_emb, id2item_dic, item2id_vec, index = word2vec_init(phase_click)
    print('5. word2vec 10w.')
    df_item_emb, id2item_dic, item2id_vec, index = word2vec_w10_init(all_click)
    print('6. word2vec split.')
    df_item_emb, id2item_dic, item2id_vec, index = word2vec_split_init(all_click)

    # Split the click log by user once. Filtering the whole log with a boolean
    # mask for every query user rescans all rows each time.
    clicks_by_user = {user: clicks for user, clicks in all_click.groupby('user_id')}
    no_clicks = all_click.iloc[0:0]

    # Collect the per-user results in lists and concatenate once per source
    # (DataFrame.append in a loop is quadratic and was removed in pandas 2.0).
    recall_parts = [[] for _ in index_list]
    for query_user, query_time in tqdm(query.values.tolist()):
        user_clicks = clicks_by_user.get(query_user, no_clicks)
        # Most recent click first, restricted to clicks up to the query time.
        history_click = user_clicks.loc[user_clicks['time'] <= query_time]
        history_click = history_click.sort_values('time', ascending=False)
        # Every item the user ever clicked is excluded from the candidates.
        item_set = set(user_clicks['item_id'])
        for i, parts in enumerate(recall_parts):
            tmp = partial_res(history_click, item2id_vec_list[i], id2item_dic_list[i],
                              index_list[i], query_user, item_set)
            # partial_res returns None when no history item has an embedding.
            if tmp is not None:
                parts.append(tmp)
    df_list = [pd.concat(parts) if parts else pd.DataFrame([]) for parts in recall_parts]

    res_glove = df_list[0]
    res_item_all = df_list[1]
    res_item_txt = df_list[2]
    res_w2v = df_list[3]
    res_w2v_10w = df_list[4]
    res_w2v_gr = df_list[5]

    # Outer-join the six candidate lists on (user, item); a candidate missing
    # from a source gets NaN for that source's score.
    res1 = pd.merge(res_glove.rename(columns={"sim":"sim_glove"}),
                res_item_all.rename(columns={"sim":"sim_all"}), on = ['user_id', 'item_id'], how = 'outer')
    res1 = pd.merge(res1, res_item_txt.rename(columns={"sim":"sim_txt"}), on = ['user_id', 'item_id'], how = 'outer')
    res1 = pd.merge(res1, res_w2v.rename(columns={"sim":"sim_w2v"}), on = ['user_id', 'item_id'], how = 'outer')
    res1 = pd.merge(res1, res_w2v_10w.rename(columns={"sim":"sim_w2v_10w"}), on = ['user_id', 'item_id'], how = 'outer')
    res1 = pd.merge(res1, res_w2v_gr.rename(columns={"sim":"sim_w2v_gr"}), on = ['user_id', 'item_id'], how = 'outer')

    # Min-max scale every score per user. A user whose scores are all equal
    # yields 0/0 = NaN, which the fillna below turns into 0 together with the
    # scores missing from the outer join.
    for col in ['sim_glove', 'sim_all', 'sim_txt', 'sim_w2v', 'sim_w2v_10w', 'sim_w2v_gr']:
        min_score = res1.groupby(['user_id'])[col].transform('min')
        max_score = res1.groupby(['user_id'])[col].transform('max')
        res1[col] = \
        (res1[col] - min_score)/(max_score - min_score)
    res1 = res1.fillna(0)
    res1['score'] = \
            (res1['sim_glove'] + res1['sim_all'] + res1['sim_txt']) * 0.3 + \
            (res1['sim_w2v'] + res1['sim_w2v_10w']) * 0.7 + res1['sim_w2v_gr'] * 0.2
    print("save result:")
    # makedirs also creates missing parent directories and does not fail when
    # the directory already exists.
    os.makedirs(file_save_path, exist_ok=True)
    res1.to_csv(file_save_path + f'/hybrid_emb_update_v3_train_phase{phase}.csv', index = None)

#     break

train phase:7
phase series: (294763, 3)
phase series: (275093, 3)
all click: (1243109, 3)
all click: (1218677, 3)
1. glove.
current dimension: 300
2. item all.
current dimension: 256
3. text all.
current dimension: 128
4. word2vec.
current dimension: 300
5. word2vec 10w.
current dimension: 300
6. word2vec split.
current dimension: 300


100%|██████████| 19670/19670 [2:07:07<00:00,  2.21it/s]  


save result:
