<a href="https://colab.research.google.com/github/ZhouNLP/tcnlp/blob/master/lstm_model/word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

使用gensim训练word2vec词向量的代码，除了gensim参数以外，跟天池论坛的基本一致

https://tianchi.aliyun.com/notebook-ai/detail?spm=5176.12586969.1002.18.64065cbbRfhdqJ&postId=118268



In [None]:
import logging
import random
import time
# import sys
# sys.path.append('/home/aistudio/external-libraries')
import numpy as np


# Emit INFO-level records prefixed with a timestamp and the level name.
logging.basicConfig(
    format='%(asctime)-15s %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Fix both RNG seeds so shuffling is repeatable from run to run.
seed = 666
np.random.seed(seed)
random.seed(seed)
# torch.cuda.manual_seed(seed)
# torch.manual_seed(seed)

In [None]:
# Split the training data into 10 folds.
fold_num = 10  # number of folds
data_file = 'train_set.csv'  # read by all_data2fold() as a tab-separated file
import pandas as pd


def all_data2fold(fold_num, num=200000, path=None):
    """Split the labelled corpus into ``fold_num`` label-stratified folds.

    The first ``num`` rows of the tab-separated file are shuffled and grouped
    by label; each label's samples are then dealt out to the folds in
    contiguous, non-overlapping slices. Finally every fold is trimmed or
    topped up to exactly ``total // fold_num`` samples and shuffled again.

    Args:
        fold_num: Number of folds to produce.
        num: Maximum number of leading rows of the file to use.
        path: Optional path of the CSV file (``label`` and ``text`` columns).
            When ``None`` the module-level ``data_file`` is read.

    Returns:
        A list of ``fold_num`` dicts, each holding parallel ``'label'`` and
        ``'text'`` lists.

    Note:
        When the row count is not a multiple of ``fold_num``, the
        ``total % fold_num`` surplus samples are not placed in any fold.
    """
    source = data_file if path is None else path
    frame = pd.read_csv(source, sep='\t', encoding='UTF-8')
    texts = frame['text'].tolist()[:num]
    labels = frame['label'].tolist()[:num]
    total = len(labels)

    # Shuffle once up front so each label's samples end up in random order.
    order = list(range(total))
    np.random.shuffle(order)
    all_texts = [texts[i] for i in order]
    all_labels = [labels[i] for i in order]

    # Map each label to the positions (in shuffled order) of its samples.
    label2id = {}
    for position, label in enumerate(all_labels):
        label2id.setdefault(str(label), []).append(position)

    # Deal every label's samples out to the folds. A running offset keeps the
    # slices disjoint: the first `remainder` folds take one extra sample, so
    # a fixed `fold * base_size` start would overlap and drop the tail.
    all_index = [[] for _ in range(fold_num)]
    for positions in label2id.values():
        base_size, remainder = divmod(len(positions), fold_num)
        offset = 0
        for fold in range(fold_num):
            size = base_size + 1 if fold < remainder else base_size
            all_index[fold].extend(positions[offset:offset + size])
            offset += size

    # Equalise fold sizes. Folds that received extras come first, so the
    # spare pool is always filled before a smaller fold draws from it.
    fold_size = total // fold_num
    spare_texts = []
    spare_labels = []
    spare_start = 0
    fold_data = []
    for fold in range(fold_num):
        fold_texts = [all_texts[i] for i in all_index[fold]]
        fold_labels = [all_labels[i] for i in all_index[fold]]
        count = len(fold_labels)

        if count > fold_size:
            spare_texts.extend(fold_texts[fold_size:])
            spare_labels.extend(fold_labels[fold_size:])
            fold_texts = fold_texts[:fold_size]
            fold_labels = fold_labels[:fold_size]
        elif count < fold_size:
            spare_end = spare_start + fold_size - count
            fold_texts = fold_texts + spare_texts[spare_start:spare_end]
            fold_labels = fold_labels + spare_labels[spare_start:spare_end]
            spare_start = spare_end

        assert fold_size == len(fold_labels)

        # Shuffle inside the fold so samples are not ordered by label.
        index = list(range(fold_size))
        np.random.shuffle(index)
        data = {
            'label': [fold_labels[i] for i in index],
            'text': [fold_texts[i] for i in index],
        }
        fold_data.append(data)

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))

    return fold_data


# Build the folds using the fold count configured at the top of this cell.
fold_data = all_data2fold(fold_num)

In [None]:
# Gather the texts of the first `fold_id` folds as the word2vec corpus.
fold_id = 10

train_texts = []
for fold_index in range(fold_id):
    train_texts.extend(fold_data[fold_index]['text'])

logging.info('Total %d docs.' % len(train_texts))

In [None]:
# Also load test set A (tab-separated); its texts are added to the corpus.
test_pd = pd.read_csv('test_a.csv', sep='\t', encoding='UTF-8')

In [None]:
# Append every test-set text to the word2vec training corpus.
train_texts.extend(test_pd['text'])
logging.info('Total %d docs.' % len(train_texts))

In [None]:
# Callback added so the loss can be watched during training. It appears to be
# buggy: after many epochs the reported loss becomes 0.

from gensim.models.callbacks import CallbackAny2Vec
class callback(CallbackAny2Vec):
    '''Report the training loss at the end of every epoch.

    The value read from the model is treated as a running total, so the
    previous reading is subtracted to obtain the figure for one epoch.
    Each report is appended to ``w2vlog.txt`` and printed with a timestamp.
    '''

    def __init__(self):
        self.loss_to_be_subed = 0  # reading taken at the previous epoch end
        self.epoch = 1             # 1-based number used in the report

    def on_epoch_end(self, model):
        cumulative = model.get_latest_training_loss()
        epoch_loss = cumulative - self.loss_to_be_subed
        self.loss_to_be_subed = cumulative

        message = 'Loss after epoch {}: {}'.format(self.epoch, epoch_loss)
        with open('w2vlog.txt', 'a+') as log_file:
            log_file.write(message + '\n')
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), message)

        self.epoch += 1

In [None]:
logging.info('Start training...')
# Raise the root logger threshold to WARNING from here on.
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
from gensim.models.word2vec import Word2Vec

num_features = 200     # Word vector dimensionality
num_workers = 12       # Number of threads to run in parallel

# Tokenise every document on whitespace.
train_texts_ = [text.split() for text in train_texts]

# Skip-gram (sg=1) with negative sampling (hs=0), trained for 10 epochs.
# The original author reports roughly 4 hours on a 12-core CPU.
# NOTE(review): `size`/`iter` look like gensim 3.x keyword names - confirm
# against the installed gensim version.
model = Word2Vec(
    train_texts_,
    sg=1,
    hs=0,
    window=10,
    size=num_features,
    iter=10,
    workers=num_workers,
    compute_loss=True,
    callbacks=[callback()],
)
model.init_sims(replace=True)

# Persist the trained model.
model.save("./word2vec.bin")

In [None]:
# Reload the model previously saved to ./word2vec.bin.
model = Word2Vec.load("./word2vec.bin")

# Export the word vectors to ./word2vec.txt in the text (binary=False) format.
model.wv.save_word2vec_format('./word2vec.txt', binary=False)