In [124]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import pandas as pd
import os

In [2]:
os.path.abspath('..')

'/Users/wenyi/Desktop/个人/学习/常用算法'

In [4]:
path = os.path.abspath('..')
rating_path = os.path.join(path, 'data/ml-1m/ratings.dat')
rating = pd.read_csv(rating_path, sep='::', names=['user','item', 'ratings', 'timestamp'])

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
rating.head()

Unnamed: 0,user,item,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [176]:
class SkipGramModel(nn.Module):
    """
    Skip gram model of word2vec
    """
    def __init__(self, vocab_size, embed_dim):
        """
        Init parameter
        vocab_size: word numberv of vocabulary
        embed_dim: Embedding dimension, typically from 50 to 500
        """
        super(SkipGramModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        
        # input embedding project input word to a vector
        self.input_embed = nn.Embedding(vocab_size, embed_dim, sparse=True)
        # output embedding project output word to a vector
        self.output_embed = nn.Embedding(vocab_size, embed_dim, sparse=True)
        
    def init_embed(self):
        """
        Init embedding weight
        input_embed is a uniform distribution is [-0.5/vocab_size, 0.5/vocab_size]
        """
        init_range = 0.5/self.vocab_size
        self.input_embed.weight.data.uniform_(-init_range, init_range)
        self.output_embed.weight.data.uniform_(-0,0)
        
    def forward(self, pos_center_word, pos_neighbor_word, neg_neighbor_word):
        """
        pos_center_word: list of center word id [1,3,5,...]
        pos_neighbor_word: list of neighbor word id [3,5,6,...]
        neg_neighbor_word: list[list] of negative neigbor word by negative sample the inner list is every
        pos_neighbor_word sample n negative neighbor word [[2,4,5,7,8],[1,2,3,4,6],[1,2,3,4,5],...]
        """
        # input word(center word) embeding
        pos_center_embed = self.input_embed(pos_center_word)
        # caculate positive sample loss
        pos_neighbor_embed = self.output_embed(pos_neighbor_word)
        pos_score = torch.mul(pos_center_embed, pos_neighbor_embed)
        pos_score = torch.sum(pos_score, dim=-1)
        pos_score = F.logsigmoid(pos_score)
        
        # caculate negative sample loss
        # neg_neighbor_embed is a three dimension matrix different to pos_neighbor_embed
        neg_neighbor_embed = self.output_embed(neg_neighbor_word) 
        neg_score = torch.bmm(neg_neighbor_embed, pos_center_embed.unsqueeze(2)).squeeze()
        neg_score = F.logsigmoid(-1*neg_score)
        
        # loss
        loss = -1*(torch.sum(pos_score) + torch.sum(neg_score))
        return loss

In [187]:
class Word2Vec:
    """
    Word2vec model
    """
    def __init__(self, data,batch_size=128 ,iters=10,learning_rate=0.01,embeding_dim=100,windows_size=5, min_count=5):
        """
        data: list[list] train data word id list for sentence
        embeding_dim: embedding dimension
        windows size: int, windows number for search neighbor word of a given center word
        min_count: int, filter word if word frequency less than min_count 
        """
        self.data = data
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embeding_dim = embeding_dim
        self.windows_size = windows_size
        self.min_count = min_count
        self.iters = iters
        self.word2id = dict()
        self.id2word = dict()
        self.wordfrequency = self.dataprocess(data)
        self.vocab_size = len(self.wordfrequency)
        self.sample_table = self.init_sample_table()
        self.model = SkipGramModel(self.vocab_size+1, self.embeding_dim)
        self.optim = optim.SGD(self.model.parameters(), self.learning_rate)
    
    def dataprocess(self, data):
        """
        filter word frequency less than min count and than construct a new word frequency dict
        note:
            the new word frequency dict is use the new word2id index project and we must save the
            word2id and id2word project dict
        """
        # count the data word frequency
        wordfrequency = dict()
        # filter word and use the new index to construct the dict
        word_frequency = dict()
        
        for sentence in data:
            for word in sentence:
                if word not in wordfrequency:
                    wordfrequency[word] = 1
                else:
                    wordfrequency[word] += 1
                    
        idx = 0
        for word, val in wordfrequency.items():
            if val < self.min_count:
                continue
            self.word2id[word] = val
            self.id2word[idx] = word
            word_frequency[idx] = val
            idx += 1
        return word_frequency
    
    def init_sample_table(self):
        """
        init teh sample tabel for negative sample 
        note:
            sample_table_size is 10e8 like the word2vec doc and sample ratio is also use the offical paper
        """
        sample_table_size = 10e8
        sample_table = []
        pow_frequency = np.array(list(self.wordfrequency.values()))**0.75
        ratio = pow_frequency/sum(pow_frequency)
        count = np.round(ratio*sample_table_size)
        for i, c in enumerate(count):
            sample_table += [i] * int(c)
        sample_tabel = np.array(sample_table)
        return sample_table
        
    def generate_train_data(self):
        """
        generate the train data like (center word, neighbor word) pairs
        """
        train_data = []
        for sentence in self.data:
            for i, word_u in enumerate(sentence):
                for j, word_v in enumerate(sentence[max(i-self.windows_size,0):i+self.windows_size]):
                    if i == j:
                        continue
                    train_data.append((word_u, word_v))
        return train_data
    
    def negative_sample_batch(self):
        """
        the negative sample without ignore the target word(may be the negative sample word is the same
        to the neighbor word)
        """
        return np.random.choice(self.sample_table, size=(self.batch_size, 5)).tolist()
    
    def train_batch(self, train_data):
        """
        a generator for generate the train batch like [(center_word, neighbor_word),(center_word, neighbor_word)]
        """
        iters = len(train_data)//self.batch_size + 1
        for i in range(iters-1):
            start = i*self.batch_size
            end = (i+1)*self.batch_size
            yield train_data[start:end]
    
    def train(self):
        for k in range(self.iters):
            train_data = self.generate_train_data()
            dataloader = self.train_batch(train_data)
            for i, data in enumerate(dataloader):
                pos_centor_word = Variable(torch.LongTensor([pair[0] for pair in data]))
                pos_neighbor_word = Variable(torch.LongTensor([pair[1] for pair in data]))
                neg_neighbor_word = Variable(torch.LongTensor(self.negative_sample_batch()))
                self.optim.zero_grad()
                loss = self.model(pos_centor_word, pos_neighbor_word, neg_neighbor_word)
                loss.backward()
                self.optim.step()
            print("Epoch %d is finished" %(k+1))

In [188]:
# 测试数据
data = np.random.randint(1,50,size=(1000,8)).tolist()

In [189]:
word2vec = Word2Vec(data)

In [190]:
%%time
word2vec.train()

Epoch 1 is finished
Epoch 2 is finished
Epoch 3 is finished
Epoch 4 is finished
Epoch 5 is finished
Epoch 6 is finished
Epoch 7 is finished
Epoch 8 is finished
Epoch 9 is finished
Epoch 10 is finished
CPU times: user 1min 19s, sys: 690 ms, total: 1min 19s
Wall time: 27.5 s


In [194]:
# the vecotr of every word after training 
word2vec.model.input_embed.weight

Parameter containing:
tensor([[-0.7634, -0.5300,  0.3788,  ..., -0.0918,  0.1975, -0.7149],
        [ 0.6912, -0.1294,  0.2411,  ...,  0.6027,  0.2645,  0.0246],
        [-0.0079,  0.6770,  0.3081,  ...,  0.1047, -0.3630, -0.7708],
        ...,
        [ 1.0373,  0.1653,  0.4157,  ...,  0.5860, -0.7653, -0.2151],
        [-0.1928,  0.0902,  0.5079,  ...,  0.5305,  1.3487,  0.0293],
        [-0.4178,  0.5648,  1.0892,  ...,  0.5870, -0.8406, -0.6159]],
       requires_grad=True)