In [23]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /home/chiyi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
##加载所有数据
from nltk.corpus import wordnet
import torch

def load_node():
    """加载所有的词节点"""
    nodes = []
    for synset in wordnet.all_synsets():
        nodes.append(synset.name())

        ##加载所有可能的同义词
    lemmas = list(wordnet.all_lemma_names())
    # 去重
    node = set(nodes + lemmas)
    node = list(node)
    #编码
    node = {name: i for i, name in enumerate(node)}
    
    return node

node = load_node()
len(node), node['dog.n.01']

(264965, 115454)

In [25]:
def load_edge():
    #所有的关系
    edge_fs = [
        'synsets', 'hypernyms', 'hyponyms', 'instance_hyponyms',
        'member_meronyms', 'part_meronyms', 'topic_domains', 'usage_domains',
        'region_domains', 'attributes', 'entailments', 'causes', 'also_sees',
        'verb_groups', 'similar_tos', 'lemma_names'
    ]

    edge = []

    #全体词表
    for n1 in node.keys():

        #遍历同义词
        for synset in wordnet.synsets(n1):
            n2 = synset.name().split('.')[0]

            #添加同义词关系
            edge.append((n1, n2))

            #遍历所有可能的关系
            for f in edge_fs:
                #同义词刚刚已经处理过了,这里不再重复
                if f == 'synsets':
                    continue

                #调用同义词上的edge函数
                for n2 in getattr(synset, f)():
                    if f == 'lemma_names':
                        n2 = n2.lower()
                    else:
                        n2 = n2.name().split('.')[0]
                    edge.append((n1, n2))

    # #删除所有自己到自己的关联
    # #编码
    # edge = [(node[i[0]], node[i[1]]) for i in edge if i[0] != i[1]]

    return edge


edge = load_edge()

len(edge), edge[:15]

(1347037,
 [('joseph_emerson_worcester', 'worcester'),
  ('joseph_emerson_worcester', 'worcester'),
  ('joseph_emerson_worcester', 'joseph_emerson_worcester'),
  ('to_wit', 'namely'),
  ('to_wit', 'namely'),
  ('to_wit', 'viz.'),
  ('to_wit', 'that_is_to_say'),
  ('to_wit', 'to_wit'),
  ('to_wit', 'videlicet'),
  ('moving_stairway', 'escalator'),
  ('moving_stairway', 'stairway'),
  ('moving_stairway', 'escalator'),
  ('moving_stairway', 'moving_staircase'),
  ('moving_stairway', 'moving_stairway'),
  ('realtor', 'realtor')])

In [26]:
###出现部分缩写单词没有在收录之中

Miss = []
for n1, n2 in edge:
    if n1 not in node :
        Miss.append(n1)
    if n2 not in node :
        Miss.append(n2)
Miss = set(Miss)

print("缺失的节点:", sorted(Miss))
##出现类似格式错乱的词，选择移除
##编码
edge = [(node[n1], node[n2]) for n1,n2 in edge if n1 and n2 not in Miss and n1 != n2]
edge

缺失的节点: ['', 'ibid', 'lake_st', 'st', 'wrangell-st']


[(0, 27498),
 (0, 27498),
 (4, 57725),
 (4, 57725),
 (4, 257971),
 (4, 242245),
 (4, 222969),
 (5, 161147),
 (5, 22424),
 (5, 161147),
 (5, 186763),
 (6, 189501),
 (6, 232342),
 (7, 183988),
 (9, 133809),
 (9, 1893),
 (10, 139716),
 (11, 99538),
 (12, 67706),
 (12, 1336),
 (12, 6568),
 (12, 208236),
 (12, 249266),
 (12, 240818),
 (12, 249266),
 (12, 144122),
 (12, 67706),
 (12, 56810),
 (12, 31031),
 (12, 216008),
 (12, 259745),
 (12, 231792),
 (12, 6353),
 (12, 231792),
 (12, 67706),
 (12, 15486),
 (12, 113588),
 (12, 197512),
 (12, 240047),
 (12, 15486),
 (12, 253092),
 (12, 67706),
 (12, 208236),
 (12, 227680),
 (12, 53717),
 (12, 74658),
 (12, 62214),
 (12, 263269),
 (12, 53717),
 (12, 225521),
 (12, 75234),
 (12, 161829),
 (12, 67706),
 (12, 25427),
 (12, 114615),
 (12, 25427),
 (12, 38172),
 (13, 94074),
 (13, 252975),
 (13, 9777),
 (17, 56668),
 (17, 142377),
 (17, 185023),
 (17, 187450),
 (17, 248779),
 (17, 240515),
 (18, 52001),
 (18, 187512),
 (18, 214823),
 (20, 40862),
 (2

In [27]:
import torch
import random
##构建负采样集

edge_Set = set(edge)  # 使用集合去重
def get_batch(batch_size = 128):
    """从边集中随机采样负样本"""
    neg = []
    pos =[]
    sample = random.sample(edge, batch_size)  #  随机选择4条边
    ##  一个正节点对应两个负样本
    for i in sample:
        n1, n2 = i
        # 将正样本添加到正样本列表中
        pos.append((n1, n2))
        pos.append((n1, n2))#构建无序
        # 随机选择一个节点作为负样本
        while (n1, n2) in edge_Set:
            # neg_node = random.choice(list(node.values()))##当出现数据量大的时候会拖慢
            neg_node = random.randint(0,len(node)-1)
            # print(neg_node)
            if neg_node != n2 and neg_node != n1:               
                neg.append((n1, neg_node))
            if  neg_node != n1 and neg_node != n2:
                neg.append((neg_node, n2))
                break
    pos = torch.LongTensor(pos)
    neg = torch.LongTensor(neg)
    return pos, neg

get_batch()

(tensor([[217288,  37280],
         [217288,  37280],
         [242045, 121249],
         [242045, 121249],
         [228722,  33847],
         [228722,  33847],
         [224542,  90405],
         [224542,  90405],
         [226610, 247720],
         [226610, 247720],
         [ 15681,  95707],
         [ 15681,  95707],
         [183027,  96443],
         [183027,  96443],
         [ 26529, 255920],
         [ 26529, 255920],
         [260067, 220486],
         [260067, 220486],
         [157462,  71344],
         [157462,  71344],
         [132161, 162977],
         [132161, 162977],
         [190605, 105150],
         [190605, 105150],
         [181549, 132810],
         [181549, 132810],
         [ 53871, 261728],
         [ 53871, 261728],
         [152919, 159748],
         [152919, 159748],
         [264377, 218331],
         [264377, 218331],
         [  9235, 140702],
         [  9235, 140702],
         [260730, 137382],
         [260730, 137382],
         [ 73562, 178371],
 

In [28]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # 定义embeding层，embed的数量是词汇量，转化成维度2
        self.embed = torch.nn.Embedding(num_embeddings=len(node),embedding_dim=150)

        #初始化参数
        self.embed.weight.data.uniform_(-0.01,0.01)

    def forward(self,pos,neg):
        #编码   
        ##[8,6] -> [8,6, 2]
        x = self.embed(pos)
        #[8, 2]-> [8, 2 , 1]
        pos_n1 = x[:,0].unsqueeze(dim = 1)
        #[8,2]-> [8, 2 , 1]
        pos_n2 = x[:,1].unsqueeze(dim = 2)
        
        loss_pos = torch.bmm(pos_n1, pos_n2).squeeze()

       
        ##[8,12] -> [8,12,2]
        y = self.embed(neg)
        #[8, 1, 2]
        neg_n1 = y[:, 0].unsqueeze(dim=1)
        #[8, 2, 1]
        neg_n2 = y[:, 1].unsqueeze(dim=2)
        
        #[8, 1, 2],[8, 2, 1] -> [8]
        loss_neg = torch.bmm(neg_n1, -neg_n2).squeeze()

        loss_pos = loss_pos.sigmoid().log().sum()
        #[8, 12] -> [8]
        loss_neg = loss_neg.sigmoid().log().sum()
        #组合两部分的loss
        #[8] + [8] -> [1]
        loss = (loss_pos + loss_neg).mean()

        return -loss
        

In [29]:
model = Model()
model(*get_batch())


tensor(354.8972, grad_fn=<NegBackward0>)

In [30]:
def test(test_words):
    embed = model.embed.weight.data.clone()
    
    
    for word in test_words:
        x = embed[node[word]]
        score = torch.nn.functional.cosine_similarity(x, embed)
        topk = score.topk(k=5).indices
        topk = [list(node.keys())[k] for k in topk]
        print(word, topk)


test(['girl', 'bus', 'green', 'doctor', 'dog', 'queen', 'italy'])

girl ['girl', 'deviance', 'unfit', 'patriarchic', 'potbound']
bus ['bus', 'basil.n.02', 'great_white_heron.n.02', 'redound', 'genus_psittacosaurus.n.01']
green ['green', 'then.r.02', 'hypocycloid', 'petechia', 'interaction']
doctor ['doctor', "dove's_foot_geranium.n.01", 'quarter.v.01', 'scraping', 'spoken']
dog ['dog', 'impermissibly.r.01', 'hyman_george_rickover', 'family_dryopteridaceae', 'heart.n.06']
queen ['queen', 'amphisbaena.n.01', 'common_water_snake', 'heteropterous_insect.n.01', 'iridescence']
italy ['italy', 'biology_laboratory', 'pol', 'boa_constrictor.n.01', 'gabriele_fallopius']


In [34]:
from tqdm   import tqdm
def train():
    global model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_sum = 0
    for epoch in tqdm(range(200001)):
        batch = get_batch(batch_size=256)
        batch = [i.to(device) for i in batch]

        loss = model(*batch)
        optimizer.zero_grad()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        loss_sum += loss.item()

        if epoch % 50000 == 0:
            print(epoch, loss_sum/50000)
            test(['girl', 'bus', 'green', 'doctor', 'dog', 'queen', 'italy'])
            loss_sum = 0
            
        if epoch % 100000 == 0:
            torch.save(model.cpu(), 'models/wordnet_%d.model' % epoch)
            model = model.to(device)
    model = model.cpu()


train()

  0%|          | 0/200001 [00:00<?, ?it/s]

0 0.00773256591796875
girl ['girl', 'fille', 'young_lady', 'young_woman', 'missy']
bus ['bus', 'double-decker', 'autobus', 'jalopy', 'motorbus']
green ['green', 'greens', 'william_green', 'viridity', 'greenishness']
doctor ['doctor', 'dr.', 'doc', 'medico', 'physician']
dog ['dog', 'domestic_dog', 'canis_familiaris', 'frump', 'pug']
queen ['queen', 'queens', 'queen_regnant', 'queen_mole_rat', 'world-beater']
italy ['italy', 'italian_republic', 'italia', 'italian_peninsula', 'basilicata']


 25%|██▌       | 50012/200001 [04:32<17:08, 145.89it/s]

50000 329.3192362426758
girl ['girl', 'fille', 'miss', 'young_woman', 'missy']
bus ['bus', 'motorbus', 'buss', 'omnibus', 'charabanc']
green ['green', 'greens', 'viridity', 'greenness', 'chrome_green']
doctor ['doctor', 'dr.', 'physician', 'doc', 'medico']
dog ['dog', 'domestic_dog', 'canis_familiaris', 'great_pyrenees', 'coach_dog']
queen ['queen', 'queens', 'queen_consort', 'female_monarch', 'naked_mole_rat']
italy ['italy', 'italian_republic', 'italia', 'basilicata', 'italian_region']


 50%|████▉     | 99990/200001 [09:04<08:58, 185.67it/s]

100000 293.0110279507446
girl ['girl', 'miss', 'young_lady', 'missy', 'young_woman']
bus ['bus', 'buss', 'motorcoach', 'autobus', 'motorbus']
green ['green', 'greens', 'greenness', 'commons', 'cook']
doctor ['doctor', 'physician', 'dr.', 'medico', 'doc']
dog ['dog', 'domestic_dog', 'canis_familiaris', 'go_after', 'pursue']
queen ['queen', 'queens', 'queen_regnant', 'female_aristocrat', 'royal_line']
italy ['italy', 'italian_republic', 'italia', 'lucania', 'matterhorn']


 75%|███████▌  | 150004/200001 [13:36<05:35, 149.24it/s]

150000 280.0266804754639
girl ['girl', 'young_lady', 'young_woman', 'missy', 'chachka']
bus ['bus', 'buss', 'omnibus', 'motorbus', 'motorcoach']
green ['green', 'greens', 'viridity', 'fleeceable', 'greenness']
doctor ['doctor', 'dr.', 'physician', 'doc', 'md']
dog ['dog', 'domestic_dog', 'chase', 'go_after', 'canis_familiaris']
queen ['queen', 'queens', 'pansy', 'poove', 'female_monarch']
italy ['italy', 'italia', 'italian_republic', 'italian_region', 'tivoli']


100%|█████████▉| 199989/200001 [18:08<00:00, 182.78it/s]

200000 273.6528269128418
girl ['girl', 'young_woman', 'miss', 'young_lady', 'fille']
bus ['bus', 'buss', 'motorbus', 'heaps', 'heap']
green ['green', 'greens', 'wild_spinach', 'viridity', 'commons']
doctor ['doctor', 'dr.', 'md', 'physician', 'doc']
dog ['dog', 'go_after', 'domestic_dog', 'frank', 'canis_familiaris']
queen ['queen', 'queens', 'queer', 'pouf', 'fagot']
italy ['italy', 'italia', 'italian_republic', 'tivoli', 'brindisi']


100%|██████████| 200001/200001 [18:08<00:00, 183.69it/s]


In [35]:
model = torch.load('models/wordnet_200000.model')

test(['girl', 'bus', 'green', 'doctor', 'dog', 'queen', 'italy'])

girl ['girl', 'young_woman', 'miss', 'young_lady', 'fille']
bus ['bus', 'buss', 'motorbus', 'heaps', 'heap']
green ['green', 'greens', 'wild_spinach', 'viridity', 'commons']
doctor ['doctor', 'dr.', 'md', 'physician', 'doc']
dog ['dog', 'go_after', 'domestic_dog', 'frank', 'canis_familiaris']
queen ['queen', 'queens', 'queer', 'pouf', 'fagot']
italy ['italy', 'italia', 'italian_republic', 'tivoli', 'brindisi']
