Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

有无生成word_embedding.txt 文件的代码 #5

Closed
xxllp opened this issue Jul 27, 2022 · 3 comments
Closed

有无生成word_embedding.txt 文件的代码 #5

xxllp opened this issue Jul 27, 2022 · 3 comments

Comments

@xxllp
Copy link

xxllp commented Jul 27, 2022

如题 这块是否可用支持新的数据集

@Buted
Copy link
Owner

Buted commented Jul 28, 2022

供参考

#!/usr/bin/env/ python
# -*- coding:utf-8 -*-

import numpy as np
import os


class GlobeTransformer:
    """Build a corpus-aligned word-embedding file from pretrained GloVe vectors.

    Loads a GloVe text file and a corpus ``word id`` mapping, then writes one
    embedding line per corpus word (row order = word id) to an output file.
    Words missing from GloVe get a random vector; ``<pad>`` gets all zeros.
    """

    # Dimensionality of the pretrained vectors (e.g. glove.840B.300d).
    EMB_SIZE = 300

    def __init__(self, globe_filename, corpus_filename):
        # GloVe word -> vector components kept as strings (written verbatim).
        self.globe_array = {}
        self.generate_globe_array(globe_filename)
        # Corpus word -> integer row id in the output embedding file.
        self.word2id = {}
        self.generate_corpus_word2vec(corpus_filename)

    @staticmethod
    def read_file(filename):
        """Return all lines of *filename* with surrounding newlines stripped."""
        with open(filename, 'r', encoding='utf-8') as reader:
            return [line.strip('\n') for line in reader.readlines()]

    def generate_globe_array(self, globe_filename):
        """Parse the GloVe file into ``self.globe_array``.

        NOTE(review): the first line is skipped, assuming a word2vec-style
        header; plain GloVe files have no header -- confirm for your file.
        Tokens that themselves contain spaces (present in glove.840B) are
        mis-split by this parser; the original code only emitted a debug
        print for them, which has been removed.
        """
        for word_info in GlobeTransformer.read_file(globe_filename)[1:]:
            array_info = word_info.split(' ')
            self.globe_array[array_info[0]] = array_info[1:]

    def generate_corpus_word2vec(self, corpus_filename):
        """Parse ``word id`` lines into ``self.word2id``.

        The id is the last whitespace-separated token; everything before it
        is the word, so corpus words may themselves contain spaces.
        """
        for word_info in GlobeTransformer.read_file(corpus_filename):
            tokens = word_info.split()
            self.word2id[' '.join(tokens[:-1])] = int(tokens[-1])

    def save_corpus_emb(self, emb_filename):
        """Write one embedding line per corpus word, ordered by word id.

        Known words copy their GloVe vector verbatim; ``<pad>`` is all
        zeros; any other out-of-vocabulary word gets a uniform random
        vector in [0, 1).
        """
        emb_size = self.EMB_SIZE

        def rand_emb():
            # Random fallback for out-of-vocabulary words.
            return np.random.random_sample(emb_size).tolist()

        emb_array = [''] * len(self.word2id)
        for word, word_id in self.word2id.items():
            if word == '<pad>':
                # Bug fix: write the zero vector at the pad token's own id
                # (the original hard-coded index 0, leaving a blank line if
                # <pad> was not id 0).
                emb_array[word_id] = ' '.join(['0'] * emb_size)
            elif word in self.globe_array:
                emb_array[word_id] = ' '.join(self.globe_array[word])
            else:
                emb_array[word_id] = ' '.join(map(str, rand_emb()))
        print(len(emb_array))
        GlobeTransformer.write_emb_file(emb_array, emb_filename)

    @staticmethod
    def write_emb_file(emb_array, filename):
        """Write each embedding string as one line of *filename*."""
        with open(filename, 'w', encoding='utf-8') as writer:
            for line in emb_array:
                writer.write(line + '\n')


if __name__ == '__main__':
    # Pretrained GloVe vectors and the ACE corpus word-id mapping.
    glove_path = 'glove.840B.300d.txt'
    corpus_dir = 'ACE'
    word2id_path = os.path.join(corpus_dir, 'word_class_id.txt')
    output_path = os.path.join(corpus_dir, 'word_embedding.txt')
    transformer = GlobeTransformer(glove_path, word2id_path)
    transformer.save_corpus_emb(output_path)

@Buted Buted closed this as completed Aug 1, 2022
@xxllp
Copy link
Author

xxllp commented Aug 8, 2022

这个跟论文里面里面 使用模型训练获取enbedding 的不太一样吧

@xxllp
Copy link
Author

xxllp commented Aug 9, 2022

代码里面也没包含其他几个文件的生成方式

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants