Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

有无生成word_embedding.txt 文件的代码 #5

Closed
xxllp opened this issue Jul 27, 2022 · 3 comments
Closed

有无生成word_embedding.txt 文件的代码 #5

xxllp opened this issue Jul 27, 2022 · 3 comments

Comments

@xxllp
Copy link

xxllp commented Jul 27, 2022

如题 这块是否可用支持新的数据集

@Buted
Copy link
Owner

Buted commented Jul 28, 2022

供参考

#!/usr/bin/env/ python
# -*- coding:utf-8 -*-

import numpy as np
import os


class GlobeTransformer:
    """Build a corpus-aligned word-embedding file from pretrained GloVe vectors.

    Loads a GloVe text file and a corpus ``word id`` mapping, then writes one
    embedding line per corpus word (row order = word id) to an output file.
    Words missing from GloVe get a random vector; ``<pad>`` gets all zeros.
    """

    # Dimensionality of the pretrained vectors (e.g. glove.840B.300d).
    EMB_SIZE = 300

    def __init__(self, globe_filename, corpus_filename):
        # GloVe word -> vector components kept as strings (written verbatim).
        self.globe_array = {}
        self.generate_globe_array(globe_filename)
        # Corpus word -> integer row id in the output embedding file.
        self.word2id = {}
        self.generate_corpus_word2vec(corpus_filename)

    @staticmethod
    def read_file(filename):
        """Return all lines of *filename* with surrounding newlines stripped."""
        with open(filename, 'r', encoding='utf-8') as reader:
            return [line.strip('\n') for line in reader.readlines()]

    def generate_globe_array(self, globe_filename):
        """Parse the GloVe file into ``self.globe_array``.

        NOTE(review): the first line is skipped, assuming a word2vec-style
        header; plain GloVe files have no header -- confirm for your file.
        Tokens that themselves contain spaces (present in glove.840B) are
        mis-split by this parser; the original code only emitted a debug
        print for them, which has been removed.
        """
        for word_info in GlobeTransformer.read_file(globe_filename)[1:]:
            array_info = word_info.split(' ')
            self.globe_array[array_info[0]] = array_info[1:]

    def generate_corpus_word2vec(self, corpus_filename):
        """Parse ``word id`` lines into ``self.word2id``.

        The id is the last whitespace-separated token; everything before it
        is the word, so corpus words may themselves contain spaces.
        """
        for word_info in GlobeTransformer.read_file(corpus_filename):
            tokens = word_info.split()
            self.word2id[' '.join(tokens[:-1])] = int(tokens[-1])

    def save_corpus_emb(self, emb_filename):
        """Write one embedding line per corpus word, ordered by word id.

        Known words copy their GloVe vector verbatim; ``<pad>`` is all
        zeros; any other out-of-vocabulary word gets a uniform random
        vector in [0, 1).
        """
        emb_size = self.EMB_SIZE

        def rand_emb():
            # Random fallback for out-of-vocabulary words.
            return np.random.random_sample(emb_size).tolist()

        emb_array = [''] * len(self.word2id)
        for word, word_id in self.word2id.items():
            if word == '<pad>':
                # Bug fix: write the zero vector at the pad token's own id
                # (the original hard-coded index 0, leaving a blank line if
                # <pad> was not id 0).
                emb_array[word_id] = ' '.join(['0'] * emb_size)
            elif word in self.globe_array:
                emb_array[word_id] = ' '.join(self.globe_array[word])
            else:
                emb_array[word_id] = ' '.join(map(str, rand_emb()))
        print(len(emb_array))
        GlobeTransformer.write_emb_file(emb_array, emb_filename)

    @staticmethod
    def write_emb_file(emb_array, filename):
        """Write each embedding string as one line of *filename*."""
        with open(filename, 'w', encoding='utf-8') as writer:
            for line in emb_array:
                writer.write(line + '\n')


if __name__ == '__main__':
    # Pretrained GloVe vectors and the ACE corpus word-id mapping.
    glove_path = 'glove.840B.300d.txt'
    corpus_dir = 'ACE'
    word2id_path = os.path.join(corpus_dir, 'word_class_id.txt')
    output_path = os.path.join(corpus_dir, 'word_embedding.txt')
    transformer = GlobeTransformer(glove_path, word2id_path)
    transformer.save_corpus_emb(output_path)

@Buted Buted closed this as completed Aug 1, 2022
@xxllp
Copy link
Author

xxllp commented Aug 8, 2022

这个跟论文里面里面 使用模型训练获取enbedding 的不太一样吧

@xxllp
Copy link
Author

xxllp commented Aug 9, 2022

代码里面也没包含其他几个文件的生成方式

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants