-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
有无生成word_embedding.txt 文件的代码 #5
Comments
供参考
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import numpy as np
import os
class GlobeTransformer:
    """Build a per-corpus embedding file from a pre-trained vector file.

    The vector file holds one ``token v1 v2 ... vN`` entry per line
    (space separated).  The vocabulary file holds one ``word id`` entry
    per line.  ``save_corpus_emb`` writes one embedding row per id, so
    row ``i`` of the output belongs to the word whose id is ``i``.

    Attributes:
        emb_size: Number of components expected in every vector.
        globe_array: Maps token -> list of vector components (strings).
        word2id: Maps corpus word -> integer id.
    """

    def __init__(self, globe_filename, corpus_filename, emb_size=300):
        """Load the pre-trained vectors and the corpus vocabulary.

        Args:
            globe_filename: Path to the pre-trained vector text file.
            corpus_filename: Path to the ``word id`` vocabulary file.
            emb_size: Vector width; must match the vector file.
        """
        self.emb_size = emb_size
        self.globe_array = {}
        self.generate_globe_array(globe_filename)
        self.word2id = {}
        self.generate_corpus_word2vec(corpus_filename)

    @staticmethod
    def _iter_lines(filename):
        """Yield the lines of a UTF-8 text file without their newline."""
        with open(filename, 'r', encoding='utf-8') as reader:
            for line in reader:
                yield line.strip('\n')

    @staticmethod
    def read_file(filename):
        """Return all lines of a UTF-8 text file as a list, newlines removed."""
        return list(GlobeTransformer._iter_lines(filename))

    def generate_globe_array(self, globe_filename):
        """Fill ``self.globe_array`` from the pre-trained vector file.

        Lines are streamed rather than read into a list, because the
        vector file can be far larger than the corpus vocabulary.
        """
        emb_size = self.emb_size
        for word_info in GlobeTransformer._iter_lines(globe_filename):
            # Split from the right: the last ``emb_size`` fields are the
            # vector, and whatever is left is the token.  This keeps tokens
            # that themselves contain spaces intact.
            array_info = word_info.rstrip().rsplit(' ', emb_size)
            if len(array_info) != emb_size + 1:
                # Blank line, "count dim" header, or truncated entry: it
                # cannot yield a full-width vector, so it is ignored.
                continue
            self.globe_array[array_info[0]] = array_info[1:]

    def generate_corpus_word2vec(self, corpus_filename):
        """Fill ``self.word2id`` from the ``word id`` vocabulary file.

        The id is the last whitespace-separated field; everything before
        it (re-joined with single spaces) is the word.

        Raises:
            ValueError: If the last field of a non-blank line is not an
                integer.
        """
        for word_info in GlobeTransformer.read_file(corpus_filename):
            fields = word_info.split()
            if not fields:
                continue  # tolerate blank lines
            *word_parts, raw_id = fields
            self.word2id[' '.join(word_parts)] = int(raw_id)

    def save_corpus_emb(self, emb_filename):
        """Write one embedding row per word id to ``emb_filename``.

        ``<pad>`` gets an all-zero row, words found in ``globe_array`` get
        their pre-trained vector, and every other word gets a vector drawn
        from ``np.random.random_sample``.  Ids with no word assigned keep
        an all-zero row so the output stays rectangular.
        """
        emb_size = self.emb_size
        zero_row = ' '.join(['0'] * emb_size)
        # Size by the largest id so sparse ids cannot index out of range,
        # but never emit fewer rows than there are vocabulary entries.
        row_count = max(
            max(self.word2id.values(), default=-1) + 1,
            len(self.word2id),
        )
        emb_array = [zero_row] * row_count
        for word, word_id in self.word2id.items():
            if word == '<pad>':
                continue  # its row, at its own id, is already all zeros
            vec = self.globe_array.get(word)
            if vec is None:
                vec = map(str, np.random.random_sample(emb_size).tolist())
            emb_array[word_id] = ' '.join(vec)
        print(len(emb_array))
        GlobeTransformer.write_emb_file(emb_array, emb_filename)

    @staticmethod
    def write_emb_file(emb_array, filename):
        """Write each string of ``emb_array`` as one line of ``filename``."""
        with open(filename, 'w', encoding='utf-8') as writer:
            writer.writelines(line + '\n' for line in emb_array)
if __name__ == '__main__':
    # Pre-trained vector file, looked up relative to the working directory.
    globe = 'glove.840B.300d.txt'
    # Dataset directory holding the vocabulary and receiving the output.
    source = 'ACE'
    corpus_word2id = os.path.join(source, 'word_class_id.txt')
    save_filename = os.path.join(source, 'word_embedding.txt')
    GlobeTransformer(globe, corpus_word2id).save_corpus_emb(save_filename)
这个跟论文里面使用模型训练获取 embedding 的方式不太一样吧 |
代码里面也没包含其他几个文件的生成方式 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
如题，这块是否可以支持新的数据集？
The text was updated successfully, but these errors were encountered: