In [1]:
# !pip install transformers emoji nltk

# import packages
import pickle
import os
import json
import numpy as np
import pandas as pd
import re
import time
from multiprocessing import Pool

In [2]:
# Location of the pre-trained GloVe text file (one "<token> <v1> <v2> ..." row per line).
GLOVE_DIR = '../../glove'
Glove_path = os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')
print(Glove_path)

# Token -> float32 vector lookup table used by the embedding cells below.
embeddings_index = dict()
# `with` releases the file handle even when parsing raises part-way through.
with open(Glove_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no token to index.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A non-numeric field means the row is malformed. Report it and
            # skip it, so the word is never stored with the vector left over
            # from the previous row.
            print("Warnning"+str(values)+" in" + str(line))
            continue
        embeddings_index[word] = coefs
print('Total %s word vectors.' % len(embeddings_index))

../../glove/glove.6B.300d.txt
Total 400000 word vectors.


In [8]:
# Dataset identifier, its root folder, and the metadata CSV inside that folder.
dataset_name = 'wos46985'
base_dir = '../../data/WOS/'
data_file = '{}Meta-data/Data.csv'.format(base_dir)

In [9]:
# Read the metadata table and report how many rows it contains.
df = pd.read_csv(data_file)
num_rows = len(df)
print('Num Data:', num_rows)

Num Data: 46985


In [10]:
# Preview the first two rows to check the column layout.
df.head(n=2)

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,"""(2 + 1)-dimensional non-linear optical waves ..."
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,"""(beta-amyloid (A beta) and tau pathology beco..."


In [34]:
def clean_str(string):
    """Normalise raw text into lower-cased, space-separated tokens.

    Steps, applied in order:
      1. every character outside ``A-Za-z0-9(),!?'`` and the backtick becomes
         a space;
      2. the clitics ``'s 've n't 're 'd 'll`` are split off the preceding word;
      3. ``, ! ( ) ?`` are padded with spaces so each becomes its own token;
      4. runs of whitespace collapse to one space, and the result is stripped
         and lower-cased.

    The padded punctuation is emitted without a backslash. The earlier
    replacement strings (" \\( ", " \\) ", " \\? ") leaked a literal backslash
    into the output, producing tokens that can never match a plain "(", ")"
    or "?" vocabulary entry.

    Args:
        string: the text to clean (a ``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    # Drop everything that is not alphanumeric or whitelisted punctuation.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # (pattern, replacement) pairs; order matters and mirrors the original rules.
    rules = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        # Collapse the padding introduced above into single spaces.
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()

def get_glove_embeddings(texts, method = 'pool', embeddings = None):
    """Turn each text into one vector built from its word embeddings.

    Every text is split on whitespace and each token is looked up in the
    embedding table; tokens that are not in the table are ignored.

    Args:
        texts: sized sequence of strings (``len(texts)`` is used for the
            progress message).
        method: ``'pool'`` averages the vectors of all known tokens;
            ``'cls'`` takes the vector of the first known token.
        embeddings: optional mapping ``token -> 1-D numpy vector``. When
            omitted, the notebook-level ``embeddings_index`` is used, which
            matches the previous behaviour.

    Returns:
        A 2-D numpy array with one row per text. A text without any known
        token gets an all-zero row. An empty ``texts`` yields an array with
        zero rows.

    Raises:
        ValueError: if ``method`` is not ``'pool'`` or ``'cls'``, or if the
            vector size is needed but the embedding table is empty.
    """
    # Fail fast: an unknown method used to surface as a NameError, or silently
    # reuse the vector computed for the previous text.
    if method not in ('pool', 'cls'):
        raise ValueError("method must be 'pool' or 'cls', got {!r}".format(method))

    if embeddings is None:
        embeddings = embeddings_index

    def _vector_shape():
        # Shape of an arbitrary stored vector; avoids depending on one
        # specific word being present in the table.
        for vector in embeddings.values():
            return vector.shape
        raise ValueError('embedding table is empty; cannot infer the vector size')

    output_reps_pool = []

    for step, text in enumerate(texts):
        # progress update after every 1000 texts.
        if step % 1000 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(texts)))

        # Explicit membership test instead of a bare `except: pass`, so only
        # out-of-vocabulary tokens are skipped and real errors still surface.
        base_output = [embeddings[word] for word in text.split() if word in embeddings]

        if base_output:
            if method == 'pool':
                # Mean Pool
                output_representation = np.mean(base_output, axis=0)
            else:
                # 'cls': first in-vocabulary token stands in for the text.
                output_representation = base_output[0]
        else:
            output_representation = np.zeros(_vector_shape())
        output_reps_pool.append(output_representation)

    if output_reps_pool:
        output_reps_df = np.vstack(output_reps_pool)
    else:
        # np.vstack rejects an empty list; return an explicit 0-row matrix.
        output_reps_df = np.zeros((0,) + tuple(_vector_shape()))
    print('Embeddings Shape:',output_reps_df.shape)

    return output_reps_df

In [35]:
# Aggregation strategy handed to get_glove_embeddings, and the folder that receives the result.
method = 'pool'
text_embedding_dir = '../data/{}'.format(dataset_name)

In [36]:
# Clean every abstract, then embed it with the configured strategy.
texts = df.Abstract.apply(clean_str).tolist()

text_embeddings = get_glove_embeddings(texts, method)
text_embedding_file = text_embedding_dir+'/glove300-embedding-'+method+'.pkl'

# Only persist when there is exactly one embedding row per input text.
if text_embeddings.shape[0] == len(texts):
    print('Saving Embeddings...')
    # Create the target folder on a fresh checkout instead of failing in open().
    os.makedirs(text_embedding_dir, exist_ok=True)
    # `with` flushes and closes the file; the bare open() used before left
    # the handle to be closed whenever the garbage collector got to it.
    with open(text_embedding_file, "wb") as embedding_out:
        pickle.dump(text_embeddings, embedding_out)
else:
    # Make the skipped save visible rather than silent.
    print('Embeddings NOT saved: got', text_embeddings.shape[0], 'rows for', len(texts), 'texts')

  Batch 1,000  of  46,985.
  Batch 2,000  of  46,985.
  Batch 3,000  of  46,985.
  Batch 4,000  of  46,985.
  Batch 5,000  of  46,985.
  Batch 6,000  of  46,985.
  Batch 7,000  of  46,985.
  Batch 8,000  of  46,985.
  Batch 9,000  of  46,985.
  Batch 10,000  of  46,985.
  Batch 11,000  of  46,985.
  Batch 12,000  of  46,985.
  Batch 13,000  of  46,985.
  Batch 14,000  of  46,985.
  Batch 15,000  of  46,985.
  Batch 16,000  of  46,985.
  Batch 17,000  of  46,985.
  Batch 18,000  of  46,985.
  Batch 19,000  of  46,985.
  Batch 20,000  of  46,985.
  Batch 21,000  of  46,985.
  Batch 22,000  of  46,985.
  Batch 23,000  of  46,985.
  Batch 24,000  of  46,985.
  Batch 25,000  of  46,985.
  Batch 26,000  of  46,985.
  Batch 27,000  of  46,985.
  Batch 28,000  of  46,985.
  Batch 29,000  of  46,985.
  Batch 30,000  of  46,985.
  Batch 31,000  of  46,985.
  Batch 32,000  of  46,985.
  Batch 33,000  of  46,985.
  Batch 34,000  of  46,985.
  Batch 35,000  of  46,985.
  Batch 36,000  of  46,985.
 