In [19]:
import pandas as pd
import numpy as np

# 1.1. Word Embedding

In [2]:
import gensim.downloader

In [3]:
# Show all available models in gensim-data
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [5]:
# Download the "glove-twitter-25" embeddings
glove_vectors = gensim.downloader.load('word2vec-google-news-300')

In [6]:
# retrieve the vector for 'computer'
glove_vectors['computer'] 

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

## Question 1.1
use cosine similarity to find the most similar 
word to each of these worsds

In [7]:
# Use the downloaded vectors as usual:
glove_vectors.most_similar('student')

[('students', 0.7294867038726807),
 ('Student', 0.6706662774085999),
 ('teacher', 0.6301366090774536),
 ('stu_dent', 0.6240993142127991),
 ('faculty', 0.6087332963943481),
 ('school', 0.6055627465248108),
 ('undergraduate', 0.6020305752754211),
 ('university', 0.600540041923523),
 ('undergraduates', 0.5755698680877686),
 ('semester', 0.573759913444519)]

In [8]:
# Use the downloaded vectors as usual:
glove_vectors.most_similar('Apple')

[('Apple_AAPL', 0.7456986308097839),
 ('Apple_Nasdaq_AAPL', 0.7300410270690918),
 ('Apple_NASDAQ_AAPL', 0.717508852481842),
 ('Apple_Computer', 0.7145972847938538),
 ('iPhone', 0.6924266219139099),
 ('Apple_NSDQ_AAPL', 0.6868603229522705),
 ('Steve_Jobs', 0.6758421659469604),
 ('iPad', 0.6580768823623657),
 ('Apple_nasdaq_AAPL', 0.6444970369338989),
 ('AAPL_PriceWatch_Alert', 0.6439753174781799)]

In [9]:
# Use the downloaded vectors as usual:
glove_vectors.most_similar('apple')

[('apples', 0.720359742641449),
 ('pear', 0.6450697183609009),
 ('fruit', 0.6410146355628967),
 ('berry', 0.6302295327186584),
 ('pears', 0.613396167755127),
 ('strawberry', 0.6058260798454285),
 ('peach', 0.6025872826576233),
 ('potato', 0.5960935354232788),
 ('grape', 0.5935863852500916),
 ('blueberry', 0.5866668224334717)]

# Data

In [11]:
CoNLL2003_dir = '/mnt/lustre/yuxin/SC4002_G06/datasets/CoNLL2003'
train_dir = f'{CoNLL2003_dir}/eng.train'
dev_dir =  f'{CoNLL2003_dir}/eng.testa'
test_dir =  f'{CoNLL2003_dir}/eng.testb'

In [12]:
def import_content(path):
    try:
        with open(path, 'r') as file:
            content = file.readlines()
        file.close()
    except Exception as e:
        content = None
        print(e)
    
    return content

def print_items(item):
    for s in item: print(s)

In [13]:
train_content = import_content(train_dir)
dev_content = import_content(dev_dir)
test_content = import_content(test_dir)

## Split data by senetence

In [14]:
def split_sentences(content):
    split_data = [c.split(' ') for c in content] if content != None else []
    sentences = []
    sentence = []
    words = []

    for line in split_data:
        # if end of a sentence
        if line == ['\n']:
            sentences.append(sentence)
            sentence = []
        else:
            s_text  = line[0]
            s_tag = line[-1].replace('\n','')

            sentence.append([s_text, s_tag]) 
            words.append([s_text, s_tag])
    
    sentences.append(sentence) # last item in content not new line so must add previous sentence manually after loop           

    return sentences, words

In [15]:
def split_text_tag(sentences):
    text = []
    tag = []
    combined = []
    sentence_count = 1

    for s in sentences:
        for w in s:
            w_text  = w[0]
            w_tag = w[-1].replace('\n','')

            text.append(w_text)
            tag.append(w_tag)        
            combined.append({
                'sentence': sentence_count,
                'text' : w_text,
                'tag' : w_tag
            })   
        sentence_count+=1       
    return text, tag, combined

In [16]:
train_sentences, train_words = split_sentences(train_content)
dev_sentences, dev_words = split_sentences(dev_content)
test_sentences, test_words = split_sentences(test_content)

train_text, train_tag, train_combined = split_text_tag(train_sentences)
dev_text, dev_tag, dev_combined = split_text_tag(dev_sentences)
test_text, test_tag, test_combined = split_text_tag(test_sentences)

In [20]:
train_voc = np.unique(np.array(train_text))
dev_voc = np.unique(np.array(dev_text))


tag_set = np.unique(np.array(train_tag))

In [21]:
print("Number of sentences (training):", len(train_sentences))
print("Number of sentences (dev):", len(dev_sentences))
print("Number of sentences (test):", len(test_sentences))

Number of sentences (training): 14987
Number of sentences (dev): 3466
Number of sentences (test): 3684


In [22]:
print("Tag set (BIO):", tag_set)

Tag set (BIO): ['B-LOC' 'B-MISC' 'B-ORG' 'I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']


In [23]:
def get_multiple_ne_sentence(sentences):
    for sentence in sentences:
        ne_count = 0
        for word_info in sentence:
            if "B-" in word_info[-1]:
                ne_count+=1
        if ne_count == 2:
            return sentence
    return None        

In [24]:
sentence = get_multiple_ne_sentence(train_sentences)
sentence

[['Swiss', 'I-MISC'],
 ['Grand', 'B-MISC'],
 ['Prix', 'I-MISC'],
 ['World', 'B-MISC'],
 ['Cup', 'I-MISC'],
 ['cycling', 'O'],
 ['race', 'O'],
 ['on', 'O'],
 ['Sunday', 'O'],
 [':', 'O']]

In [25]:
def get_named_entities(sentence):
    inside_tags = ['I-ORG', 'I-LOC', 'I-PER', 'I-MISC'] # Tags that require multiple words to form an entity
    begin_tags = ['B-LOC', 'B-ORG', 'B-MISC'] # Tags that are single word entities
    outside_tags = ['O']
    entities = [] # all entities gotten from search
    entity = [] # word group of current entity if any group tags encountered
    
    for c in sentence:
        if (c['tag'] in begin_tags or c['tag'] in outside_tags or c['tag'] == '\n') and len(entity) != 0:
            entities.append(' '.join(entity))
            entity = []
        if c['tag'] in begin_tags or c['tag'] in inside_tags: 
            entity.append(c['text'])

    return entities

In [26]:
_,_,sentence_text_tag = split_text_tag([sentence])
print("Complete named entities in the sentence:", get_named_entities(sentence_text_tag))

Complete named entities in the sentence: ['Swiss', 'Grand Prix', 'World Cup']


In [31]:
train_df = pd.DataFrame(train_combined)
dev_df = pd.DataFrame(dev_combined)
test_df = pd.DataFrame(test_combined)

# path = '../Datasets/Processed/'
# file_name = 'CoNLL2003_processed'
# # Export DataFrame to a CSV file
# df.to_csv(f'{path}{file_name}.csv', index=False)

# Model

In [81]:
type(glove_vectors)

gensim.models.keyedvectors.KeyedVectors

In [83]:
print(dir(glove_vectors))

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_log_evaluate_word_analogies', '_save_specials', '_smart_save', '_upconvert_old_d2vkv', '_upconvert_old_vocab', 'add_lifecycle_event', 'add_vector', 'add_vectors', 'allocate_vecattrs', 'closer_than', 'cosine_similarities', 'distance', 'distances', 'doesnt_match', 'evaluate_word_analogies', 'evaluate_word_pairs', 'expandos', 'fill_norms', 'get_index', 'get_mean_vector', 'get_normed_vectors', 'get_vecattr', 'get_vector', 'has_index_for', 'index2entity', 'index2word', 'index_to_key', 'init_sims', 'intersect_word2vec_format', 'key_to_index', 'lifecycle_even

In [76]:
glove_vectors.key_to_index

{'</s>': 0,
 'in': 1,
 'for': 2,
 'that': 3,
 'is': 4,
 'on': 5,
 '##': 6,
 'The': 7,
 'with': 8,
 'said': 9,
 'was': 10,
 'the': 11,
 'at': 12,
 'not': 13,
 'as': 14,
 'it': 15,
 'be': 16,
 'from': 17,
 'by': 18,
 'are': 19,
 'I': 20,
 'have': 21,
 'he': 22,
 'will': 23,
 'has': 24,
 '####': 25,
 'his': 26,
 'an': 27,
 'this': 28,
 'or': 29,
 'their': 30,
 'who': 31,
 'they': 32,
 'but': 33,
 '$': 34,
 'had': 35,
 'year': 36,
 'were': 37,
 'we': 38,
 'more': 39,
 '###': 40,
 'up': 41,
 'been': 42,
 'you': 43,
 'its': 44,
 'one': 45,
 'about': 46,
 'would': 47,
 'which': 48,
 'out': 49,
 'can': 50,
 'It': 51,
 'all': 52,
 'also': 53,
 'two': 54,
 'after': 55,
 'first': 56,
 'He': 57,
 'do': 58,
 'time': 59,
 'than': 60,
 'when': 61,
 'We': 62,
 'over': 63,
 'last': 64,
 'new': 65,
 'other': 66,
 'her': 67,
 'people': 68,
 'into': 69,
 'In': 70,
 'our': 71,
 'there': 72,
 'A': 73,
 'she': 74,
 'could': 75,
 'just': 76,
 'years': 77,
 'some': 78,
 'U.S.': 79,
 'three': 80,
 'million': 81

In [66]:
#Load w2v models for train and dev

from gensim.models import Word2Vec
path = '../Pretrained_Models/'

train_w2v = Word2Vec.load('/mnt/lustre/yuxin/SC4002_G06/pretrained_models/CoNLL2003_pretrain.model')

train_pretrained_weights = train_w2v.wv.vectors
train_num_tokens, train_embedding_dim = train_pretrained_weights.shape

word2idx = train_w2v.wv.key_to_index
word2idx['<UNK>'] = word2idx[list(word2idx.keys())[-1]]+1
word2idx['<PAD>'] = word2idx[list(word2idx.keys())[-1]]+1
voc = train_voc
voc = np.append(voc,'<UNK>')
voc = np.append(voc,'<PAD>')

tag2idx = {k: v for v, k in enumerate(tag_set)}

In [67]:
w2v_word2idx = dict(glove_vectors.key_to_index)
w2v_voc = glove_vectors.index_to_key
w2v_word2idx['<UNK>'] = w2v_word2idx[list(w2v_word2idx.keys())[-1]]+1
w2v_word2idx['<PAD>'] = w2v_word2idx[list(w2v_word2idx.keys())[-1]]+1

In [68]:
embeddings_index = {}
for v in w2v_voc:
  embeddings_index[v] =  glove_vectors[v] 

In [70]:
num_tokens = len(voc) + 2
embedding_dim = 50
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word2idx.items():    
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector[0:50]
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 17724 words (5902 misses)


# MOdel

In [86]:
# from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.nn.utils.rnn import pad_sequence

ModuleNotFoundError: No module named 'torch'

In [84]:
embedding_dim

50

In [72]:
def get_x_sequence(sentences):
  sequence = []
  sent_seq = []
  for s in sentences:
    for w in s:
      if w[0] in word2idx.keys():
        sent_seq.append(word2idx[w[0]])
      else:
        sent_seq.append(word2idx['<UNK>'])
    sequence.append(sent_seq)
    sent_seq = []
  
  return sequence

In [85]:
x_dev = get_x_sequence(dev_sentences)
x_dev = pad_sequence(dev_sentences)

NameError: name 'pad_sequence' is not defined

In [42]:
x_train = get_x_sequence(train_sentences)
#x_train = pad_sequences(maxlen=embedding_dim, sequences=x_train, padding="post", value=len(word2idx)-1)
x_train = pad_sequences(maxlen=embedding_dim, sequences=x_train, padding="post")

y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
y_train = pad_sequences(maxlen=embedding_dim, sequences=y_train, padding="post", value=tag2idx['O'])
#y_train = pad_sequences(maxlen=embedding_dim, sequences=y_train, padding="post")

x_dev = get_x_sequence(dev_sentences)
#x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post", value=len(word2idx)-1)
x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post")

y_dev = [[tag2idx[w[1]] for w in s] for s in dev_sentences]
y_dev = pad_sequences(maxlen=embedding_dim, sequences=y_dev, padding="post", value=tag2idx['O'])
#y_dev = pad_sequences(maxlen=embedding_dim, sequences=y_dev, padding="post")

x_test = get_x_sequence(test_sentences)
#x_dev = pad_sequences(maxlen=embedding_dim, sequences=x_dev, padding="post", value=len(word2idx)-1)
x_test = pad_sequences(maxlen=embedding_dim, sequences=x_test, padding="post")

y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]
y_test = pad_sequences(maxlen=embedding_dim, sequences=y_test, padding="post", value=tag2idx['O'])
#y_test = pad_sequences(maxlen=embedding_dim, sequences=y_test, padding="post")

y_train = to_categorical(y_train)
y_dev = to_categorical(y_dev)
y_test = to_categorical(y_test)

NameError: name 'pad_sequences' is not defined