In [1]:
import numpy as np
import pandas as pd
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
model = gensim.models.KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)

In [3]:
data = pd.read_csv('656_dataset_fin.csv')

In [3]:
MFT_data=pd.read_csv('MFT_dataset.csv')
LFT_data=pd.read_csv('LFT_dataset.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,sentence_ID,entity_name,fine_grain,sentence,coarse_grain
0,0,15,European Union,"['/people/ethnicity', '/location/country', '/l...",112 Emergency phone number across the European...,/people
1,1,15,European Union,"['/people/ethnicity', '/location/country', '/l...",112 Emergency phone number across the European...,/location
2,2,15,European Union,"['/people/ethnicity', '/location/country', '/l...",112 Emergency phone number across the European...,/person
3,3,44,Stockton and Darlington Railway,"['/rail/railway', '/organization', '/organizat...",The first 0-4-0 to use coupling rods was Locom...,/rail
4,4,44,Stockton and Darlington Railway,"['/rail/railway', '/organization', '/organizat...",The first 0-4-0 to use coupling rods was Locom...,/organization


In [4]:
def preprocessing(dataset):
    
    X, y = np.asarray(dataset['sentence']), np.asarray(dataset['coarse_grain'])
    label_map = {cat:index for index,cat in enumerate(np.unique(y))}
    y_prep = np.asarray([label_map[l] for l in y])
    x_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in X]
   
    return x_tokenized, y_prep

In [5]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(100,))
        
        return np.asarray(vec).flatten()

# 1. For whole dataset

In [11]:
x_tokenized, y_prep = preprocessing(data)

In [None]:
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1200,
              seq_len = 15,
              embedding_matrix = model.wv
             )

  after removing the cwd from sys.path.


In [None]:
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])

In [None]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=50)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

In [None]:
x_comps = pca_model.transform(x_vecs)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x_comps,y_prep,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)

In [None]:
svm_classifier.score(x_test,y_test)

# 2. For MFT

In [None]:
x_tokenized, y_prep = preprocessing(MFT_data)

In [None]:
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1200,
              seq_len = 15,
              embedding_matrix = model.wv
             )

In [None]:
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])

In [None]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=50)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

In [None]:
x_comps = pca_model.transform(x_vecs)

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x_comps,y_prep,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)

In [None]:
svm_classifier.score(x_test,y_test)

# 3. For LFT

word2vec embeddings for the sentences

In [63]:
x_tokenized, y_prep = preprocessing(LFT_data)

In [64]:
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 120,
              seq_len = 15,
              embedding_matrix = model.wv
             )

  after removing the cwd from sys.path.


In [65]:
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])

In [66]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=50)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.3631605459465171


In [67]:
x_comps = pca_model.transform(x_vecs)

word2vec embeddings for the entities

In [68]:
Entities = np.asarray(LFT_data['entity_name'])
e_tokenized = [[w for w in e.split(" ") if w != ""] for e in Entities]

In [69]:
entity_sequencer = Sequencer(all_words = [token for seq in e_tokenized for token in seq],
              max_words = 10,
              seq_len = 5,
              embedding_matrix = model.wv
             )

  after removing the cwd from sys.path.


In [70]:
e_vecs = np.asarray([entity_sequencer.textToVector(" ".join(seq)) for seq in e_tokenized])

In [81]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=50)
pca_model.fit(e_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9187674098513777


In [82]:
e_comps = pca_model.transform(e_vecs)

concatenate sentence embeddings + entity embeddings

In [73]:
x_total = np.hstack([e_comps,x_comps])

split the dataset

In [83]:
x_train,x_test,y_train,y_test = train_test_split(e_comps,y_prep,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(82147, 100)
(20537, 100)
(82147,)
(20537,)


SVM model

In [84]:
svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)

SVC()

In [85]:
svm_classifier.score(x_test,y_test)

0.1741247504504066