In [1]:
import numpy as np
import pandas as pd
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
from glove import Corpus, Glove

In [3]:
model = Glove.load('../../Dataset/glove.bin')

In [48]:
data = pd.read_csv('../../Dataset/global_database_figer.csv')

In [5]:
MFT_data=pd.read_csv('../../Dataset/global_MFT_dataset.csv')
LFT_data=pd.read_csv('../../Dataset/global_LFT_dataset.csv')

In [9]:
len(MFT_data)

2610156

In [6]:
LFT_data

Unnamed: 0.1,Unnamed: 0,sentence_ID,entity_name,fine_grain,sentence,global coarse grained
0,65,44,Locomotion No. 1,['/train'],The first 0-4-0 to use coupling rods was Locom...,train
1,67,44,Stockton and Darlington Railway,"['/rail/railway', '/organization', '/organizat...",The first 0-4-0 to use coupling rods was Locom...,rail
2,86,53,East Coast Main Trunk Railway,"['/rail/railway', '/location', '/rail']","One such locomotive , built by Peckett and Son...",rail
3,90,56,Midland Railway 2228 Class,['/train'],"Examples have included the LSWR O2 Class , Mid...",train
4,91,56,LSWR M7 Class,['/train'],"Examples have included the LSWR O2 Class , Mid...",train
5,92,56,Caledonian Railway 439 Class,['/train'],"Examples have included the LSWR O2 Class , Mid...",train
6,93,57,Midland Railway 2228 Class,['/train'],The last British design of 0-4-4T were the LMS...,train
7,108,64,Curt Gowdy Media Award,['/award'],His work has appeared in the Best American Spo...,award
8,119,71,4-6-2T,['/train'],This three-cylindered pattern had begun with H...,train
9,120,71,4-4-4T,['/train'],This three-cylindered pattern had begun with H...,train


In [14]:
def preprocessing(dataset):
    
    X, y = np.asarray(dataset['sentence']), np.asarray(dataset['global coarse grained'])
    label_map = {cat:index for index,cat in enumerate(np.unique(y))}
    y_prep = np.asarray([label_map[l] for l in y])
    x_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in X]
   
    return x_tokenized, y_prep

In [8]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 model
                ):
        
        self.seq_len = seq_len
        self.glove_model = model
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                dict_index = self.glove_model.dictionary[tok]
                vec.append(self.glove_model.word_vectors[dict_index])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(5,))
        
        return np.asarray(vec).flatten()

# 1. LFT

In [17]:
x_tokenized, y_prep = preprocessing(LFT_data)

In [19]:
Entities = np.asarray(LFT_data['entity_name'])
e_tokenized = [[w for w in str(e).split(" ") if w != ""] for e in Entities]

In [20]:
entity_sequencer = Sequencer(all_words = [token for seq in e_tokenized for token in seq],
              max_words = 1200,
              seq_len = 15,
              model = model
             )

In [22]:
e_vecs = np.asarray([entity_sequencer.textToVector(" ".join(seq)) for seq in e_tokenized])

In [24]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=50)
pca_model.fit(e_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9979625196690795


In [25]:
e_comps = pca_model.transform(e_vecs)

In [26]:
x_train,x_test,y_train,y_test = train_test_split(e_comps,y_prep,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(64104, 50)
(16026, 50)
(64104,)
(16026,)


In [27]:
svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [28]:
y_pred = svm_classifier.predict(x_test)

In [29]:
from sklearn.metrics import f1_score

In [30]:
f1_score(y_test, y_pred, average='macro')

  'precision', 'predicted', average, warn_for)


0.06481030370019449

In [31]:
f1_score(y_test, y_pred, average='micro')

0.14389117683763883

In [32]:
f1_score(y_test, y_pred, average='weighted')

0.10286950160921395

# 2. MFT

In [33]:
MFT_data = MFT_data.sample(n=80000)

In [34]:
x_tokenized, y_prep = preprocessing(MFT_data)

In [35]:
Entities = np.asarray(MFT_data['entity_name'])
e_tokenized = [[w for w in str(e).split(" ") if w != ""] for e in Entities]

In [36]:
entity_sequencer = Sequencer(all_words = [token for seq in e_tokenized for token in seq],
              max_words = 1200,
              seq_len = 15,
              model = model
             )

In [37]:
e_vecs = np.asarray([entity_sequencer.textToVector(" ".join(seq)) for seq in e_tokenized])

In [38]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=50)
pca_model.fit(e_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9991309865111957


In [39]:
e_comps = pca_model.transform(e_vecs)

In [40]:
x_train,x_test,y_train,y_test = train_test_split(e_comps,y_prep,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(64000, 50)
(16000, 50)
(64000,)
(16000,)


In [41]:
svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [42]:
y_pred = svm_classifier.predict(x_test)

In [43]:
from sklearn.metrics import f1_score

In [44]:
f1_score(y_test, y_pred, average='macro')

  'precision', 'predicted', average, warn_for)


0.05122359929729358

In [45]:
f1_score(y_test, y_pred, average='micro')

0.37431249999999994

In [46]:
f1_score(y_test, y_pred, average='weighted')

0.24296450411253256

# 3. All data

In [49]:
data = data.sample(n=80000)

In [50]:
x_tokenized, y_prep = preprocessing(data)

In [51]:
Entities = np.asarray(data['entity_name'])
e_tokenized = [[w for w in e.split(" ") if w != ""] for e in Entities]

In [52]:
entity_sequencer = Sequencer(all_words = [token for seq in e_tokenized for token in seq],
              max_words = 2000,
              seq_len = 15,
              model=model
             )

In [53]:
e_vecs = np.asarray([entity_sequencer.textToVector(" ".join(seq)) for seq in e_tokenized])

In [54]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=50)
pca_model.fit(e_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9988945537501375


In [55]:
e_comps = pca_model.transform(e_vecs)

In [56]:
x_train,x_test,y_train,y_test = train_test_split(e_comps,y_prep,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(64000, 50)
(16000, 50)
(64000,)
(16000,)


In [57]:
svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [58]:
y_pred = svm_classifier.predict(x_test)

In [59]:
from sklearn.metrics import f1_score

In [60]:
f1_score(y_test, y_pred, average='macro')

  'precision', 'predicted', average, warn_for)


0.027302066823949867

In [61]:
f1_score(y_test, y_pred, average='micro')

0.36175

In [62]:
f1_score(y_test, y_pred, average='weighted')

0.23436698492290417

## similarity check

In [11]:
model.most_similar('rail')

[('flows', 0.9969630371257711),
 ('small', 0.9962222153819585),
 ('cities', 0.9961364715164609),
 ('arterial', 0.9960630906625995)]

In [5]:
model.most_similar('internet')

[('solely', 0.9988180636450562),
 ('wrestlers', 0.9986891198539983),
 ('pull', 0.998408863728595),
 ('property', 0.9982836630891359)]

In [6]:
model.most_similar('biology')

[('motorsport', 0.9975584029264936),
 ('1402', 0.9964067786941028),
 ('crafts', 0.9955452556627562),
 ('7-3', 0.9952782237897274)]

In [7]:
model.most_similar('transit')

[('caribbean', 0.9978722331539843),
 ('nigmatullin', 0.9965894879510891),
 ('mainland', 0.9964540417420519),
 ('nearby', 0.9952109594485312)]

In [8]:
model.most_similar('broadcast')

[('aground', 0.9985774845738553),
 ('news', 0.9984539884976539),
 ('sixes', 0.9976641848708611),
 ('apiece', 0.9966718402546354)]

In [12]:
model.most_similar('newspaper')

[('privately', 0.9991001772717363),
 ('armed', 0.9990183672981497),
 ('terms', 0.9989140538581939),
 ('practice', 0.9986981883268019)]

In [13]:
model.most_similar('game')

[('annual', 0.9994483113795459),
 ('gr-33', 0.9989416045508036),
 ('fifth', 0.998240383849126),
 ('eighth', 0.9977599075306751)]