In [1]:
#importing libraries

import gensim
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import csv
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the pre-trained word2vec model

from gensim.models.keyedvectors import KeyedVectors

# Download Google's pre-trained model first, using the link below:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# then change the path so that it points at your own copy.

# Raw string: in a normal string literal the backslashes of a Windows path are
# read as escape sequences ("\G" is an invalid one and triggers a warning on
# recent Python versions; a folder starting with "n" or "t" would silently
# turn into a newline or tab).
model_path = r'D:\GoogleNews_vectors_negative300\GoogleNews_vectors_negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)



In [3]:
class DocSim(object):
    """Turn a short document into one vector by averaging its word vectors.

    Parameters
    ----------
    w2v_model
        Object supporting ``w2v_model[word]`` that raises ``KeyError`` for
        words it does not know (in this notebook, a gensim ``KeyedVectors``).
    stopwords
        Optional collection of lower-case words to ignore. ``None`` (the
        default) means that no word is filtered out.
    """

    def __init__(self, w2v_model, stopwords=None):
        self.w2v_model = w2v_model
        # Build a fresh list per instance: a mutable default argument would be
        # shared, so editing one instance's stop words would affect them all.
        self.stopwords = [] if stopwords is None else stopwords

    def vectorize(self, doc):
        """Return the mean of the vectors of all known words in ``doc``.

        The document is lower-cased and split on any run of whitespace; stop
        words and words missing from the model are skipped.

        Returns
        -------
        numpy.ndarray
            The element-wise mean of the word vectors. When no word of the
            document is known, a zero vector of length
            ``w2v_model.vector_size`` is returned instead; if the model does
            not expose ``vector_size`` the result is NaN, as before.
        """
        # split() without an argument also handles tabs, newlines and repeated
        # spaces, which split(" ") left glued to the neighbouring word.
        words = [w for w in doc.lower().split() if w not in self.stopwords]

        word_vecs = []
        for word in words:
            try:
                word_vecs.append(self.w2v_model[word])
            except KeyError:
                # Out-of-vocabulary words contribute nothing to the mean.
                continue

        if not word_vecs:
            # np.mean([]) would emit a RuntimeWarning and return NaN, which
            # turns into garbage once the result is cast to integers.
            size = getattr(self.w2v_model, "vector_size", None)
            if size is None:
                return np.float64("nan")
            return np.zeros(size)

        # The document vector is assumed to be the mean of its word vectors.
        return np.mean(word_vecs, axis=0)

In [6]:
# Vectorise the EA rulebook entry (target rule)

ds = DocSim(w2v_model)

rules = 'Controls - railway workforce (Communication between train drivers and signallers)'
# Accept either one string or a list of strings. Only the wrapping is
# conditional; previously the whole computation sat inside this branch, so a
# list input was silently ignored.
if isinstance(rules, str):
    rules = [rules]

x1 = ds.vectorize(" ".join(rules))
print(x1)                      # mean word vector of the rule text
print(type(x1[0]))             # element type of that vector

# Fixed-point conversion for the hardware testbench: scale by 10**4 and keep
# the integer part. astype(int) truncates toward zero; it does not round.
y = x1 * 10000
main_vec_EA = y.astype(int)
print(main_vec_EA.shape)       # shape of the integer vector
print(main_vec_EA)

[ 0.02339172  0.02010091  0.06585693 -0.03441111  0.09124756 -0.10293579
 -0.07889811 -0.10378011  0.0920817   0.01464844  0.00240072  0.01822917
 -0.07572428 -0.14501953 -0.11566162 -0.00815837 -0.04736328  0.13625081
 -0.04241943 -0.07055664  0.01753744 -0.05456543 -0.03597005  0.09628805
  0.14408366  0.01734416 -0.20500183  0.09184774  0.06363932 -0.03224691
  0.00585938 -0.18469238 -0.1727473  -0.07328796 -0.00028483 -0.06681315
 -0.04833984  0.07373047  0.03151448  0.05377197 -0.17834473 -0.02144369
  0.07307943 -0.04779943 -0.05423991 -0.11789831 -0.03458722  0.04952494
 -0.0493571   0.05451457 -0.04964193  0.06453451  0.03358968  0.04689534
 -0.07995097 -0.03137207 -0.09806315  0.07212321 -0.08229574  0.03216553
 -0.04311116 -0.1008606  -0.16357422 -0.12894695 -0.00292714  0.11335246
 -0.1130778   0.14274089 -0.02119954  0.04886882 -0.09310404  0.10616048
  0.02415975 -0.02252197 -0.06388346 -0.11421839  0.05508423  0.06351725
 -0.13041179 -0.03181966  0.04069011 -0.05126953  0

In [8]:
# Emit the EA vector as "in_array1(i) <= value;" assignment lines that can be
# pasted into the Vivado testbench.

for position in range(len(main_vec_EA)):
    element = main_vec_EA[position]
    print(f"in_array1({position}) <= {element};")

in_array1(0) <= 233;
in_array1(1) <= 201;
in_array1(2) <= 658;
in_array1(3) <= -344;
in_array1(4) <= 912;
in_array1(5) <= -1029;
in_array1(6) <= -788;
in_array1(7) <= -1037;
in_array1(8) <= 920;
in_array1(9) <= 146;
in_array1(10) <= 24;
in_array1(11) <= 182;
in_array1(12) <= -757;
in_array1(13) <= -1450;
in_array1(14) <= -1156;
in_array1(15) <= -81;
in_array1(16) <= -473;
in_array1(17) <= 1362;
in_array1(18) <= -424;
in_array1(19) <= -705;
in_array1(20) <= 175;
in_array1(21) <= -545;
in_array1(22) <= -359;
in_array1(23) <= 962;
in_array1(24) <= 1440;
in_array1(25) <= 173;
in_array1(26) <= -2050;
in_array1(27) <= 918;
in_array1(28) <= 636;
in_array1(29) <= -322;
in_array1(30) <= 58;
in_array1(31) <= -1846;
in_array1(32) <= -1727;
in_array1(33) <= -732;
in_array1(34) <= -2;
in_array1(35) <= -668;
in_array1(36) <= -483;
in_array1(37) <= 737;
in_array1(38) <= 315;
in_array1(39) <= 537;
in_array1(40) <= -1783;
in_array1(41) <= -214;
in_array1(42) <= 730;
in_array1(43) <= -477;
in_array1(44)

In [27]:
# Emit the EA vector as "S1_vec(i) <= value;" assignment lines that can be
# pasted into the Vivado testbench.

for position in range(len(main_vec_EA)):
    element = main_vec_EA[position]
    print(f"S1_vec({position}) <= {element};")

S1_vec(0) <= 233;
S1_vec(1) <= 201;
S1_vec(2) <= 658;
S1_vec(3) <= -344;
S1_vec(4) <= 912;
S1_vec(5) <= -1029;
S1_vec(6) <= -788;
S1_vec(7) <= -1037;
S1_vec(8) <= 920;
S1_vec(9) <= 146;
S1_vec(10) <= 24;
S1_vec(11) <= 182;
S1_vec(12) <= -757;
S1_vec(13) <= -1450;
S1_vec(14) <= -1156;
S1_vec(15) <= -81;
S1_vec(16) <= -473;
S1_vec(17) <= 1362;
S1_vec(18) <= -424;
S1_vec(19) <= -705;
S1_vec(20) <= 175;
S1_vec(21) <= -545;
S1_vec(22) <= -359;
S1_vec(23) <= 962;
S1_vec(24) <= 1440;
S1_vec(25) <= 173;
S1_vec(26) <= -2050;
S1_vec(27) <= 918;
S1_vec(28) <= 636;
S1_vec(29) <= -322;
S1_vec(30) <= 58;
S1_vec(31) <= -1846;
S1_vec(32) <= -1727;
S1_vec(33) <= -732;
S1_vec(34) <= -2;
S1_vec(35) <= -668;
S1_vec(36) <= -483;
S1_vec(37) <= 737;
S1_vec(38) <= 315;
S1_vec(39) <= 537;
S1_vec(40) <= -1783;
S1_vec(41) <= -214;
S1_vec(42) <= 730;
S1_vec(43) <= -477;
S1_vec(44) <= -542;
S1_vec(45) <= -1178;
S1_vec(46) <= -345;
S1_vec(47) <= 495;
S1_vec(48) <= -493;
S1_vec(49) <= 545;
S1_vec(50) <= -496;
S1_vec

In [9]:
# Vectorise the OCD rulebook entry (source rule)

ds = DocSim(w2v_model)

rules = '2.8.6 Control communication between train drivers and signallers'
# Accept either one string or a list of strings. Only the wrapping is
# conditional; previously the whole computation sat inside this branch, so a
# list input was silently ignored.
if isinstance(rules, str):
    rules = [rules]

x2 = ds.vectorize(" ".join(rules))
print(x2)                      # mean word vector of the rule text
print(type(x2[0]))             # element type of that vector

# Fixed-point conversion for the hardware testbench: scale by 10**4 and keep
# the integer part. astype(int) truncates toward zero; it does not round.
y = x2 * 10000
main_vec_OCD = y.astype(int)
print(main_vec_OCD.shape)      # shape of the integer vector
print(main_vec_OCD)

[ 0.05604858 -0.02294922 -0.01313477 -0.04412537 -0.01813965  0.00272217
 -0.0234375  -0.13850097  0.04418945  0.04067383 -0.01396484  0.02370605
 -0.07692871  0.028125   -0.09794922 -0.00380859  0.02832031  0.15495606
 -0.00844727 -0.13242188  0.06622314  0.02446289 -0.01044922  0.09406128
  0.11127929  0.02745361 -0.20635375  0.11296387  0.14433594  0.0017334
  0.01948242 -0.19467774 -0.09289245 -0.08442383 -0.01699219 -0.02753906
 -0.12201538  0.03203125  0.05363769  0.01936035 -0.0809082   0.02504883
  0.01796875 -0.03476562 -0.12164612 -0.07770844  0.01230392  0.08808593
 -0.08364258  0.09375    -0.09819336  0.1203125   0.04121094 -0.00788574
 -0.11762085  0.13081054 -0.03541565  0.04868164 -0.07855225  0.01037598
 -0.04361572 -0.05717773 -0.10166015 -0.01245117 -0.02868652  0.16502686
 -0.0765625   0.27617186 -0.04543457  0.05219727 -0.07129516  0.09025879
  0.09370117 -0.03271484 -0.07988282 -0.0833023   0.03328858  0.05444336
 -0.15367432  0.05776367  0.08320312  0.00566406  0.

In [26]:
# Emit the OCD vector as "S2_vec(i) <= value;" assignment lines that can be
# pasted into the Vivado testbench.

for position in range(len(main_vec_OCD)):
    element = main_vec_OCD[position]
    print(f"S2_vec({position}) <= {element};")

S2_vec(0) <= 560;
S2_vec(1) <= -229;
S2_vec(2) <= -131;
S2_vec(3) <= -441;
S2_vec(4) <= -181;
S2_vec(5) <= 27;
S2_vec(6) <= -234;
S2_vec(7) <= -1385;
S2_vec(8) <= 441;
S2_vec(9) <= 406;
S2_vec(10) <= -139;
S2_vec(11) <= 237;
S2_vec(12) <= -769;
S2_vec(13) <= 281;
S2_vec(14) <= -979;
S2_vec(15) <= -38;
S2_vec(16) <= 283;
S2_vec(17) <= 1549;
S2_vec(18) <= -84;
S2_vec(19) <= -1324;
S2_vec(20) <= 662;
S2_vec(21) <= 244;
S2_vec(22) <= -104;
S2_vec(23) <= 940;
S2_vec(24) <= 1112;
S2_vec(25) <= 274;
S2_vec(26) <= -2063;
S2_vec(27) <= 1129;
S2_vec(28) <= 1443;
S2_vec(29) <= 17;
S2_vec(30) <= 194;
S2_vec(31) <= -1946;
S2_vec(32) <= -928;
S2_vec(33) <= -844;
S2_vec(34) <= -169;
S2_vec(35) <= -275;
S2_vec(36) <= -1220;
S2_vec(37) <= 320;
S2_vec(38) <= 536;
S2_vec(39) <= 193;
S2_vec(40) <= -809;
S2_vec(41) <= 250;
S2_vec(42) <= 179;
S2_vec(43) <= -347;
S2_vec(44) <= -1216;
S2_vec(45) <= -777;
S2_vec(46) <= 123;
S2_vec(47) <= 880;
S2_vec(48) <= -836;
S2_vec(49) <= 937;
S2_vec(50) <= -981;
S2_vec(51

In [21]:
# Emit the OCD vector as "in_array2(i) <= value;" assignment lines that can be
# pasted into the Vivado testbench.

for position in range(len(main_vec_OCD)):
    element = main_vec_OCD[position]
    print(f"in_array2({position}) <= {element};")

in_array2(0) <= 560;
in_array2(1) <= -229;
in_array2(2) <= -131;
in_array2(3) <= -441;
in_array2(4) <= -181;
in_array2(5) <= 27;
in_array2(6) <= -234;
in_array2(7) <= -1385;
in_array2(8) <= 441;
in_array2(9) <= 406;
in_array2(10) <= -139;
in_array2(11) <= 237;
in_array2(12) <= -769;
in_array2(13) <= 281;
in_array2(14) <= -979;
in_array2(15) <= -38;
in_array2(16) <= 283;
in_array2(17) <= 1549;
in_array2(18) <= -84;
in_array2(19) <= -1324;
in_array2(20) <= 662;
in_array2(21) <= 244;
in_array2(22) <= -104;
in_array2(23) <= 940;
in_array2(24) <= 1112;
in_array2(25) <= 274;
in_array2(26) <= -2063;
in_array2(27) <= 1129;
in_array2(28) <= 1443;
in_array2(29) <= 17;
in_array2(30) <= 194;
in_array2(31) <= -1946;
in_array2(32) <= -928;
in_array2(33) <= -844;
in_array2(34) <= -169;
in_array2(35) <= -275;
in_array2(36) <= -1220;
in_array2(37) <= 320;
in_array2(38) <= 536;
in_array2(39) <= 193;
in_array2(40) <= -809;
in_array2(41) <= 250;
in_array2(42) <= 179;
in_array2(43) <= -347;
in_array2(44) <

In [13]:
# Emit the sum-of-products (multiply-accumulate) expression over in_array1 and
# in_array2, one term per line, for pasting into the Vivado testbench.

last_index = len(main_vec_OCD) - 1
for index in range(len(main_vec_OCD)):
    # Every term except the last is followed by "+"; a trailing "+" would
    # leave the pasted expression incomplete.
    suffix = "" if index == last_index else "+"
    print(f"( (in_array1({index}))*(in_array2({index})) ){suffix}")

( (in_array1(0))*(in_array2(0)) )+
( (in_array1(1))*(in_array2(1)) )+
( (in_array1(2))*(in_array2(2)) )+
( (in_array1(3))*(in_array2(3)) )+
( (in_array1(4))*(in_array2(4)) )+
( (in_array1(5))*(in_array2(5)) )+
( (in_array1(6))*(in_array2(6)) )+
( (in_array1(7))*(in_array2(7)) )+
( (in_array1(8))*(in_array2(8)) )+
( (in_array1(9))*(in_array2(9)) )+
( (in_array1(10))*(in_array2(10)) )+
( (in_array1(11))*(in_array2(11)) )+
( (in_array1(12))*(in_array2(12)) )+
( (in_array1(13))*(in_array2(13)) )+
( (in_array1(14))*(in_array2(14)) )+
( (in_array1(15))*(in_array2(15)) )+
( (in_array1(16))*(in_array2(16)) )+
( (in_array1(17))*(in_array2(17)) )+
( (in_array1(18))*(in_array2(18)) )+
( (in_array1(19))*(in_array2(19)) )+
( (in_array1(20))*(in_array2(20)) )+
( (in_array1(21))*(in_array2(21)) )+
( (in_array1(22))*(in_array2(22)) )+
( (in_array1(23))*(in_array2(23)) )+
( (in_array1(24))*(in_array2(24)) )+
( (in_array1(25))*(in_array2(25)) )+
( (in_array1(26))*(in_array2(26)) )+
( (in_array1(27))*(in

In [None]:
    outp_reg <= integer(
                               (in_array1(0) * in_array2(0)) +
                               (in_array1(1) * in_array2(1)) +
                               (in_array1(2) * in_array2(2)) +
                               (in_array1(3) * in_array2(3)) +
                               (in_array1(4) * in_array2(4)) +

In [19]:

# Spot check on the first nine components only: show both slices, then their
# multiply-accumulate (dot product), so the value can be compared by hand with
# a short hardware simulation.
print(main_vec_EA[:9])
print(main_vec_OCD[:9])

MAC1 = np.dot(main_vec_EA[:9], main_vec_OCD[:9])
print("MAC1=", MAC1)

[  233   201   658  -344   912 -1029  -788 -1037   920]
[  560  -229  -131  -441  -181    27  -234 -1385   441]
MAC1= 1983459


In [None]:
# Self-dot check on a nine-element toy vector (sum of squares).
# The toy data gets its own name: assigning it to main_vec_EA / main_vec_OCD
# replaced the real rule vectors, so every later cell that reads them would
# silently work on these nine numbers after a top-to-bottom run.
selfcheck_vec = [560, -229, -131, -441, -181, 27, -234, -1385, 441]

MAC1 = np.dot(selfcheck_vec, selfcheck_vec)
print("MAC1=", MAC1)

In [20]:
# Hand check of the nine-element self-dot product: add up the square of every
# component using plain Python integers.
a = sum(v * v for v in (560, -229, -131, -441, -181, 27, -234, -1385, 441))
print(a)

2778635


In [31]:
import math

# Promote the integer vectors to 64 bits *before* accumulating. Where NumPy's
# default integer is 32 bits wide (e.g. Windows with NumPy 1.x) the dot
# product is accumulated in 32 bits and can wrap around; casting only the
# finished result to int64 cannot undo that.
vec_ea = np.asarray(main_vec_EA, dtype=np.int64)
vec_ocd = np.asarray(main_vec_OCD, dtype=np.int64)

MAC1 = np.dot(vec_ea, vec_ocd)     # EA . OCD
print("MAC1=", MAC1)

MAC2 = np.dot(vec_ea, vec_ea)      # EA . EA
print("MAC2=", MAC2)

MAC3 = np.dot(vec_ocd, vec_ocd)    # OCD . OCD
print("MAC3=", MAC3)

# Square-root-free form: (EA.OCD)^2 / ((EA.EA) * (OCD.OCD)).
MULTI1_OUTP = MAC1 * MAC1
MULTI2_OUTP = MAC2 * MAC3

print("MULTI1_OUTP=", MULTI1_OUTP)
print("MULTI2_OUTP=", MULTI2_OUTP)   # was mislabelled "MULTI1_OUTP="

DIV1_inp = MULTI1_OUTP
DIV2_inp = MULTI2_OUTP

# NOTE: this ratio is the *square* of the cosine similarity.
print("Cosine Result", DIV1_inp / DIV2_inp)

MAC1= 188239286
MAC2= 243225778
MAC3= 222847541
MULTI1_OUTP= 35434028793789796
MULTI1_OUTP= 54202266535111898
Cosine Result 0.6537370309198389


In [23]:
import math

# Promote the integer vectors to 64 bits before accumulating: where NumPy's
# default integer is 32 bits wide (e.g. Windows with NumPy 1.x) the dot
# products below could otherwise wrap around.
vec_ea = np.asarray(main_vec_EA, dtype=np.int64)
vec_ocd = np.asarray(main_vec_OCD, dtype=np.int64)

MAC1 = np.dot(vec_ea, vec_ocd)     # EA . OCD
print("MAC1=", MAC1)

MAC2 = np.dot(vec_ea, vec_ea)      # EA . EA
print("MAC2=", MAC2)

MAC3 = np.dot(vec_ocd, vec_ocd)    # OCD . OCD
print("MAC3=", MAC3)

# Work with magnitudes only. NOTE: taking abs() of MAC1 discards the sign of
# the cosine, so an opposing pair of vectors reports a positive value.
MAC1 = abs(MAC1)
MAC2 = abs(MAC2)
MAC3 = abs(MAC3)

MAC2_sqrt = math.sqrt(MAC2)        # Euclidean length of the EA vector
print("MAC2 after square root=", MAC2_sqrt)

MAC3_sqrt = math.sqrt(MAC3)        # Euclidean length of the OCD vector
print("MAC3 after square root=", MAC3_sqrt)

MULTI1_OUTP = MAC1
MULTI2_OUTP = MAC2_sqrt * MAC3_sqrt

print("MULTI1_OUTP=", MULTI1_OUTP)
print("MULTI2_OUTP=", MULTI2_OUTP)   # was mislabelled "MULTI1_OUTP="

DIV1_inp = MULTI1_OUTP
DIV2_inp = MULTI2_OUTP

print("Cosine Result", DIV1_inp / DIV2_inp)

MAC1= 188239286
MAC2= 243225778
MAC3= 222847541
MAC2 after square root= 15595.697419480797
MAC3 after square root= 14928.078945396826
MULTI1_OUTP= 188239286
MULTI1_OUTP= 232813802.2865309
Cosine Result 0.8085400614192464


In [None]:
# Cosine similarity via np.linalg.norm.
# The dot product is taken on 64-bit copies so that it cannot wrap around
# where NumPy's default integer is only 32 bits wide.
MAC1 = np.dot(np.asarray(main_vec_EA, dtype=np.int64),
              np.asarray(main_vec_OCD, dtype=np.int64))
print("MAC1=", MAC1)

MAC2 = np.linalg.norm(main_vec_EA)     # Euclidean length of the EA vector
print("MAC2=", MAC2)

MAC3 = np.linalg.norm(main_vec_OCD)    # Euclidean length of the OCD vector
print("MAC3=", MAC3)

MULTI1_OUTP = MAC1
MULTI2_OUTP = MAC2 * MAC3

print("MULTI1_OUTP=", MULTI1_OUTP)
print("MULTI2_OUTP=", MULTI2_OUTP)   # was mislabelled "MULTI1_OUTP="

DIV1_inp = MULTI1_OUTP
DIV2_inp = MULTI2_OUTP

print("Cosine Result", DIV1_inp / DIV2_inp)

In [None]:
def cosine_sim(main_vec_EA, main_vec_OCD):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    main_vec_EA, main_vec_OCD
        Array-likes of equal length.

    Returns
    -------
    The dot product divided by the product of the Euclidean norms, or ``0``
    when that value is undefined (a zero-length vector or NaN input).
    """
    # Compute in float64: an integer dot product can wrap around when the
    # inputs are 32-bit integer arrays.
    vec_a = np.asarray(main_vec_EA, dtype=np.float64)
    vec_b = np.asarray(main_vec_OCD, dtype=np.float64)

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # Avoid 0/0: it only yields NaN plus a RuntimeWarning.
        return 0

    csim = np.dot(vec_a, vec_b) / denominator
    if np.isnan(np.sum(csim)):
        return 0
    return csim

cosine_sim(main_vec_EA, main_vec_OCD)

In [None]:
import numpy as np

# Small demonstration of np.linalg.norm on a plain Python list: show the
# input first, then its norm.
a = [7, 4, 1, 1]

for shown in (a, np.linalg.norm(a)):
    print(shown)

In [None]:
# EA . OCD, divided by 10**6 to keep the later products small.
MAC1 = np.dot(main_vec_EA, main_vec_OCD) / 1000000
print("MAC1", MAC1)

In [None]:
# EA . EA (squared length of the EA vector), divided by 10**6.
MAC2 = np.dot(main_vec_EA, main_vec_EA) / 1000000
print("MAC2", MAC2)

In [None]:
# OCD . OCD (squared length of the OCD vector), divided by 10**6.
MAC3 = np.dot(main_vec_OCD, main_vec_OCD) / 1000000
print("MAC3", MAC3)

In [None]:
# Numerator and denominator of the square-root-free similarity:
# (EA.OCD)^2 and (EA.EA) * (OCD.OCD).
MULTI1_OUTP = MAC1 * MAC1
MULTI2_OUTP = MAC2 * MAC3

print("MULTI1_OUTP", MULTI1_OUTP)
print("MULTI2_OUTP", MULTI2_OUTP)   # was mislabelled "MULTI1_OUTP"

In [None]:
# Divider inputs: DIV1_inp is the numerator (MAC1 squared) and DIV2_inp the
# denominator (MAC2 * MAC3).
DIV1_inp = MULTI1_OUTP
DIV2_inp = MULTI2_OUTP

# Numerator over denominator, as in the other cells of this notebook; the
# operands were swapped here, which printed the reciprocal.
# NOTE: the ratio is the *square* of the cosine similarity.
print("Cosine Result", DIV1_inp / DIV2_inp)

In [None]:
def cosine_sim(main_vec_EA, main_vec_OCD):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    main_vec_EA, main_vec_OCD
        Array-likes of equal length.

    Returns
    -------
    The dot product divided by the product of the Euclidean norms, or ``0``
    when that value is undefined (a zero-length vector or NaN input).
    """
    # Compute in float64: an integer dot product can wrap around when the
    # inputs are 32-bit integer arrays.
    vec_a = np.asarray(main_vec_EA, dtype=np.float64)
    vec_b = np.asarray(main_vec_OCD, dtype=np.float64)

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        # Avoid 0/0: it only yields NaN plus a RuntimeWarning.
        return 0

    csim = np.dot(vec_a, vec_b) / denominator
    if np.isnan(np.sum(csim)):
        return 0
    return csim

cosine_sim(main_vec_EA, main_vec_OCD)

In [None]:
# from scipy.linalg import norm
# x1=[1.1,2,3,4]
# x2=[-2,5,-3,7]

# # 1+2+3+4=10
# # # 2+5+3+7=17

# def jaccard_similarity(x1,x2):
#     vectors_minimum=np.minimum(x1,x2)
#     vectors_maximum=np.maximum(x1,x2)
#     jaccard_sim=norm(vectors_minimum)/norm(vectors_maximum)
#     return jaccard_sim

# jaccard_similarity(x1,x2)

In [None]:

# x1=[1.1,2,3,4]
# x2=[-2,5,-3,7]

# # # 1+2+3+4=10
# # # 2+5+3+7=17

# def jaccard_similarity(vecA, vecB):
#     vectors_minimum=np.minimum(vecA,vecB)
#     vectors_maximum=np.maximum(vecA,vecB)
#     jaccard_sim=np.linalg.norm(vectors_minimum)/np.linalg.norm(vectors_maximum)
#     return jaccard_sim

# jaccard_similarity(x1,x2)

In [None]:
#  x1=[4,2,1,1]
#  x2=[7,2,1,1]

# def jaccard_sim(x1, x2):
#         """Find the Jaccard similarity between two vectors."""
#         num=np.dot(x1,x2)
#         denum1=((np.linalg.norm(x1))**2)+((np.linalg.norm(x2))**2)
#         denum2=np.dot(x1,x2)
#         jsim=num/(denum1-denum2)
#         if np.isnan(np.sum(jsim)):                   #Test element-wise for NaN and return result as a boolean array.
#             return 0
#         return jsim 


# jaccard_sim(x1,x2)

In [None]:
main_vec_EA=[4,2,1,1,35]
main_vec_OCD=[7,6,2,9,49]
def _cosine_sim(main_vec_EA, main_vec_OCD):
        """Find the cosine similarity distance between two vectors."""
        csim=(( np.dot(main_vec_EA,main_vec_OCD) )**2)/((np.linalg.norm(main_vec_EA)*np.linalg.norm(main_vec_OCD))**2)
        if np.isnan(np.sum(csim)):                   #Test element-wise for NaN and return result as a boolean array.
            return 0
        return csim

_cosine_sim(main_vec_EA, main_vec_OCD)

In [None]:
import numpy as np

# Five-element toy vectors for checking the multiply-accumulate stages by hand.
main_vec_EA = [4, 2, 1, 1, 35]
main_vec_OCD = [7, 6, 2, 9, 49]

# The three dot products the similarity is built from.
MAC1 = np.dot(main_vec_EA, main_vec_OCD)    # EA . OCD
MAC2 = np.dot(main_vec_EA, main_vec_EA)     # EA . EA
MAC3 = np.dot(main_vec_OCD, main_vec_OCD)   # OCD . OCD

print("MAC1=", MAC1)
print("MAC2=", MAC2)
print("MAC3=", MAC3)



In [None]:
# Multiplier stage: numerator (EA.OCD)^2 and denominator (EA.EA) * (OCD.OCD).
MULTI1_OUTP = MAC1 * MAC1
MULTI2_OUTP = MAC2 * MAC3
print("Multiplier_outp1=", MULTI1_OUTP)
print("Multiplier_outp2=", MULTI2_OUTP)

# Divider stage: feed both products to the division.
# NOTE: the quotient is the *square* of the cosine similarity.
DIV1_inp, DIV2_inp = MULTI1_OUTP, MULTI2_OUTP
Cosine = DIV1_inp / DIV2_inp
print("Cosine Result", Cosine)