In [1]:
import time 
start=time.time()

In [12]:
#importing libraries

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
import csv
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#Loading pre=trained word2vec model

from gensim.models.keyedvectors import KeyedVectors

# You need to dowload google pre-trained model using below link
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
#Change the path according to your directory

model_path = 'D:\GoogleNews_vectors_negative300\GoogleNews_vectors_negative300.bin'   
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [59]:
#Setting Parameters for model

class DocSim(object):
    def __init__(self, w2v_model , stopwords=[]):
        self.w2v_model = w2v_model
        self.stopwords = stopwords

    def vectorize(self, doc):
        """Identify the vector values for each word in the given document"""
        doc = doc.lower()
        words = [w for w in doc.split(" ") if w not in self.stopwords]
        word_vecs = []
        for word in words:
            try:
                vec = self.w2v_model[word]
                word_vecs.append(vec)
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass

        # Assuming that document vector is the mean of all the word vectors

        vector = np.mean(word_vecs, axis=0)
        return vector
    
    def jaccard_similarity(self,vecA, vecB):
        vectors_minimum=np.minimum(vecA,vecB)
        vectors_maximum=np.maximum(vecA,vecB)
        jaccard_sim=np.linalg.norm(vectors_minimum)/np.linalg.norm(vectors_maximum)
        return jaccard_sim


    def calculate_similarity(self, withdigits_source_rules, withdigits_target_rules=[], threshold=0.8):
        """Calculates & returns similarity scores between given source rules & all
        the target rules"""
        if isinstance(withdigits_target_rules, str):
            withdigits_target_rules = [withdigits_target_rules]


        source_vec = self.vectorize(withdigits_source_rules)
        results = []
        
        for rule in withdigits_target_rules:
            target_vec = self.vectorize(rule)
            sim_score = self.jaccard_similarity(source_vec, target_vec)
            if sim_score > threshold:
                results.append({
                    'Cosine Sim Score':sim_score,
                    'Target Rule':rule
                })

                
            # Sort results by score in desc order
            results.sort(key=lambda k : k['Cosine Sim Score'] , reverse=True)

        return results

In [60]:
ds = DocSim(object)

In [61]:
# Importing Source rules or OCD rulebook CSV file

with open ('Source rules or OCD rulebook.csv') as file_object:
    for EAline in file_object:
        print(EAline)

2.1.3 Controls – Operations un- der degraded conditions

2.1.1.4 - Written operating procedures or spoken instructions when the signalling system cannot be used to preserve the effectiveness of the space interval

2.3.2 Controls – Level Crossings

2.6.1.4 The Route Availability (RA) system provides a consistent and simple method for assessing the compatibility of the weight of rail vehicles with the capacity of underline bridges (other than bridges defined as ‘long span’).

2.2.1.1 The safety benefits of a system for maintaining space intervals between trains (see section 2.1 of this operational concept document) are compromised if a train proceeds without an authority for its movement.

2.8.6 Controls – railway workforce (Communication between train drivers and signallers)



In [62]:
#Sentences tokenization of Source rules or OCD rulebook CSV file

source_rules = []
with open ('Source rules or OCD rulebook.csv') as file_object:
    for trainline in file_object:
        tokens_train = sent_tokenize(trainline)
        source_rules.extend(tokens_train)

        
# Searching for text that includes any digits or decimal number. "\d" is looking for digits in tokens


withdigits_source_rules=[s for s in source_rules if re.findall("^\d",s)]
print(withdigits_source_rules)

['2.1.3 Controls – Operations un- der degraded conditions', '2.1.1.4 - Written operating procedures or spoken instructions when the signalling system cannot be used to preserve the effectiveness of the space interval', '2.3.2 Controls – Level Crossings', '2.6.1.4 The Route Availability (RA) system provides a consistent and simple method for assessing the compatibility of the weight of rail vehicles with the capacity of underline bridges (other than bridges defined as ‘long span’).', '2.2.1.1 The safety benefits of a system for maintaining space intervals between trains (see section 2.1 of this operational concept document) are compromised if a train proceeds without an authority for its movement.', '2.8.6 Controls – railway workforce (Communication between train drivers and signallers)']


In [63]:
# Importing Target Rules or EA rulebook CSV file

with open ('Target Rules or EA rulebook.csv') as file_object:
    for OCDline in file_object:
        print(OCDline)

2.1.3 Controls – Operations un- der degraded conditions

2.1.1.4 Operating procedures using written or spoken instructions are applied when the signalling system cannot be used to preserve the effectiveness of the space interval:

2.3.2 Controls – Level Crossings

2.6.1.4 Route availability system provides consistent and simple method to assess compatibility of train weight with underlying bridges

2.2.1.1 Safety is compromised if a train proceeds without a movement autority

2.8.6 Control communication between train drivers and signallers



In [64]:
#Sentences tokenization of Target Rules or EA rulebook CSV file

target_rules = []
with open ('Target Rules or EA rulebook.csv') as file_object:
    for trainline in file_object:
        tokens_train = sent_tokenize(trainline)
        target_rules.extend(tokens_train)

    
# Searching for text that includes any digits or decimal number. "\d" is looking for digits in tokens

withdigits_target_rules=[s for s in target_rules if re.findall("^\d",s)]
print(withdigits_target_rules)

['2.1.3 Controls – Operations un- der degraded conditions', '2.1.1.4 Operating procedures using written or spoken instructions are applied when the signalling system cannot be used to preserve the effectiveness of the space interval:', '2.3.2 Controls – Level Crossings', '2.6.1.4 Route availability system provides consistent and simple method to assess compatibility of train weight with underlying bridges', '2.2.1.1 Safety is compromised if a train proceeds without a movement autority', '2.8.6 Control communication between train drivers and signallers']


In [65]:
#Calculate the similarity score between a source rule & a target rule.


if isinstance(withdigits_source_rules, str):
    withdigits_source_rules = [withdigits_source_rules]
   

# This will return one target rules text with a similarity score

for rule in withdigits_source_rules:
    sim_scores= ds.calculate_similarity(rule, withdigits_target_rules)

    
    
    # Source Rules are OCD Rules
    # Target Rules are EA Rules
    
    # Printing the output in text file
    
    print("Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores) , file=open("output.txt", "a"))
    print("\n")
    
    
    # Printing output in Jupyter
    
    print("Source rule: {} \n\nSimilarity with Target Rule is \n\n {}\n".format(rule, sim_scores) )
    print("\n")

TypeError: 'type' object is not subscriptable

In [11]:
done=time.time()
elapsed=done-start
print(elapsed)

70.65496945381165


In [2]:
from numba import jit, cuda
import numpy as np
vecA=[4,2,1,1]
vecB=[7,8,1,1]


# def jaccard_similarity(vecA, vecB):
#     vectors_minimum=np.minimum(vecA,vecB)
#     vectors_maximum=np.maximum(vecA,vecB)
#     jaccard_sim=np.linalg.norm(vectors_minimum)/np.linalg.norm(vectors_maximum)
#     return jaccard_sim

@jit(target ="cuda")
def jaccard_sim(self, vecA, vecB):
        """Find the cosine similarity distance between two vectors."""
        jsim=(( np.dot(vecA,vecB) ))/((np.linalg.norm(vecA)+np.linalg.norm(vecB)))-(np.dot(vecA,vecB))
        if np.isnan(np.sum(jsim)):                   #Test element-wise for NaN and return result as a boolean array.
            return 0
        return jsim    


jaccard_similarity(vecA,vecB)

start = timer()
jaccard_sim()
print("with GPU:", timer()-start)    



TypeError: __init__() got an unexpected keyword argument 'locals'

In [3]:
from numba import jit, cuda
import numpy as np
# to measure exec time
from timeit import default_timer as timer   
  
# normal function to run on cpu
def func(a):                                
    for i in range(10000000):
        a[i]+= 1      
  
# function optimized to run on gpu 
@jit(target ="cuda")                         
def func2(a):
    for i in range(10000000):
        a[i]+= 1
if __name__=="__main__":
    n = 10000000                            
    a = np.ones(n, dtype = np.float64)
    b = np.ones(n, dtype = np.float32)
      
    start = timer()
    func(a)
    print("without GPU:", timer()-start)    
      
    start = timer()
    func2(a)
    print("with GPU:", timer()-start)

TypeError: __init__() got an unexpected keyword argument 'locals'