In [34]:
import os
import re
import csv
import copy
import pickle
import pandas as pd
from itertools import combinations

In [2]:
# Names of the three fields found in every (action, label, data) triplet.
entities = ["action", "label", "data"]

In [3]:
# Groups of label phrases that are treated as interchangeable.
# NOTE(review): "next"/"previous" and "complete"/"ok" are not strict
# synonyms — confirm that swapping them is intended.
label_syn = [
    ("username", "user name"),
    ("next", "previous"),
    ("complete", "ok"),
    ("user id", "userid"),
]

In [4]:
# Groups of action phrases that are treated as interchangeable.
# NOTE(review): "deselect" shares a group with "select"/"click", and "on"
# with "navigate"/"go to" — confirm these groupings are intended.
action_syn = [
    ("select", "click", "deselect"),
    ("verify", "validate"),
    ("clicking", "selecting"),
    ("navigate", "go to", "on"),
    ("login", "log in", "enter"),
    ("logout", "log out"),
]

In [5]:
# Lookup from each action phrase to the other phrases in its synonym group.
action_dict = {}

for group in action_syn:
    for first, second in combinations(group, 2):
        # Register the pair in both directions.
        for phrase, synonym in ((first, second), (second, first)):
            action_dict.setdefault(phrase, []).append(synonym)

In [6]:
action_dict

{'select': ['click', 'deselect'],
 'click': ['select', 'deselect'],
 'deselect': ['select', 'click'],
 'verify': ['validate'],
 'validate': ['verify'],
 'clicking': ['selecting'],
 'selecting': ['clicking'],
 'navigate': ['go to', 'on'],
 'go to': ['navigate', 'on'],
 'on': ['navigate', 'go to'],
 'login': ['log in', 'enter'],
 'log in': ['login', 'enter'],
 'enter': ['login', 'log in'],
 'logout': ['log out'],
 'log out': ['logout']}

In [7]:
# Lookup from each label phrase to the other phrases in its synonym group.
label_dict = {}

for group in label_syn:
    for first, second in combinations(group, 2):
        # Register the pair in both directions.
        for phrase, synonym in ((first, second), (second, first)):
            label_dict.setdefault(phrase, []).append(synonym)

In [8]:
label_dict

{'username': ['user name'],
 'user name': ['username'],
 'next': ['previous'],
 'previous': ['next'],
 'complete': ['ok'],
 'ok': ['complete'],
 'user id': ['userid'],
 'userid': ['user id']}

In [9]:
# Load the raw examples from data.pkl into data_list.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only load data.pkl when it comes from a trusted source.
with open("data.pkl", "rb") as pkl_file:
    data_list = pickle.load(pkl_file)

In [10]:
data_list[:2]

[['Select Asset Class as "CC" and select Deal Type as "TRADE CC" ',
  ('select', 'Asset Class', 'CC'),
  ('select', 'Deal Type', 'TRADE CC')],
 ['Select the following for the Equipment Information section:\na. - Site\nb. - Legal Product Category',
  ('select', 'site', ''),
  ('select', 'Legal Product Category', '')]]

In [11]:
# Module-level accumulator that preprocess_data() appends cleaned examples to.
preprocess_list = []

def clean_data(text):
    """Pad punctuation with spaces and collapse repeated spaces.

    Each of the characters ``! ? , ' " . - : * /`` and the newline character
    is surrounded by single spaces so it is separated from neighbouring
    words. Runs of spaces are then squeezed to one and the ends are stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Raw string on purpose: the earlier non-raw literal depended on invalid
    # string escapes (backslash before - : * /), which newer Python versions
    # report as a SyntaxWarning. The set of matched characters is unchanged.
    text = re.sub(r'''([!?,'".\n\-:*/])''', r' \1 ', text)
    text = re.sub(' +', ' ', text)
    return text.strip()

def preprocess_data(data_list):
    """Clean every example and append the result to ``preprocess_list``.

    Each entry of ``data_list`` is indexed as ``[text, triplet, ...]`` where a
    triplet unpacks to ``(action, label, data)``. The text is always passed
    through ``clean_data``; triplet fields are cleaned only when truthy, so
    empty fields are carried over unchanged.

    Args:
        data_list: Sequence of examples in the layout described above.

    Returns:
        None. Results are appended to the module-level ``preprocess_list``.
    """
    def _clean_if_present(value):
        # Falsy fields (e.g. '') are kept exactly as they are.
        return clean_data(value) if value else value

    for entry in data_list:
        raw_text, raw_triplets = entry[0], entry[1:]
        cleaned_entry = [clean_data(raw_text)]
        for action, label, data in raw_triplets:
            cleaned_entry.append(
                (_clean_if_present(action), _clean_if_present(label), _clean_if_present(data))
            )
        preprocess_list.append(cleaned_entry)
        
# Fill preprocess_list with cleaned copies of the loaded examples.
preprocess_data(data_list)

In [12]:
# Manual check of clean_data on a sample sentence (note the ':' and the
# quotes, which should each end up surrounded by spaces).
text = """Select Asset Class as "CC" and: select Deal Type as "TRADE CC" test122 """

clean_data(text)

'Select Asset Class as " CC " and : select Deal Type as " TRADE CC " test122'

In [13]:
# Module-level accumulator that augment_data() appends examples to.
augmented_list = []

def augment_data(data_list, action_synonyms=None, label_synonyms=None, output=None):
    """Expand each example with synonym-substituted copies.

    For every example ``[text, (action, label, data), ...]`` the original
    item is appended unchanged. Then, for each distinct action (and after
    that each distinct label) that occurs in the lower-cased text as a whole
    phrase, one extra example is appended per synonym: the phrase is replaced
    in the lower-cased text and in every triplet whose field equals it
    (case-insensitively).

    Phrases are matched on word boundaries, so an action such as "select" no
    longer rewrites part of "selection", and "on" no longer rewrites part of
    "button". Distinct phrases are processed in first-seen order, so the
    output order is reproducible between runs.

    Args:
        data_list: Sequence of examples in the layout described above.
        action_synonyms: Mapping of action phrase -> list of synonyms.
            Defaults to the module-level ``action_dict``.
        label_synonyms: Mapping of label phrase -> list of synonyms.
            Defaults to the module-level ``label_dict``.
        output: List to append results to. Defaults to the module-level
            ``augmented_list``.

    Returns:
        The list the examples were appended to.
    """
    if action_synonyms is None:
        action_synonyms = action_dict
    if label_synonyms is None:
        label_synonyms = label_dict
    if output is None:
        output = augmented_list

    def _whole_phrase_pattern(phrase):
        # Lookarounds rather than \b so that phrases beginning or ending with
        # punctuation can still match.
        return re.compile(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)')

    def _variants(item, position, synonyms):
        # Yield one substituted copy of `item` per (phrase, synonym) pair for
        # the triplet field at `position` (0 = action, 1 = label).
        lowered_text, triplets = item[0].lower(), item[1:]
        # dict.fromkeys de-duplicates while preserving first-seen order.
        phrases = dict.fromkeys(
            triplet[position].lower() for triplet in triplets if triplet[position]
        )
        for phrase in phrases:
            pattern = _whole_phrase_pattern(phrase)
            if not pattern.search(lowered_text):
                continue
            for replacement in synonyms.get(phrase, []):
                # A function replacement keeps backslashes in the synonym literal.
                new_text = pattern.sub(lambda _match, _r=replacement: _r, lowered_text)
                new_triplets = []
                for triplet in triplets:
                    field = triplet[position]
                    if field and field.lower() == phrase:
                        new_triplets.append(tuple(
                            replacement if index == position else value
                            for index, value in enumerate(triplet)
                        ))
                    else:
                        new_triplets.append(tuple(triplet))
                yield [new_text, *new_triplets]

    for item in data_list:
        output.append(item)
        output.extend(_variants(item, 0, action_synonyms))
        output.extend(_variants(item, 1, label_synonyms))
    return output
        
# Populate augmented_list from the cleaned examples.
augment_data(preprocess_list)

In [14]:
# Number of examples after augmentation.
# NOTE(review): pprint is not used in the cells visible here, and imports
# normally belong in the first cell of the notebook.
from pprint import pprint
len(augmented_list)

434

In [15]:
augmented_list

[['Select Asset Class as " CC " and select Deal Type as " TRADE CC "',
  ('select', 'Asset Class', 'CC'),
  ('select', 'Deal Type', 'TRADE CC')],
 ['click asset class as " cc " and click deal type as " trade cc "',
  ('click', 'Asset Class', 'CC'),
  ('click', 'Deal Type', 'TRADE CC')],
 ['deselect asset class as " cc " and deselect deal type as " trade cc "',
  ('deselect', 'Asset Class', 'CC'),
  ('deselect', 'Deal Type', 'TRADE CC')],
 ['Select the following for the Equipment Information section : \n a . - Site \n b . - Legal Product Category',
  ('select', 'site', ''),
  ('select', 'Legal Product Category', '')],
 ['click the following for the equipment information section : \n a . - site \n b . - legal product category',
  ('click', 'site', ''),
  ('click', 'Legal Product Category', '')],
 ['deselect the following for the equipment information section : \n a . - site \n b . - legal product category',
  ('deselect', 'site', ''),
  ('deselect', 'Legal Product Category', '')],
 ["Go 

In [16]:
# def check_presense(text, item_list):
#     if text in item_list: return True
#     for item in item_list:
#         if text in item.split(" "): return True
#     return False

# def prepare_data_for_training(data_list):
#     for item in data_list:
#         tag_list = []
#         text, triplets = item[0], item[1:]
#         token_list = text.lower().split(" ")
#         tag_list = ["O" for i in range(len(token_list))]
#         unique_actions, unique_labels, unique_data = set(), set(), set()
        
#         for action, label, data in triplets:
#             if action: unique_actions.add(action.lower())
#             if label: unique_labels.add(label.lower())
#             if data: unique_data.add(data.lower())
                
#         for i in range(len(token_list)):
#             val = token_list[i]
#             #if val.lower() in unique_actions: tag_list[i] = "action"
#             #if val.lower() in unique_labels: tag_list[i] = "label"
#             #if val.lower() in unique_data: tag_list[i] = "data"
#             if check_presense(val.lower(), unique_actions):tag_list[i] = "action"
#             if check_presense(val.lower(), unique_labels):tag_list[i] = "label"
#             if check_presense(val.lower(), unique_data):tag_list[i] = "data"
#         processed_data.append((token_list, tag_list))
        
# prepare_data_for_training(preprocess_list)

In [24]:
def prepare_char_index_map(string):
    """Map token start offsets to token indices for a space-split string.

    Offset 0 maps to index 0; after that, the character position right after
    each space maps to the next index. This mirrors ``string.split(" ")``:
    every single space starts a new (possibly empty) token.

    Args:
        string: The text to index.

    Returns:
        dict mapping character offset -> token index.
    """
    token_starts = {0: 0}
    token_index = 0
    for offset, character in enumerate(string):
        if character != " ":
            continue
        token_index += 1
        token_starts[offset + 1] = token_index
    return token_starts

# Running count of multi-word values that could not be located in their text.
errors = 0


def populate_tag_list(char_index_map, text, token_list, tag_list, val, label):
    """Tag every occurrence of ``val`` in ``token_list`` with ``label``.

    ``val`` is split on single spaces and compared against consecutive tokens,
    so only matches aligned with token boundaries are tagged. Every occurrence
    is tagged, for single-word and multi-word values alike, and ``tag_list``
    is only touched when the whole phrase matches. The earlier character-offset
    lookup tagged just the first hit and could leave a partial tag behind when
    that hit was only a prefix match, e.g. "log in" inside "log into".

    Args:
        char_index_map: Unused; kept so existing call sites keep working.
        text: Unused; kept so existing call sites keep working.
        token_list: Tokens of the example text.
        tag_list: Tags parallel to ``token_list``; modified in place.
        val: Phrase to look for, with words separated by single spaces.
        label: Tag to write at each matching position.

    Returns:
        Number of occurrences that were tagged.

    Side effects:
        When a multi-word ``val`` is not found, the module-level ``errors``
        counter is incremented and a diagnostic is printed. Single-word
        misses stay silent, as before.
    """
    global errors

    words = val.split(" ")
    width = len(words)
    tagged = 0
    for start in range(len(token_list) - width + 1):
        if token_list[start:start + width] != words:
            continue
        # Index assignment (not slice assignment) so a tag_list shorter than
        # token_list raises instead of silently growing.
        for offset in range(width):
            tag_list[start + offset] = label
        tagged += 1

    if tagged == 0 and width > 1:
        errors += 1
        print("could not locate", words, "in", token_list)
        print("=-------------", errors)
    return tagged

def prepare_data_for_training(data_list):
    """Turn examples into parallel (token_list, tag_list) pairs.

    Each example ``[text, (action, label, data), ...]`` is lower-cased and
    split on single spaces. Every token starts with the tag "O"; the distinct
    non-empty actions, then labels, then data values are handed to
    ``populate_tag_list`` with the tags "action", "label" and "data"
    respectively, so later categories overwrite earlier ones on shared tokens.

    Args:
        data_list: Sequence of examples in the layout described above.

    Returns:
        List of ``(token_list, tag_list)`` tuples, one per example.
    """
    processed_data = []
    for item in data_list:
        lowered = item[0].lower()
        token_list = lowered.split(" ")
        char_index_map = prepare_char_index_map(lowered)
        tag_list = ["O"] * len(token_list)

        # Distinct lower-cased values per tag, in the order they are applied.
        values_by_tag = {"action": set(), "label": set(), "data": set()}
        for action, label, data in item[1:]:
            for tag, value in (("action", action), ("label", label), ("data", data)):
                if value:
                    values_by_tag[tag].add(value.lower())

        for tag, values in values_by_tag.items():
            for value in values:
                populate_tag_list(char_index_map, lowered, token_list, tag_list, value, tag)

        processed_data.append((token_list, tag_list))

    return processed_data
        

# Convert every augmented example into parallel (token_list, tag_list) sequences.
processed_data = prepare_data_for_training(augmented_list)

In [23]:
# Spot-check prepare_data_for_training on a single hand-written example that
# contains newline tokens and a multi-word label.
l1 = [['click the following for the equipment information section : \n a . - site \n b . - legal product category', ('click', 'site', ''), ('click', 'Legal Product Category', '')]]

print(l1)

l2 = prepare_data_for_training(l1)

print(l2)

[['click the following for the equipment information section : \n a . - site \n b . - legal product category', ('click', 'site', ''), ('click', 'Legal Product Category', '')]]
[(['click', 'the', 'following', 'for', 'the', 'equipment', 'information', 'section', ':', '\n', 'a', '.', '-', 'site', '\n', 'b', '.', '-', 'legal', 'product', 'category'], ['action', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'label', 'O', 'O', 'O', 'O', 'label', 'label', 'label'])]


In [25]:
# Print the first 51 examples next to their (token, tag) pairs for a visual check.
for index, (tokens, tags) in enumerate(processed_data[:51]):
    print(augmented_list[index])
    print(list(zip(tokens, tags)))
    print("\n")

['Select Asset Class as " CC " and select Deal Type as " TRADE CC "', ('select', 'Asset Class', 'CC'), ('select', 'Deal Type', 'TRADE CC')]
[('select', 'action'), ('asset', 'label'), ('class', 'label'), ('as', 'O'), ('"', 'O'), ('cc', 'data'), ('"', 'O'), ('and', 'O'), ('select', 'action'), ('deal', 'label'), ('type', 'label'), ('as', 'O'), ('"', 'O'), ('trade', 'data'), ('cc', 'data'), ('"', 'O')]


['click asset class as " cc " and click deal type as " trade cc "', ('click', 'Asset Class', 'CC'), ('click', 'Deal Type', 'TRADE CC')]
[('click', 'action'), ('asset', 'label'), ('class', 'label'), ('as', 'O'), ('"', 'O'), ('cc', 'data'), ('"', 'O'), ('and', 'O'), ('click', 'action'), ('deal', 'label'), ('type', 'label'), ('as', 'O'), ('"', 'O'), ('trade', 'data'), ('cc', 'data'), ('"', 'O')]


['deselect asset class as " cc " and deselect deal type as " trade cc "', ('deselect', 'Asset Class', 'CC'), ('deselect', 'Deal Type', 'TRADE CC')]
[('deselect', 'action'), ('asset', 'label'), ('cla

In [28]:
len(processed_data)

434

In [26]:
def sanity(data_list):
    """Check that tokens and tags line up and return the longest sequence length.

    Args:
        data_list: Iterable of ``(token_list, tag_list)`` pairs.

    Returns:
        The largest token count among the pairs, or 0 when ``data_list`` is
        empty.

    Raises:
        ValueError: If a pair's token list and tag list differ in length. The
            message names the offending position and both lengths.
    """
    max_len = 0
    for position, (token_list, tag_list) in enumerate(data_list):
        token_count, tag_count = len(token_list), len(tag_list)
        if token_count != tag_count:
            raise ValueError(
                "item %d has %d tokens but %d tags" % (position, token_count, tag_count)
            )
        max_len = max(max_len, token_count)

    return max_len
            
# Validate token/tag alignment and record the longest sequence length.
max_len = sanity(processed_data)

In [27]:
max_len

102

In [29]:
# Column headers 0 .. max_len - 1, one per token position.
headers = list(range(max_len))

In [31]:
# Output directory for the generated CSV files.
# NOTE(review): hard-coded, machine-specific absolute path — consider a
# configurable path relative to the project instead.
path = r"C:\Users\anu10961\Work\POC\submission\autonomiq\data"

In [36]:
# Write the header row followed by one row of tags per example.
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it every row is followed by a blank line on Windows.
with open(os.path.join(path, 'tags.csv'), mode='w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(headers)
    csv_writer.writerows(item[1] for item in processed_data)

In [37]:
# Write the header row followed by one row of tokens per example.
# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it every row is followed by a blank line on Windows, and
# tokens that are themselves newline characters are not written faithfully.
with open(os.path.join(path, 'tokens.csv'), mode='w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(headers)
    csv_writer.writerows(item[0] for item in processed_data)