# Bash Commands

In [2]:
!pip install nltk



In [3]:
!pip install -U sentence_transformers



# Prerequisites

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Corpora/tokenizer data needed by embedder.remove_stopwords:
#   - 'stopwords' backs stopwords.words('english')
#   - 'punkt' / 'punkt_tab' back word_tokenize (recent NLTK releases look up
#     'punkt_tab'; older ones look up 'punkt', so fetch both because nltk is
#     installed unpinned above).
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hiteshchoudhary2109/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hiteshchoudhary2109/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy

In [6]:
from sentence_transformers import SentenceTransformer, util
# Shared sentence-embedding model; it is handed to `embedder(...)` in the Practice section.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [10]:
!python -m spacy download en_core_web_sm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 3.7.0
    Uninstalling en-core-web-sm-3.7.0:
      Successfully uninstalled en-core-web-sm-3.7.0
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Class

In [11]:
class embedder:
    """Scores natural-language queries against tool-argument descriptions.

    The tool dictionary is loaded from a JSON file whose top-level keys are
    tool names. Each value is read for a "Description" entry and an
    "ArgumentName" mapping of ``argument name -> {"Description": ...}``.

    Intended call order: ``extract()`` -> ``encode()`` -> ``check()`` or
    ``check_without_stopwords()``.
    """

    def __init__(self, address, model):
        """Load the tool dictionary and keep a reference to the embedding model.

        Args:
            address: Path of the JSON file holding the tool dictionary.
            model: Object exposing ``encode(text)``; the notebook passes a
                ``SentenceTransformer`` instance.
        """
        # The context manager closes the file even when json.load raises.
        with open(address) as file:
            self.data = json.load(file)
        self.model = model

    def embed_sentences(self, strings):
        """Encode every string with ``self.model`` and stack the results.

        Args:
            strings: Iterable of texts to embed.

        Returns:
            ``np.ndarray`` built from one float32 vector per input string.
        """
        # Use the instance's model rather than the notebook-global `model`, so
        # the class works with whichever model it was constructed with.
        return np.array(
            [np.array(self.model.encode(string), dtype=np.float32) for string in strings]
        )

    def extract(self):
        """Flatten ``self.data`` into parallel lists of names and descriptions.

        Populates ``tools``, ``tool_descriptions``, ``arg_names``,
        ``arg_descriptions`` and ``arg_to_tool_map``. An argument whose
        description is the placeholder ``"_"`` is recorded under the tool's own
        name and description instead of the argument's.
        """
        self.tools = []
        self.tool_descriptions = []
        self.arg_names = []
        self.arg_descriptions = []
        self.arg_to_tool_map = {}
        for tool_name, spec in self.data.items():
            self.tools.append(tool_name)

            tool_description = spec.get("Description")
            self.tool_descriptions.append(tool_description)

            # A tool entry without "ArgumentName" simply contributes no arguments.
            arguments = spec.get("ArgumentName") or {}
            for arg_name, arg_spec in arguments.items():
                arg_description = arg_spec.get("Description")
                if arg_description == "_":
                    # Placeholder argument: fall back to the tool itself.
                    self.arg_names.append(tool_name)
                    self.arg_descriptions.append(tool_description)
                    self.arg_to_tool_map[tool_name] = tool_name
                else:
                    self.arg_names.append(arg_name)
                    self.arg_descriptions.append(arg_description)
                    # Note: an argument name shared by several tools keeps only
                    # the last tool seen, because the map is keyed by name.
                    self.arg_to_tool_map[arg_name] = tool_name

    def encode(self):
        """Embed the four lists produced by ``extract()``.

        Sets ``tool_enc``, ``tool_des_enc``, ``arg_enc`` and ``arg_des_enc``.
        """
        self.tool_enc = self.embed_sentences(self.tools)
        self.tool_des_enc = self.embed_sentences(self.tool_descriptions)
        self.arg_enc = self.embed_sentences(self.arg_names)
        self.arg_des_enc = self.embed_sentences(self.arg_descriptions)

    def remove_stopwords(self, query):
        """Return ``query`` with English stop words removed.

        The query is tokenized with ``word_tokenize``; tokens whose lowercase
        form is in NLTK's English stop-word list are dropped and the remaining
        tokens are joined with single spaces.
        """
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(query)
        filtered_query = [word for word in word_tokens if word.lower() not in stop_words]
        return ' '.join(filtered_query)

    def _score_queries(self, queries, preprocess=None):
        """Shared scoring loop behind check() and check_without_stopwords().

        Args:
            queries: Iterable of query strings.
            preprocess: Optional callable applied to each query before it is
                embedded. Results are still keyed by the original query text.

        Returns:
            ``self.tables``: ``{query: {arg_name: util.cos_sim(...) result}}``.
        """
        self.tables = {}
        for query in queries:
            text = query if preprocess is None else preprocess(query)
            qenc = self.model.encode(text)
            # Keyed by argument name, so duplicate names keep the last score.
            self.tables[query] = {
                arg: util.cos_sim(qenc, arg_enc)
                for arg, arg_enc in zip(self.arg_names, self.arg_des_enc)
            }
        return self.tables

    def check(self, queries):
        """Score each query against every argument-description embedding.

        Requires ``extract()`` and ``encode()`` to have been called. Stores and
        returns ``self.tables`` mapping each query to ``{arg_name: similarity}``,
        where similarity is whatever ``util.cos_sim`` returns.
        """
        return self._score_queries(queries)

    def check_without_stopwords(self, queries):
        """Same as ``check()``, but stop words are stripped before embedding.

        The returned table is keyed by the original (unstripped) query text.
        """
        return self._score_queries(queries, preprocess=self.remove_stopwords)
    
        
                

# Practice

In [12]:
# Build the embedder from the tool-dictionary JSON, then populate its name and
# description lists (extract) and their embeddings (encode) for the cells below.
example = embedder("tools_dictionary.json",model)
example.extract()
example.encode()
# Spot-check: print the third extracted argument name.
print(example.arg_names[2])

get_sprint_id


In [13]:
# Sample queries scored against every argument description via example.check().
queries = [
    'Get all work items similar to TKT-123, summarize them, create issues from that summary, and prioritize them',
    'Given a customer meeting transcript T, create action items and add them to my current sprint',
    'List all high severity tickets coming in from slack from customer Cust123 and generate a summary of them.',
    'What are my all issues in the triage stage under part FEAT-123? Summarize them.',
    'Summarize high severity tickets from the customer UltimateCustomer',
    'Prioritize my P0 issues and add them to the current sprint',
    'What is the meaning of life?',
    'Summarize issues similar to don:core:dvrv-us-1:devo/0:issue/1'
]
# check() returns {query: {arg_name: similarity}}; as a DataFrame the queries
# become columns and the argument names become the row index.
ans = example.check(queries)
fd = pd.DataFrame(ans)
fd

Unnamed: 0,"Get all work items similar to TKT-123, summarize them, create issues from that summary, and prioritize them","Given a customer meeting transcript T, create action items and add them to my current sprint",List all high severity tickets coming in from slack from customer Cust123 and generate a summary of them.,What are my all issues in the triage stage under part FEAT-123? Summarize them.,Summarize high severity tickets from the customer UltimateCustomer,Prioritize my P0 issues and add them to the current sprint,What is the meaning of life?,Summarize issues similar to don:core:dvrv-us-1:devo/0:issue/1
text,[[tensor(0.2693)]],[[tensor(0.3375)]],[[tensor(0.1868)]],[[tensor(0.1682)]],[[tensor(0.2146)]],[[tensor(0.0915)]],[[tensor(0.2365)]],[[tensor(0.1582)]]
work_id,[[tensor(0.2902)]],[[tensor(0.2009)]],[[tensor(0.1285)]],[[tensor(-0.0320)]],[[tensor(0.0587)]],[[tensor(0.0047)]],[[tensor(0.0404)]],[[tensor(0.0198)]]
get_sprint_id,[[tensor(0.0345)]],[[tensor(0.3797)]],[[tensor(0.1876)]],[[tensor(0.0415)]],[[tensor(0.1240)]],[[tensor(0.3231)]],[[tensor(0.0989)]],[[tensor(0.0155)]]
objects,[[tensor(0.4640)]],[[tensor(0.1308)]],[[tensor(0.2699)]],[[tensor(0.0355)]],[[tensor(0.3125)]],[[tensor(-0.0045)]],[[tensor(0.1341)]],[[tensor(0.2086)]]
query,[[tensor(0.0178)]],[[tensor(0.1293)]],[[tensor(0.1355)]],[[tensor(0.0910)]],[[tensor(0.1204)]],[[tensor(-0.0350)]],[[tensor(0.1401)]],[[tensor(0.0196)]]
who_am_i,[[tensor(-0.0926)]],[[tensor(-0.0195)]],[[tensor(0.0090)]],[[tensor(-0.0379)]],[[tensor(-0.0044)]],[[tensor(-0.1088)]],[[tensor(0.1166)]],[[tensor(-0.0885)]]
applies_to_part,[[tensor(0.3315)]],[[tensor(0.0586)]],[[tensor(0.0551)]],[[tensor(0.0712)]],[[tensor(0.0479)]],[[tensor(0.0254)]],[[tensor(0.0467)]],[[tensor(0.0077)]]
created_by,[[tensor(0.2264)]],[[tensor(0.0690)]],[[tensor(0.0637)]],[[tensor(-0.0654)]],[[tensor(0.0181)]],[[tensor(-0.0282)]],[[tensor(0.0426)]],[[tensor(0.0175)]]
issue.priority,[[tensor(0.3121)]],[[tensor(0.0308)]],[[tensor(0.2446)]],[[tensor(0.1894)]],[[tensor(0.2554)]],[[tensor(0.3604)]],[[tensor(0.0184)]],[[tensor(0.1548)]]
issue.rev_orgs,[[tensor(0.1548)]],[[tensor(0.0727)]],[[tensor(0.0927)]],[[tensor(0.1249)]],[[tensor(0.1135)]],[[tensor(0.0649)]],[[tensor(-0.0311)]],[[tensor(0.1483)]]


In [14]:
# Score the same queries as the previous cell, this time with stop words removed
# before embedding. The list is reused instead of being pasted a second time so
# the two comparisons cannot drift apart; `querries` is kept as an alias (copy)
# for anything that still refers to the old name.
querries = list(queries)
ans = example.check_without_stopwords(querries)
fd = pd.DataFrame(ans)
fd

Unnamed: 0,"Get all work items similar to TKT-123, summarize them, create issues from that summary, and prioritize them","Given a customer meeting transcript T, create action items and add them to my current sprint",List all high severity tickets coming in from slack from customer Cust123 and generate a summary of them.,What are my all issues in the triage stage under part FEAT-123? Summarize them.,Summarize high severity tickets from the customer UltimateCustomer,Prioritize my P0 issues and add them to the current sprint,What is the meaning of life?,Summarize issues similar to don:core:dvrv-us-1:devo/0:issue/1
text,[[tensor(0.2948)]],[[tensor(0.3341)]],[[tensor(0.2377)]],[[tensor(0.1817)]],[[tensor(0.2234)]],[[tensor(0.0604)]],[[tensor(0.2726)]],[[tensor(0.1332)]]
work_id,[[tensor(0.4377)]],[[tensor(0.2015)]],[[tensor(0.1327)]],[[tensor(0.0255)]],[[tensor(0.0496)]],[[tensor(0.0154)]],[[tensor(0.0455)]],[[tensor(0.0074)]]
get_sprint_id,[[tensor(0.0970)]],[[tensor(0.4245)]],[[tensor(0.1650)]],[[tensor(0.0751)]],[[tensor(0.1066)]],[[tensor(0.3943)]],[[tensor(0.0645)]],[[tensor(0.0248)]]
objects,[[tensor(0.5151)]],[[tensor(0.1239)]],[[tensor(0.3479)]],[[tensor(0.0781)]],[[tensor(0.3333)]],[[tensor(-0.0205)]],[[tensor(0.1028)]],[[tensor(0.1587)]]
query,[[tensor(0.1079)]],[[tensor(0.1479)]],[[tensor(0.1400)]],[[tensor(0.1146)]],[[tensor(0.1141)]],[[tensor(-0.0358)]],[[tensor(0.1024)]],[[tensor(0.0271)]]
who_am_i,[[tensor(-0.0338)]],[[tensor(0.0498)]],[[tensor(0.0197)]],[[tensor(-0.0448)]],[[tensor(-0.0143)]],[[tensor(-0.0574)]],[[tensor(0.0901)]],[[tensor(-0.0600)]]
applies_to_part,[[tensor(0.3265)]],[[tensor(0.0335)]],[[tensor(0.0677)]],[[tensor(0.0558)]],[[tensor(0.0444)]],[[tensor(-0.0353)]],[[tensor(0.0399)]],[[tensor(-0.0255)]]
created_by,[[tensor(0.2534)]],[[tensor(0.0524)]],[[tensor(0.0799)]],[[tensor(-0.0823)]],[[tensor(0.0179)]],[[tensor(-0.0655)]],[[tensor(0.0525)]],[[tensor(-0.0202)]]
issue.priority,[[tensor(0.2798)]],[[tensor(0.0264)]],[[tensor(0.2341)]],[[tensor(0.1539)]],[[tensor(0.2527)]],[[tensor(0.3386)]],[[tensor(0.0096)]],[[tensor(0.1329)]]
issue.rev_orgs,[[tensor(0.1310)]],[[tensor(0.0473)]],[[tensor(0.0807)]],[[tensor(0.0938)]],[[tensor(0.1058)]],[[tensor(0.0102)]],[[tensor(-0.0057)]],[[tensor(0.1413)]]


# Agrim_list

In [15]:
# Hand-written short descriptions keyed by tool name; passed as the second
# argument to example.comp_tool_2(...) and example.basic(...) in the cells below.
layer2 = {'works_list':"work items",
          'summarize_objects':"Summarizes objects.",
          'prioritize_objects':"Sorts objects based on priority.",
          'add_work_items_to_sprint':"Adds work items to sprint",
          'get_sprint_id':"Returns ID current sprint",
          'get_similar_work_items':"get work items similar",
          'search_object_by_name':"returns id matching object in the system of record.",
          'create_actionable_tasks_from_text':"create actionable tasks from text",
          'who_am_i':"Returns ID current user, i, my , me"}

In [16]:
# Two-stage tool scoring for a single sub-query.
# NOTE(review): comp_tool_1 and comp_tool_2 are not defined on the `embedder`
# class shown in this notebook, so this cell fails on a fresh kernel unless the
# class is extended elsewhere -- confirm where these methods live.
qu = 'add them to the current sprint'
t = example.comp_tool_1(qu)
df = pd.DataFrame(list(t.items()), columns=['Key', 'Value'])
print(df)
# Second pass receives the query, the layer2 descriptions and the first-pass result.
t2 = example.comp_tool_2(qu,layer2,t)
df2 = pd.DataFrame(list(t2.items()), columns=['Key', 'Value'])
df2

                                 Key                Value
0  create_actionable_tasks_from_text   [[tensor(0.0474)]]
1             get_similar_work_items   [[tensor(0.0191)]]
2                      get_sprint_id   [[tensor(0.6450)]]
3                 prioritize_objects  [[tensor(-0.0320)]]
4              search_object_by_name  [[tensor(-0.0862)]]
5                  summarize_objects  [[tensor(-0.0106)]]
6                           who_am_i   [[tensor(0.0978)]]
7                         works_list  [[tensor(-0.0240)]]
8           add_work_items_to_sprint   [[tensor(0.6473)]]


Unnamed: 0,Key,Value
0,get_sprint_id,[[tensor(0.4587)]]
1,add_work_items_to_sprint,[[tensor(0.7120)]]


1. Create disjoint set of the tool description for 2nd level -- Agrim
2. Code improvement -- Hitesh
3. Rephrase the tool description -- Abhinav Bhaiya
4. 1 - 2
 

# Queries

# Anything extra you want to add, add here. Code rewriting is happening above, so don't use anything!

Every query has k tools as output; the goal is to extract the no. 1 tool,
add that tool into the query, and use that to get the new tool.
Fourier transform of sentences
semantically reduce and split
subqueries --(semantic splitting)
complex queries to simple queries   --- query - sentence

Fact-aware Sentence Split and Rephrase with Permutation Invariant Training - arXiv https://arxiv.org/pdf/2001.11383 --- Hitesh
https://copyprogramming.com/howto/decompose-compound-sentence-to-simple-sentences --- Hitesh
https://stackoverflow.com/questions/44626264/split-compound-sentences-into-simple-sentences --- Agrim
https://towardsdatascience.com/nlp-splitting-text-into-sentences-7bbce222ef17 --- Abhinav
https://github.com/explosion/spaCy/discussions/11122 --- Abhinav
https://subscription.packtpub.com/book/data/9781838987312/2/ch02lvl1sec13/splitting-sentences-into-clauses -- Agrim

In [20]:
# NOTE(review): `basic` is not defined on the `embedder` class shown in this
# notebook -- confirm where it lives. The third argument (0.4) is presumably a
# similarity threshold; verify against the method's definition.
quu = 'Get all work items similar to TKT-123, summarize them, create issues from that summary, and prioritize them'
t = example.basic(quu,layer2,0.4)
t

Get all work items similar to TKT-123
                                 Key               Value
0  create_actionable_tasks_from_text  [[tensor(0.3816)]]
1             get_similar_work_items  [[tensor(0.6598)]]
2                      get_sprint_id  [[tensor(0.2363)]]
3                 prioritize_objects  [[tensor(0.1805)]]
4              search_object_by_name  [[tensor(0.2138)]]
5                  summarize_objects  [[tensor(0.2554)]]
6                           who_am_i  [[tensor(0.1278)]]
7                         works_list  [[tensor(0.6074)]]
8           add_work_items_to_sprint  [[tensor(0.4773)]]
                        Key               Value
0    get_similar_work_items  [[tensor(0.4423)]]
1                works_list  [[tensor(0.3812)]]
2  add_work_items_to_sprint  [[tensor(0.3452)]]
summarize them
                                 Key               Value
0  create_actionable_tasks_from_text  [[tensor(0.4258)]]
1             get_similar_work_items  [[tensor(0.2164)]]
2             

{'get_similar_work_items': ['work_id'],
 'summarize_objects': ['objects'],
 None: []}

BERT
LLAMA
Rank by priority -- normalize the score -- weighing the argument -- binary classifier 
one sub-query together with the full query
transformer excel
first order logic 