In [23]:
! pip install sentence_transformers
import pandas as pd
import numpy as np

In [24]:
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence-embedding model used for the similarity demo below.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

In [25]:
# ! pip install "weaviate-client==3.*"
! pip install -U weaviate-client

In [27]:
import weaviate
import weaviate.classes as wvc
import pandas as pd
import json

In [26]:
# Each credential is stored in its own text file. Read every file inside a
# `with` block so the handle is closed, and strip surrounding whitespace so a
# trailing newline in the file does not end up in the URL or in an API key.
# Do not print these values: notebook outputs are saved with the file.
with open("/content/sample_data/WCS_CLUSTER_URL.txt", "r") as f:
    WCS_CLUSTER_URL = f.read().strip()

with open("/content/sample_data/WCS_DEMO_RO_KEY.txt", "r") as f:
    WCS_DEMO_RO_KEY = f.read().strip()

with open("/content/sample_data/OPENAI_API_KEY.txt", "r") as f:
    openai_api_key = f.read().strip()

# Make client connection with Weaviate Cloud cluster

In [30]:
# Connect to the Weaviate Cloud cluster using the values loaded above:
#   - WCS_CLUSTER_URL : the cluster endpoint
#   - WCS_DEMO_RO_KEY : API key for the Weaviate instance
#   - openai_api_key  : forwarded in a request header as the inference API key
client = weaviate.connect_to_wcs(
    cluster_url=WCS_CLUSTER_URL,
    auth_credentials=weaviate.auth.AuthApiKey(api_key=WCS_DEMO_RO_KEY),
    headers={"X-OpenAI-Api-Key": openai_api_key},
)

# Report whether the client considers the cluster ready.
print(client.is_ready())

True


# Vector similarity measure

In [3]:
# Example sentences to encode: two about the outdoors, one about food.
sentence = [
    'The team enjoyed the hike through the meadow',
    'The national park had great views',
    'Olive oil drizzled over pizza tastes delicious',
]

In [4]:
# model.encode() turns the list of sentences into their embeddings.
embedding = model.encode(sentence)

# Preview the embeddings.
print(embedding)

[[ 0.37061968  0.26414266  0.21265659 ...  0.14994559 -0.25794914
  -0.23970744]
 [ 0.6693328   0.40094966 -0.48208374 ...  0.10645917 -1.506716
  -0.01547341]
 [-0.26555893  0.11172437 -0.14733039 ...  0.4219748   0.883946
   0.10763916]]


In [5]:
# Shape of the array returned by model.encode() for the example sentences.
embedding.shape

(3, 384)

In [6]:
# Cosine Distance function
def cosine_distance(vec1,vec2):
  """Return the cosine distance between two vectors.

  The distance is ``1 - cos(theta)``, where ``theta`` is the angle between
  ``vec1`` and ``vec2``: 0 for vectors pointing the same way, 1 for
  orthogonal vectors and 2 for vectors pointing in opposite directions.

  Args:
    vec1: first vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
    vec2: second vector, same length as ``vec1``.

  Returns:
    The cosine distance as a NumPy floating-point scalar.
  """
  dot_product = np.dot(vec1, vec2)
  norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
  similarity = dot_product / norm_product
  return 1 - similarity

# Similar vectors will have SMALLER angle between them

In [7]:
# Cosine distance between every pair of example sentences.
#
# embedding[0] - 'The team enjoyed the hike through the meadow'
# embedding[1] - 'The national park had great views'
# embedding[2] - 'Olive oil drizzled over pizza tastes delicious'

for label, (i, j) in (
    ("Distance 0-1: ", (0, 1)),  # least angle
    ("Distance 0-2: ", (0, 2)),
    ("Distance 1-2: ", (1, 2)),
):
    print(label, cosine_distance(embedding[i], embedding[j]))

Distance 0-1:  0.5350335240364075
Distance 0-2:  0.9639393426477909
Distance 1-2:  0.9288790747523308


# Create new vector database collection


In [None]:
# client.collections.delete("Clause")

In [None]:
# Create collection - 1st time only

# clauses = client.collections.create(
#         name="Clause",
#         vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),  # If set to "none" you must always provide vectors yourself. Could be any other "text2vec-*" also.
#         generative_config=wvc.config.Configure.Generative.openai()  # Ensure the `generative-openai` module is used for generative queries
#     )

# Read legal clauses to upload to vector database collection

In [None]:
# train_data = pd.read_csv('/content/sample_data/all_short_clauses_train.csv') ## .head(5)

In [None]:
# train_data.head()

Unnamed: 0,clause,category
0,"This Release may be amended, only upon a writt...",Waivers
1,Neither the Borrower nor Pangaea maintains any...,Erisa
2,"The Employee Retirement Income Act of 1974, as...",Erisa
3,[ . No Obligor or Subsidiary has any Multiempl...,Erisa
4,Seller shall violate the representations and w...,Erisa


This Release may be amended, only upon a writt...	Waivers

Neither the Borrower nor Pangaea maintains any...	Erisa

The Employee Retirement Income Act of 1974, as...	Erisa

No Obligor or Subsidiary has any Multiempl...	Erisa

Seller shall violate the representations and w...	Erisa

In [29]:
# train_data.shape # (339, 2)

In [28]:
## Insert into Create collection - 1st time only
# clause_objs = list()
# for i, row in train_data.iterrows():
#   print(i,"th clause inserted")
#   clause_objs.append({
#       "clause": row["clause"],
#       "category": row["category"],
#   })

# clauses = client.collections.get("Clause")
# clauses.data.insert_many(clause_objs)  # This uses batching under the hood

# Get pre-created collection

In [31]:
# Look up the existing "Clause" collection through the connected client.
clauses = client.collections.get("Clause")

# Semantic search

In [99]:
# near_text looks for the objects whose vectors are most similar to the vector
# for the given input text.
#
# Related operators (not used in this cell):
#   near_object="56b9449e-65db-5df4-887b-0a4773f52aa7"  -> objects similar to an existing object, by ID
#   near_vector=query_vector                              -> objects similar to a vector you supply
#
# A similarity threshold can be set with a maximum distance (or certainty),
# e.g. distance=0.18 for the maximum accepted distance.

response = clauses.query.near_text(
    query="salary",
    limit=1,  # top-n results
    filters=wvc.query.Filter.by_property("category").equal("Taxes"),  # optional: only objects whose category equals "Taxes"
    return_metadata=wvc.query.MetadataQuery(distance=True),  # optional: return each hit's distance
)

In [100]:
# Print the stored properties and the query distance of every returned object.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.distance)

{'clause': 'All payments under this Agreement shall be subject to applicable tax withholding.', 'category': 'Taxes'}
0.21400493383407593


### Classification using Semantic search - Get classes of vectors most similar to new vector NOT stored in vector db

In [101]:
# Despite its name, `query_vector` holds the query *text*; near_text searches
# by the vector for that text.
query_vector = ["An instrument certifying that Seller’s representations and warranties set forth in Section 8 above are true and correct as of the Closing Date"]
response = clauses.query.near_text(
    query=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True),
)

# The `category` of the nearest stored clauses serves as the predicted class.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.distance)

{'clause': 'All of the representations and warranties contained herein shall survive the Closing Dates.', 'category': 'Warranties'}
0.14580261707305908
{'clause': 'The representations and warranties of the Sellers to the Buyer are as set forth in this Section 4.', 'category': 'Representations'}
0.15560179948806763


In [102]:
# Despite its name, `query_vector` holds the query *text*; near_text searches
# by the vector for that text.
query_vector = ["The Holders shall have received the Company’s draft internally prepared financial statements for the fiscal quarter ended June 30, 2016."]
response = clauses.query.near_text(
    query=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True),
)

# The `category` of the nearest stored clauses serves as the predicted class.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.distance)

{'clause': 'The DIP Agent shall have received the financial statements described in Section\xa03.04(a).', 'category': 'Financial Statements'}
0.13827496767044067
{'clause': 'A copy of the Original Financial Statements of each Borrower, HoldCo, and the Guarantor.', 'category': 'Financial Statements'}
0.15304851531982422


In [103]:
# Despite its name, `query_vector` holds the query *text*; near_text searches
# by the vector for that text.
query_vector = ["Each Obligor shall immediately advise the Lender in writing of any material actions, suits, litigation or other proceeding against it."]
response = clauses.query.near_text(
    query=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True),
)

# The `category` of the nearest stored clauses serves as the predicted class.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.distance)

{'clause': 'To promptly give notice in writing to Bank of any litigation pending or threatened against Borrower.', 'category': 'Litigations'}
0.09342038631439209
{'clause': 'The Borrower shall furnish a copy of this Guaranty to the Lender at the closing of the Loan.', 'category': 'Disclosures'}
0.14185822010040283


In [104]:
# Despite its name, `query_vector` holds the query *text*; near_text searches
# by the vector for that text.
query_vector = ["The Severance Benefit hereunder shall be reduced by all applicable income, employment or other taxes withheld by the Company from such payment."]
response = clauses.query.near_text(
    query=query_vector,
    limit=1,
    return_metadata=wvc.query.MetadataQuery(distance=True),
)

# The `category` of the nearest stored clause serves as the predicted class.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.distance)

{'clause': 'The Company shall withhold from any payments any deductions required by law.', 'category': 'Withholdings'}
0.1260209083557129


In [105]:
# Despite its name, `query_vector` holds the query *text*; near_text searches
# by the vector for that text.
query_vector = ["No Event of Default or event that, with the giving of notice, the passage of time, or both, would be an Event of Default, has occurred."]
response = clauses.query.near_text(
    query=query_vector,
    limit=1,
    return_metadata=wvc.query.MetadataQuery(distance=True),
)

# The `category` of the nearest stored clause serves as the predicted class.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.distance)

{'clause': 'Neither a Default nor an Event of Default shall have occurred and be continuing.', 'category': 'No Defaults'}
0.10491824150085449


In [106]:
# Despite its name, `query_vector` holds the query *text* (here a plain string).
query_vector = "The law applicable to the arbitration of any dispute shall be the laws of the State of California, excluding its conflicts of law rules."
response = clauses.query.near_text(
    query=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True),
    include_vector=True,  # also return each object's stored vector
)

# For every hit: properties, distance, vector length and the first five components.
for hit in response.objects:
    stored_vector = hit.vector['default']
    print(hit.properties)
    print(hit.metadata.distance)
    print(len(stored_vector))
    print(stored_vector[0:5],"...")

{'clause': 'This Agreement shall be governed and construed in accordance with the laws of the State of Florida.', 'category': 'Applicable Laws'}
0.13691699504852295
1536
[-0.012182137928903103, -0.008257864974439144, 0.012857840396463871, -0.03833312541246414, -0.027184033766388893] ...
{'clause': 'THIS AGREEMENT SHALL BE GOVERNED BY THE LAWS OF THE STATE OF NEW YORK.', 'category': 'Governing Laws'}
0.14608228206634521
1536
[0.0015277097700163722, -0.00395712535828352, 0.01654208078980446, -0.0295551847666502, -0.023094041272997856] ...


In [107]:
# Despite its name, `query_vector` holds the query *text*; near_text searches
# by the vector for that text.
query_vector = ["This Amendment shall be effective December 8, 2015. Except as hereby amended, the Plan shall remain in full force and effect."]

response = clauses.query.near_text(
    query=query_vector,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True),
    include_vector=True,  # also return each object's stored vector
)

# For every hit: properties, distance, vector length and the first five components.
for hit in response.objects:
    stored_vector = hit.vector['default']
    print(hit.properties)
    print(hit.metadata.distance)
    print(len(stored_vector))
    print(stored_vector[0:5],"...")

{'clause': 'The Plan shall be effective as of May 16, 2016 (the “ Effective Date ”).', 'category': 'Effectiveness'}
0.11868143081665039
1536
[-0.035327132791280746, -0.01545395702123642, -0.024319017305970192, -0.024012867361307144, -0.007693700958043337] ...
{'clause': 'This Amendment Number Six shall become effective as of June 26, 2015.', 'category': 'Effective Dates'}
0.12051248550415039
1536
[-0.03562600910663605, -0.00480938283726573, -0.009920354932546616, -0.01681198552250862, -0.020046046003699303] ...


# Generative search (single prompt)- Also called Retrieval Augmented Generation (RAG)

1. Uses a multi-stage process.
2. **First Weaviate performs a query, then it passes the retrieved results and a prompt to a large language model (LLM), to generate a new output.**
3. Use search results as a prompt for an LLM.
4. Prompts a large language model (LLM) with a combination of a user query as well as data retrieved from a database.
5. LLM uses query results to perform a task that is based on our prompt


*   "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY"
*   generative-openai module will perform retrieval augmented generation, or RAG, based on the data stored in your Weaviate instance.



In [34]:
# Despite its name, `query_vector` holds the query *text* used for retrieval.
query_vector = ["applicable law"]

# single_prompt is applied to each retrieved object separately; {clause} is the
# placeholder for that object's `clause` property.
prompt = "Find similar clauses to this: {clause} and extract name of a State from the response and output State found None if no state found"

# No `target_vector` is passed: the "Clause" collection is configured with a
# single vectorizer and its objects expose only the 'default' vector, so there
# is no named vector called "clause" to target.
response = clauses.generate.near_text(
  query=query_vector,
  limit=5,
  return_metadata=wvc.query.MetadataQuery(distance=True),
  # include_vector=True, # retrieve the object vector also e.g. [-0.013081819750368595, -0.006620033644139767,...]
  single_prompt=prompt,
)

In [35]:
# For every retrieved object print its distance, explain_score, the text the
# LLM generated for it, and its vector, followed by a blank separator.
for hit in response.objects:
  #print(hit.properties)
  metadata = hit.metadata
  print(metadata.distance)
  print(metadata.explain_score)
  print(hit.generated)
  print(hit.vector)
  print("\n")

0.13241612911224365
None
The laws of the jurisdiction in which the Property is situated shall govern this Lease.

State found: None
{}


0.13544130325317383
None
This Agreement shall be governed by the laws of California.

State found: California
{}


0.13693416118621826
None
For purposes of this Section 4.02 , the term “applicable Law” or “applicable Laws” includes GDPR.

State found: None
{}


0.13982510566711426
None
This Contract shall be interpreted and enforced in accordance with the laws of the State of California.

State found: Florida
{}


0.13990384340286255
None
The Agreement and any disputes arising therefrom shall be governed by the laws of the State of New York.

State found: New York
{}




# Keyword search
Keyword search, also called "BM25 (Best match 25)" or "sparse vector" search, returns the objects that have the highest BM25F scores.

In [8]:
response = clauses.query.bm25(
    query="annualized",
    # Search on selected properties only and you can weight how much each property affects the overall BM25F score
    # This example boosts the clause property by a factor of 2 while the category property remains static.
    query_properties=["clause^2", "category"],
    limit=5,                                                          # Top n results
    return_metadata=wvc.query.MetadataQuery(distance=True,)            # OPTIONAL
)

In [9]:
for o in response.objects:
  print(o.properties)
  print(o.metadata.distance)

{'clause': 'Your base salary will be: $27,884,62 biweekly ($725,000 annualized).', 'category': 'Base Salary'}
None
{'clause': 'Your starting base annualized salary will be $400,000, paid on a bi-weekly basis.', 'category': 'Base Salary'}
None


# Hybrid search

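1.   Combines the results of a vector search and a keyword (BM25F) search.
2.   The two result sets are fused into a single ranking; `explain_score` shows how each search contributed to an object's final score.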



In [36]:
# Request score and explain_score so the output shows why each object was selected.
hybrid_metadata = wvc.query.MetadataQuery(score=True, explain_score=True)

response = clauses.query.hybrid(
    query="withholdings",
    limit=5,
    return_metadata=hybrid_metadata,
)

In [37]:
# Print each hit with the explanation of how its hybrid score was obtained.
for hit in response.objects:
    print(hit.properties)
    print(hit.metadata.explain_score)

{'clause': 'Withholding of all applicable taxes are your responsibility.', 'category': 'Withholdings'}

Hybrid (Result Set vector) Document 94b74e36-6934-4fc4-a0ed-ec1a84b05bce: original score 0.87360716, normalized score: 0.49988097 - 
Hybrid (Result Set keyword) Document 94b74e36-6934-4fc4-a0ed-ec1a84b05bce: original score 2.444701, normalized score: 0.5
{'clause': 'The Company shall withhold from any payments any deductions required by law.', 'category': 'Withholdings'}

Hybrid (Result Set vector) Document 0573af8e-3eed-4f03-97ed-9ee66976d0b6: original score 0.8707987, normalized score: 0.48603305 - 
Hybrid (Result Set keyword) Document 0573af8e-3eed-4f03-97ed-9ee66976d0b6: original score 2.444701, normalized score: 0.5
{'clause': 'Payments made hereunder shall be made subject to required governmental withholdings.', 'category': 'Withholdings'}

Hybrid (Result Set vector) Document 712cde82-a7a6-45c5-992c-5dad00d417ba: original score 0.8736313, normalized score: 0.5 - 
Hybrid (Result

# Balance keyword and vector search

1.   An alpha of 1 is a pure vector search.
2.   An alpha of 0 is a pure keyword search.


In [38]:
# alpha balances the two searches (1 = pure vector, 0 = pure keyword);
# 0.25 leans towards the keyword side.
response = clauses.query.hybrid(
    query="withholdings",
    limit=5,
    alpha=0.25,
)

In [40]:
# Print the properties of every object returned by the weighted hybrid search.
for hit in response.objects:
    print(hit.properties)

{'clause': 'Withholding of all applicable taxes are your responsibility.', 'category': 'Withholdings'}
{'clause': 'The Company shall withhold from any payments any deductions required by law.', 'category': 'Withholdings'}
{'clause': 'Payment of Dividend Equivalents is subject to required tax withholding.', 'category': 'Tax Withholdings'}
{'clause': 'The payments under this Agreement shall be subject to applicable tax withholding.', 'category': 'Tax Withholdings'}
{'clause': 'This Award will be subject to all applicable taxes as provided in Section 14(d) of the Plan.', 'category': 'Tax Withholdings'}
