In [1]:
import numpy as np # linear algebra
import pandas as pd
import seaborn as sns

import os

import faiss

import tensorflow as tf

import tensorflow_hub as hub

from pprint import pprint

from nltk.corpus import stopwords, words as english_words

import openai
openai.api_key = os.environ["OPENAI_API_KEY"]



In [2]:
legal_docs = pd.read_csv("./data/legal_text_classification.csv")
legal_docs.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [52]:
legal_docs["case_outcome"].unique()

array(['cited', 'applied', 'followed', 'referred to', 'related',
       'considered', 'discussed', 'distinguished', 'affirmed', 'approved'],
      dtype=object)

In [53]:
legal_docs.shape

(24985, 4)

In [3]:
pprint (legal_docs.iloc[100]["case_text"])

('Gedeon v Commissioner of New South Wales Crime Commission [2008] HCA 43 ; '
 '(2008) 82 ALJR 1465 at [43] the High Court said: The expression '
 '"jurisdictional fact" was used somewhat loosely in the course of '
 'submissions. Generally the expression is used to identify a criterion the '
 'satisfaction of which enlivens the exercise of the statutory power or '
 'discretion in question. If the criterion be not satisfied then the decision '
 'purportedly made in exercise of the power or discretion will have been made '
 'without the necessary statutory authority required of the decision maker.')


In [4]:
print (legal_docs.info())
legal_docs.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_id       24985 non-null  object
 1   case_outcome  24985 non-null  object
 2   case_title    24985 non-null  object
 3   case_text     24809 non-null  object
dtypes: object(4)
memory usage: 780.9+ KB
None


case_id           0
case_outcome      0
case_title        0
case_text       176
dtype: int64

In [5]:
# impute missing text with title
legal_docs["case_text"] = np.where(legal_docs["case_text"].isna(), legal_docs["case_title"], legal_docs["case_text"])

In [6]:
legal_docs.isna().sum()

case_id         0
case_outcome    0
case_title      0
case_text       0
dtype: int64

In [7]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [8]:
text_all = legal_docs["case_text"].to_list()
text_partial = text_all[:10000]
text_embeddings = embed(text_partial)

In [16]:
text_embeddings

<tf.Tensor: shape=(10000, 512), dtype=float32, numpy=
array([[ 0.03102933, -0.068979  , -0.04295269, ...,  0.01397307,
        -0.01894638, -0.07713135],
       [ 0.01482124, -0.06870302, -0.03530958, ..., -0.01106551,
         0.02936252,  0.02105497],
       [ 0.03102933, -0.068979  , -0.04295269, ...,  0.01397307,
        -0.01894638, -0.07713135],
       ...,
       [-0.0264187 , -0.00344109, -0.0454563 , ...,  0.05053573,
        -0.04314136, -0.06131158],
       [-0.01808256,  0.00948057, -0.03791722, ...,  0.0233391 ,
         0.01767599, -0.07593904],
       [-0.01808256,  0.00948057, -0.03791722, ...,  0.0233391 ,
         0.01767599, -0.07593904]], dtype=float32)>

In [17]:
text_embeddings_np = text_embeddings.numpy()
index = faiss.IndexFlatL2(text_embeddings_np.shape[1])
index.add(text_embeddings_np)

In [34]:
#query1 = "Lawsuits involving harm to another human being."
#query1 = "Native tribes lawsuits involving land claims."
#query1 = "lawsuits where the plaintiff won"
query1 = "Court cases involving chemicals"
#query1 = "lawsuits involving multiple plaintiffs"


# Encode the query into an embedding
query_embedding = embed([query1]).numpy()

# Search for similar embeddings in the index
D, I = index.search(query_embedding, k=5)  # Retrieve the top 5 similar documents

# Print the indices and distances of the similar documents
print("Top 5 similar documents:")
indices = []
for i, d in zip(I[0], D[0]):
    print(f"Index: {i}, Similarity Score: {1 - d}")
    indices.append(i)


Top 5 similar documents:
Index: 4487, Similarity Score: -0.3535158634185791
Index: 3283, Similarity Score: -0.4114201068878174
Index: 7297, Similarity Score: -0.4365910291671753
Index: 9453, Similarity Score: -0.4687044620513916
Index: 3871, Similarity Score: -0.4746510982513428


In [28]:
idx = 8216

pprint (legal_docs.iloc[idx]["case_title"])
pprint (text_partial[idx])


'Roussel Uclaf Imperial Chemical Industries plc [1990] RPC 45'
('I should also refer to Roussel Uclaf Imperial Chemical Industries plc [1990] '
 'RPC 45 a case which followed Warner-Lambert Co v Glaxo Laboratories Ltd. The '
 'question there was whether trade secrets should be disclosed to two patent '
 "attorneys employed by the plaintiff. Both worked in the plaintiff's Paris "
 'office and were responsible for both the litigation in England and parallel '
 'litigation in France. The patent attorneys were permitted access to the '
 'confidential information but on terms that the plaintiff undertook that the '
 'attorneys "should not be concerned in any way with the French proceedings" '
 'and that the plaintiff would "pay to the defendants any sum that the court '
 'shall decide the defendants may have suffered by any wrongful disclosure of '
 'the secret process". The first condition was imposed to "minimise the chance '
 'that the confidential information [would] be used in [the Fre

In [21]:
def summarize_text(text, num_sentences=5, english_level=9):
  response = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    temperature = 0,
    messages = [{"role":"system", "content": f"Follow these instructions when writing the summary:\
      \n1. Write a clear and concise summary consisting of {num_sentences} sentences \
      \n2. The summary's english level matches that of a person with {english_level} years of education \
      \n3. The summary should consist of an explanation of what the case is about, who's involved and the outcome"
      },
      {"role": "user", "content":f"Write a summary of the following text:{text}"}]
    )

  return response['choices'][0]['message']['content']


In [24]:
summary = summarize_text(text_partial[idx], num_sentences=3, english_level=4)

In [25]:
pprint (summary)

('The case of Roussel Uclaf Imperial Chemical Industries plc [1990] RPC 45 '
 'involved the question of whether trade secrets should be disclosed to two '
 'patent attorneys employed by the plaintiff. The attorneys were responsible '
 'for litigation in both England and France, but the plaintiff wanted to '
 'ensure that the confidential information would not be used in the French '
 'proceedings. The court imposed conditions on the disclosure, including that '
 'the attorneys would not be involved in the French litigation and that the '
 'plaintiff would compensate the defendants for any wrongful disclosure.')


In [47]:
def answer_question(question, text, num_sentences=5, english_level=9):
  response = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    temperature = 0,
    messages = [{"role":"system", "content": f"Follow these instructions when writing the summary:\
      \n1. Write a clear and concise summary consisting of {num_sentences} sentences \
      \n2. The summary's english level matches that of a person with {english_level} years of education \
      \n3. The summary should consist of an explanation of what the case is about, who's involved and the outcome \
      \n4. Different texts are separated by ***"
      },
      {"role": "user", "content":f"Answer the following question about the text:{question}. text:{text}"}]
    )

  return response['choices'][0]['message']['content']

In [32]:
pprint (answer_question("What is this lawsuit all about?", text_partial[idx]))

('This lawsuit is about whether trade secrets should be disclosed to two '
 'patent attorneys employed by the plaintiff. The case involves Roussel Uclaf '
 'Imperial Chemical Industries plc and Warner-Lambert Co v Glaxo Laboratories '
 'Ltd. The question at hand is whether the patent attorneys should have access '
 'to the confidential information for litigation purposes. The outcome of the '
 'case is not mentioned in the text.')


In [46]:
texts = []
for i in indices:
  texts.append(text_partial[i])

text_all = "***".join(texts)

In [51]:
pprint (answer_question("Who won in each of these cases?", text_all))

('In the case of Rogers v The Queen [1994] HCA 42, the outcome was not '
 'mentioned. In the case of Adams v Lambert [2006] HCA 10, the High Court set '
 'aside the orders made by Gyles J on July 1, 2004 and remitted the matter to '
 'a Judge of the Federal Court of Australia for further hearing. In the case '
 'of Cardile v LED Builders Pty Ltd [1999] HCA 18, the Federal Court has the '
 'power to grant an order in the nature of a Mareva order, as established by '
 'previous cases. In the case of Saccharin Corporation Ltd v Anglo-Continental '
 'Chemical Works Ltd (1900) 17 RPC 307, the court held that a method claim was '
 'infringed even if changes were made, as long as the same result was '
 'achieved. In the case of Ainsworth v Criminal Justice Commission [1992] HCA '
 '10, the Federal Court has the jurisdiction to make binding declarations of '
 'right, even if no consequential relief is claimed.')
