In [1]:
import numpy as np # linear algebra
import pandas as pd
import seaborn as sns

import os

import faiss

import tensorflow as tf

import tensorflow_hub as hub

from pprint import pprint

from nltk.corpus import stopwords, words as english_words

import openai
openai.api_key = os.environ["OPENAI_API_KEY"]



In [2]:
legal_docs = pd.read_csv("./data/legal_text_classification.csv")
legal_docs.head()

Unnamed: 0,case_id,case_outcome,case_title,case_text
0,Case1,cited,Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...,Ordinarily that discretion will be exercised s...
1,Case2,cited,Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...,The general principles governing the exercise ...
2,Case3,cited,Colgate Palmolive Co v Cussons Pty Ltd (1993) ...,Ordinarily that discretion will be exercised s...
3,Case4,cited,Dais Studio Pty Ltd v Bullett Creative Pty Ltd...,The general principles governing the exercise ...
4,Case5,cited,Dr Martens Australia Pty Ltd v Figgins Holding...,The preceding general principles inform the ex...


In [3]:
pprint (legal_docs.iloc[100]["case_text"])

('Gedeon v Commissioner of New South Wales Crime Commission [2008] HCA 43 ; '
 '(2008) 82 ALJR 1465 at [43] the High Court said: The expression '
 '"jurisdictional fact" was used somewhat loosely in the course of '
 'submissions. Generally the expression is used to identify a criterion the '
 'satisfaction of which enlivens the exercise of the statutory power or '
 'discretion in question. If the criterion be not satisfied then the decision '
 'purportedly made in exercise of the power or discretion will have been made '
 'without the necessary statutory authority required of the decision maker.')


In [4]:
print (legal_docs.info())
legal_docs.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   case_id       24985 non-null  object
 1   case_outcome  24985 non-null  object
 2   case_title    24985 non-null  object
 3   case_text     24809 non-null  object
dtypes: object(4)
memory usage: 780.9+ KB
None


case_id           0
case_outcome      0
case_title        0
case_text       176
dtype: int64

In [5]:
# impute missing text with title
legal_docs["case_text"] = np.where(legal_docs["case_text"].isna(), legal_docs["case_title"], legal_docs["case_text"])

In [6]:
legal_docs.isna().sum()

case_id         0
case_outcome    0
case_title      0
case_text       0
dtype: int64

In [7]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [8]:
text_all = legal_docs["case_text"].to_list()
text_partial = text_all[:10000]
text_embeddings = embed(text_partial)

In [15]:
text_embeddings

<tf.Tensor: shape=(10000, 512), dtype=float32, numpy=
array([[ 0.03102933, -0.068979  , -0.04295269, ...,  0.01397307,
        -0.01894638, -0.07713135],
       [ 0.01482124, -0.06870302, -0.03530958, ..., -0.01106551,
         0.02936252,  0.02105497],
       [ 0.03102933, -0.068979  , -0.04295269, ...,  0.01397307,
        -0.01894638, -0.07713135],
       ...,
       [-0.0264187 , -0.00344109, -0.0454563 , ...,  0.05053573,
        -0.04314136, -0.06131158],
       [-0.01808256,  0.00948057, -0.03791722, ...,  0.0233391 ,
         0.01767599, -0.07593904],
       [-0.01808256,  0.00948057, -0.03791722, ...,  0.0233391 ,
         0.01767599, -0.07593904]], dtype=float32)>

In [10]:
text_embeddings_np = text_embeddings.numpy()
index = faiss.IndexFlatL2(text_embeddings_np.shape[1])
index.add(text_embeddings_np)

In [50]:
#query1 = "Lawsuits involving harm to another human being."
query1 = "Native tribes lawsuits involving land claims."

# Encode the query into an embedding
query_embedding = embed([query1]).numpy()

# Search for similar embeddings in the index
D, I = index.search(query_embedding, k=5)  # Retrieve the top 5 similar documents

# Print the indices and distances of the similar documents
print("Top 5 similar documents:")
for i, d in zip(I[0], D[0]):
    print(f"Index: {i}, Similarity Score: {1 - d}")


Top 5 similar documents:
Index: 2919, Similarity Score: -0.25680971145629883
Index: 8959, Similarity Score: -0.2823491096496582
Index: 9668, Similarity Score: -0.28590452671051025
Index: 9531, Similarity Score: -0.2877476215362549
Index: 9532, Similarity Score: -0.2877476215362549


In [51]:
idx = 2919

pprint (legal_docs.iloc[idx]["case_title"])
pprint (text_partial[idx])


('Hillig as Administrator of Worimi Local Aboriginal Land Council v Minister '
 'for Lands for the State of New South Wales [2006] FCA 61')
('I refused an application by Mr and Mrs Parkinson for joinder to the Hillig '
 'proceedings on the basis of their ownership of land adjoining the Port '
 'Stephens land ( Hillig as Administrator of Worimi Local Aboriginal Land '
 'Council v Minister for Lands for the State of New South Wales [2006] FCA '
 '61). Worimi did not seek to be joined to the Hillig proceedings within the '
 'relevant period of notification of the non-claimant proceedings (see [39] '
 'below). The original basis for joinder was as the applicant authorised by '
 'the women who assert a native title interest. Worimi contends, and the '
 "original Form 1 claimant application ('the original application') states, "
 'that the Port Stephens land is a sacred site for women, in the context of '
 'childbirth. If Worimi has a claim to native title over the Port Stephens '
 'land on 

In [46]:
def summarize_text(text, num_sentences=5, english_level=9):
  response = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    temperature = 0,
    messages = [{"role":"system", "content": f"Follow these instructions when writing the summary:\
      \n1. Write a clear and concise summary consisting of {num_sentences} sentences \
      \n2. The summary's english level matches that of a person with {english_level} years of education \
      \n3. The summary should consist of an explanation of what the case is about, who's involved and the outcome"
      },
      {"role": "user", "content":f"Write a summary of the following text:{text}"}]
    )

  return response['choices'][0]['message']['content']


In [52]:
summary = summarize_text(text_partial[idx])

In [53]:
pprint (summary)

('Mr and Mrs Parkinson applied to join the Hillig proceedings, but their '
 'application was refused. The basis for their joinder was their ownership of '
 'land adjoining the Port Stephens land. Worimi, the applicant authorized by '
 'the women asserting a native title interest, did not seek to be joined to '
 'the Hillig proceedings within the relevant period of notification. Worimi '
 'claims that the Port Stephens land is a sacred site for women in the context '
 'of childbirth. However, Mr Hillig argues that if no valid assertion of '
 'native title can be made, there is no reason to join him to the proceedings. '
 'The original application was filed before Worimi had legal advice and stated '
 'that the native title claim group consists of the female members of the '
 'Garuahgal people descended from Mary Mahr.')
