In [3]:
import pandas as pd
import faiss
import os

import tensorflow_hub as hub

from pprint import pprint

import openai
openai.api_key = os.environ["OPENAI_API_KEY"]

In [6]:
legal_docs = pd.read_csv("../data/legal_text_classification.csv")
index = faiss.read_index("non_chunked_court_text.index")

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [55]:
#query1 = "claims related to shoe companies"
#query1 = "claims involving large sums of money"
#query1 = "claims involving boats"
#query1 = "claims involving injuries or death"
query1 = "claims involving sexual harrasment"

# Encode the query into an embedding
query_embedding = embed([query1]).numpy()

# Search for similar embeddings in the index
D, I = index.search(query_embedding, k=5)  # Retrieve the top 5 similar documents

# Print the indices and distances of the similar documents
print("Top 5 similar documents:")
indices = []
for i, d in zip(I[0], D[0]):
    print(f"Index: {i}, Similarity Score: {1 - d}")
    indices.append(i)

Top 5 similar documents:
Index: 21879, Similarity Score: -0.03011620044708252
Index: 22935, Similarity Score: -0.07395720481872559
Index: 6362, Similarity Score: -0.14229393005371094
Index: 6358, Similarity Score: -0.14308977127075195
Index: 6381, Similarity Score: -0.14308977127075195


In [56]:
idx = 21879

pprint (legal_docs.iloc[idx]["case_title"])
pprint (legal_docs.iloc[idx]["case_text"])

'Hall v A &amp; A Sheiban Pty Ltd [1989] FCA 72 ; (1989) 20 FCR 217'
('Ms Huang does not contend that the Federal Magistrate was in error in his '
 'summary of the principles applicable to conduct in breach of s 28A(1)(b). '
 'His Honour recognised the breadth of actions that can constitute conduct of '
 'a sexual nature and that single incident could constitute a contravention of '
 's 28A. He also considered the distinction, pointed out by Wilcox J in Hall v '
 'A &amp; A Sheiban Pty Ltd [1989] FCA 72 ; (1989) 20 FCR 217 at 247, that '
 "'[u] nwelcome sexual conduct may be insensitive, even offensive, but it does "
 "not necessarily constitute sexual harassment ', the latter suggesting "
 'repetition. 37 The allegation of sexual harassment made to HREOC against '
 "Professor Winder is that he touched Ms Huang's hip while passing her in a "
 'corridor in the second semester of 1999. The allegation against Professor '
 "Winder is described by Ms Huang in her documentation in the appeal

In [19]:
def answer_question(question, text, num_sentences=5, english_level=9):
  response = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    temperature = 0,
    messages = [{"role":"system", "content": f"Follow these instructions when writing the answer:\
      \n1. Different texts in the input are separated by ***"
      },
      {"role": "user", "content":f"Answer the following question about the text:{question}. text:{text}"}]
    )

  return response['choices'][0]['message']['content']


def summarize_text(text, num_sentences=5, english_level=9):
  response = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    temperature = 0,
    messages = [{"role":"system", "content": f"Follow these instructions when writing the summary:\
      \n1. Write a clear and concise summary consisting of {num_sentences} sentences \
      \n2. The summary's english level matches that of a person with {english_level} years of education \
      \n3. The summary should consist of an explanation of what the case is about, who's involved and the outcome"
      },
      {"role": "user", "content":f"Write a summary of the following text:{text}"}]
    )

  return response['choices'][0]['message']['content']

In [57]:
query_docs_list = legal_docs.iloc[indices]["case_text"].tolist()
texts = []
for query_doc in query_docs_list:
  texts.append(query_doc)

text_all = "***".join(texts)


In [58]:
pprint (answer_question("What's common in all of these cases?", text_all))

('In all of these cases, the common factor is that the individuals are making '
 'allegations of sexual harassment. They are claiming that they have '
 'experienced unwelcome sexual advances, requests for sexual favors, or other '
 'unwelcome conduct of a sexual nature. They are seeking relief and claiming '
 'that their rights under the Sex Discrimination Act have been violated.')


In [59]:
pprint (answer_question("What's common to all the companies mentioned in this text?", text_all))


('The commonality among all the companies mentioned in this text is that they '
 'are all involved in legal proceedings related to allegations of sexual '
 'harassment or discrimination.')


In [60]:
pprint (answer_question("What's the common industry for all the companies in these texts?", text_all))


('The common industry for all the companies in these texts is the legal '
 'industry.')


In [62]:
pprint (answer_question("List the companies and industries you find in these texts", text_all))


'Companies: \n- A & A Sheiban Pty Ltd\n- TAFE\n\nIndustries: \n- Education'


In [63]:
pprint (answer_question("What types of injuries are described in the text?", text_all))

('The text does not describe any specific injuries. It focuses on allegations '
 'of sexual harassment and the legal proceedings surrounding those '
 'allegations.')


In [64]:
text = legal_docs.iloc[idx]["case_text"]
pprint (answer_question("What is this document about? keep the answer at 2 sentences", text))


('This document is about a case involving allegations of sexual harassment '
 'made by Ms Huang against Professor Winder. The document discusses the '
 "incident in question, Ms Huang's changing accounts of the incident, and the "
 'consequences she alleges resulted from the alleged harassment.')


In [33]:
pprint (answer_question("What type of company are mentioned in the text and what do they do?", text))


('The text mentions two companies: Lockwood Security Products Pty Ltd and '
 'Doric Products Pty Ltd. Lockwood Security Products Pty Ltd is mentioned in '
 'the context of a court case where their observations were cited with '
 'approval by the High Court. Doric Products Pty Ltd is mentioned as the '
 'opposing party in the court case. The text does not provide specific '
 'information about what these companies do.')


In [26]:
pprint (summarize_text(text, english_level=5))


('The case mentioned is World Brands Management Pty Ltd v Cube Footwear Pty '
 'Ltd [2004] FCA 769. The judge, Heerey J, refers to a previous case, Dr '
 'Martens Australia Pty Ltd v Figgins Holdings Pty Ltd (1999) 44 IPR 281, but '
 'does not agree that it establishes a rule of law for the retail footwear '
 'trade. However, Heerey J finds value in the observations made in the '
 'previous case, as they align with personal experiences of how shoes are '
 'typically purchased. Shoes are not usually bought impulsively, but rather, '
 'consumers compare, try on, and think over their options. Shoes are expected '
 'to last for a few years, and if not chosen well, they can cause discomfort '
 'and regret. The outcome of the case is not mentioned in the summary.')


In [42]:
pprint (answer_question("What is the meaning of life?", text_all))


'The meaning of life is not addressed in the given text.'
