In [2]:
import pandas as pd
import os
import google.generativeai as genai
from google.ai.generativelanguage_v1beta.types import content
import json
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import time
from google.api_core import exceptions as google_exceptions

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

def _extract_json_object(response_text):
    """Return the outermost ``{...}`` span of ``response_text``, or None if there is none."""
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start == -1 or end < start:
        return None
    return response_text[start:end + 1]


def process_text(file_content, model_name='gemini-1.5-flash'):
    """Ask Gemini to categorise one court ruling and return the parsed JSON reply.

    Args:
        file_content: Full text of the ruling; it is appended to the
            classification prompt and sent to the model.
        model_name: Gemini model identifier passed to ``genai.GenerativeModel``.

    Returns:
        The decoded JSON object (the prompt asks for a ``"context"`` key holding
        a list of category strings), or ``None`` when the prompt is blocked, the
        reply has no parts, no JSON object is found, or any error occurs while
        calling the model or decoding its reply.

    Raises:
        RuntimeError: If the ``GEMINI_API_KEY`` environment variable is not set.
    """
    # SECURITY: the key must never be committed with the notebook. Any key that
    # was previously hardcoded here should be revoked and replaced.
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GEMINI_API_KEY environment variable is not set; "
            "export it before calling process_text()."
        )
    genai.configure(api_key=api_key)

    # Blocking is disabled for all four harm categories.
    model = genai.GenerativeModel(
        model_name,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        },
    )

    # NOTE(review): the prompt wording is deliberately left as it was so results
    # stay comparable with earlier runs; only the schema's unbalanced braces
    # (one closing brace was missing) are corrected.
    system_instruction = """
        You are to categorize court case rulings.
        You must return at leats one value and can return multiple.
        Return a value of 'parenting' if the the court case is about family matters such as child custody and parenting matters.
        Return the value 'finance' if the court case is about the division of assets or other financial matters NOT related to the care of the children.
        If none of these 2 apply just return 'other'.
        Return Value(s) using this JSON schema:
        {
            "type": "object",
            "properties": {
                "context": {
                    "type": "array",
                    "items": {
                        "type": "string"
                    }
                }
            }
        }
    """

    try:
        response = model.generate_content(system_instruction + "\n\n" + file_content)

        if response.prompt_feedback.block_reason:
            print(f"Prompt blocked. Reason: {response.prompt_feedback.block_reason}")
            return None

        if not response.parts:
            print("No response parts returned.")
            return None

        # The model may wrap its answer in a Markdown code fence (```json ... ```)
        # or return bare JSON. Locating the outermost braces handles both, unlike
        # a fixed-width slice that silently corrupts anything but one exact shape.
        json_text = _extract_json_object(response.text)
        if json_text is None:
            print("No JSON object found in the model response.")
            return None

        json_response = json.loads(json_text)
        print(json_response)
        return json_response
    except Exception as e:
        # Best effort: one bad document must not abort a long batch run.
        print(f"An error occurred while processing the file: {str(e)}")
        return None

# Test Code

In [180]:
# Smoke test: send one document to the larger model and print the raw reply.
#
# SECURITY: the API key is read from the environment instead of being embedded
# in the notebook. Any key that was previously hardcoded here should be revoked.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
genai.configure(api_key=GEMINI_API_KEY)

# NOTE: the original cell built a `genai.GenerationConfig(...)` and discarded
# it, which has no effect on the request; it has been dropped.

filename = "77.txt"
file_path = os.path.join("documents", filename)

# Open unconditionally: a missing file should fail here with FileNotFoundError
# rather than later with an undefined (or stale) `file_content`.
with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()

# Blocking is disabled for all four harm categories.
model = genai.GenerativeModel(
    'gemini-1.5-pro-latest',
    safety_settings={
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    },
)

# Prompt wording is unchanged; only the schema's missing closing brace is added.
system_instruction = """
    You are to categorize court case rulings.
    You must return at leats one value and can return multiple.
    Return a value of 'parenting' if the the court case is about family matters such as child custody and parenting matters.
    Return the value 'finance' if the court case is about the division of assets or other financial matters NOT related to the care of the children.
    If none of these 2 apply just return 'other'.
    Return Value(s) using this JSON schema:
    {
        "type": "object",
        "properties": {
            "context": {
                "type": "array",
                "items": {
                    "type": "string"
                }
            }
        }
    }
"""
response = model.generate_content(system_instruction + "\n\n" + file_content)
print(response.text)


KeyboardInterrupt: 

In [138]:
# Display the prompt feedback of the most recent `response`; a populated
# `block_reason` means the prompt was rejected before any text was generated.
response.prompt_feedback

block_reason: OTHER

In [86]:
response_text = response.text

# Isolate the JSON object by its outermost braces. This works whether or not
# the model wrapped the answer in a Markdown fence (```json ... ```), whereas a
# fixed-width slice only works for one exact fence/newline layout.
json_start = response_text.find("{")
json_end = response_text.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError("Model response does not contain a JSON object")
cleaned_text = response_text[json_start:json_end + 1]

json_response = json.loads(cleaned_text)
print(cleaned_text)
print(json_response)


{
    "context": [
        "parenting",
        "finance"
    ]
}
{'context': ['parenting', 'finance']}


# Load Metadata and Initialise the Category Column

In [174]:
# Load the CSV file
# Read the case metadata and attach an empty 'category' column; the
# classification loop writes each document's categories into it.
df = pd.read_csv('FC-and-FCA_meta.csv').assign(category=None)

# Accumulates the rows that received a valid classification.
rows_to_keep = list()

# Run Loop

In [184]:
# Document indices handled by this run (half-open range). The original loop
# iterated range(935, 1474) but broke out at 1000, so this is the same batch.
BATCH_START = 935
BATCH_STOP = 1000
# Pause between requests, in seconds, to space out calls to the API.
REQUEST_PAUSE_SECONDS = 15

for index in range(BATCH_START, BATCH_STOP):
    filename = f"{index}.txt"
    file_path = os.path.join("documents", filename)

    # Indices without a document on disk are skipped without calling the API.
    if not os.path.exists(file_path):
        continue

    # Read and close the file before the (slow) API call and the pause, so the
    # handle is not held open for the duration of both.
    with open(file_path, 'r', encoding='utf-8') as file:
        file_content = file.read()

    context = process_text(file_content)

    # Only a JSON object carrying a 'context' key counts as a valid result.
    if isinstance(context, dict) and 'context' in context:
        df.at[index, 'category'] = context['context']
        rows_to_keep.append(df.loc[index])  # keep the whole, updated row
    else:
        print(f"Skipping file {filename} due to invalid or None result")

    time.sleep(REQUEST_PAUSE_SECONDS)

{'context': ['parenting']}
{'context': ['parenting', 'finance']}
{'context': ['parenting', 'finance']}
{'context': ['parenting']}
{'context': ['parenting']}
{'context': ['parenting', 'finance']}
{'context': ['parenting']}
{'context': ['finance']}
{'context': ['parenting']}
{'context': ['parenting', 'parenting']}
{'context': ['parenting']}
{'context': ['parenting']}
{'context': ['parenting']}
{'context': ['finance']}
{'context': ['finance']}
{'context': ['parenting']}
{'context': ['parenting']}
{'context': ['parenting']}
{'context': ['parenting', 'finance']}
{'context': ['parenting']}
{'context': ['parenting']}
{'context': ['parenting']}
{'context': ['parenting', 'finance']}
{'context': ['parenting']}
{'context': ['finance']}
{'context': ['finance']}
{'context': ['parenting']}
{'context': ['parenting']}
{'context': ['finance']}
{'context': ['parenting', 'finance']}
{'context': ['parenting', 'finance']}
{'context': ['parenting']}
{'context': ['finance', 'parenting']}
{'context': ['parent

In [185]:
# Assemble the classified rows into their own frame, write it to disk without
# the index, and show the last few rows as the cell output.
# (De-duplicating on 'href' is intentionally not applied.)
cleaned_csv_path = 'FC-and-FCA_meta_cleaned.csv'
new_df = pd.DataFrame(data=rows_to_keep)
new_df.to_csv(cleaned_csv_path, index=False)
new_df.tail()

Unnamed: 0.1,Unnamed: 0,href,text,category
992,992,/cgi-bin/viewdoc/au/cases/cth/FedCFamC2F/2023/...,Capan & Capan [2023] FedCFamC2F 683 (1 June 2023),"[finance, parenting]"
994,994,/cgi-bin/viewdoc/au/cases/cth/FedCFamC2F/2023/...,Fadel & Gibbons [2023] FedCFamC2F 679 (2 June ...,[parenting]
995,995,/cgi-bin/viewdoc/au/cases/cth/FedCFamC2F/2023/...,Jerome & Jerome [2023] FedCFamC2F 754 (2 June ...,"[finance, parenting]"
996,996,/cgi-bin/viewdoc/au/cases/cth/FedCFamC2F/2023/...,Pieters & Westmore (No 2) [2023] FedCFamC2F 70...,[parenting]
998,998,/cgi-bin/viewdoc/au/cases/cth/FedCFamC2F/2023/...,Letchford & Havel (No 2) [2023] FedCFamC2F 684...,[parenting]


block_reason: OTHER

# Extras

In [1]:
import pandas as pd

# Read the two partial result files, stack them in order, and write the union
# to a single CSV (index omitted).
part_paths = ['FC-and-FCA_meta_0-512.csv', 'FC-and-FCA_meta_1000-1400.csv']
df1, df2 = (pd.read_csv(part_path) for part_path in part_paths)

combined_df = pd.concat([df1, df2])
combined_df.to_csv('FC-and-FCA_meta_combined.csv', index=False)

print("Files combined and saved successfully.")

Files combined and saved successfully.


Index(['Unnamed: 0', 'href', 'text', 'category'], dtype='object')