In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from sklearn.metrics import silhouette_score, calinski_harabasz_score

In [9]:
with open('data_preprocessing.ipynb', 'r', encoding='utf-8') as f:
    notebook = nbformat.read(f, as_version=4)

ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(notebook)

# Access variables from the executed notebook
global_vars = {}
for cell in notebook.cells:
    if cell.cell_type == 'code':
        exec(cell.source, global_vars)

# Access the DataFrame `df` from the preprocessing notebook
df = global_vars['df']
print("Loaded DataFrame from preprocessing notebook:")
print(df.head())

  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  
0  Ordinarily that discretion will be exercised s...  
1  The general principles governing the exercise ...  
2  Ordinarily that discretion will be exercised s...  
3  The general principles governing the exercise ...  
4  The preceding general principles inform the ex...  
Columns in the dataset: Index(['case_id', 'case_outcome', 'case_title', 'case_text'], dtype='object')
case_id           0
case_outcome      0
case_title        0
case_text       176
dtype: int64
case_outcome

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                           case_text  \
0  Ordinarily that discretion will be exercised s...   
1  The general principles governing the exercise ...   
2  Ordinarily that discretion will be exercised s...   
3  The general principles governing the exercise ...   
4  The preceding general principles inform the ex...   

                                        cleaned_text  
0  ordinarily discretion exercised cost follow ev...  
1  general principle governing exercise discretio...  
2  ordinarily discretion exercised cost follow ev...  
3  general principle governing exercise discretio...  
4  preceding general principle inform exercise di...  
                                        cleaned_text case_category
0  ordinarily discretion exercised cost follow ev...         other
1  general principle governing exercise discretio...         other
2  ordinarily discretion exercised cost follow ev...         other
3  general principle governing exercise discretio...         othe

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Loaded DataFrame from preprocessing notebook:
  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  \
0  Ordinarily that discretion will be exercised s...   
1  The general principles governing the exercise ...   
2  Ordinarily that discretion will be exercised s...   
3  The general principles governing the exercise ...   
4  The preceding general principles inform the ex...   

                                        cleaned_text case_category  
0  ordinarily discretion exercised cost follow ev...         other  
1  general princ

In [5]:
keywords = {
    'family': [
        'children', 'custody', 'divorce', 'marriage', 'adoption', 
        'parenting orders', 'child support', 'spousal maintenance',
        'family violence', 'guardianship', 'prenuptial agreements'
    ],
    'property': [
        'property', 'ownership', 'land', 'real estate', 'lease',
        'easements', 'mortgages', 'foreclosure', 'zoning',
        'landlord', 'tenant', 'eviction'
    ],
    'criminal': [
        'theft', 'murder', 'assault', 'fraud', 'crime',
        'sentencing', 'bail', 'parole', 'prosecution',
        'homicide', 'robbery', 'drug offenses'
    ],
    'business': [
        'contract', 'agreement', 'corporation', 
        'partnership', 'mergers', 'franchises',
        'intellectual property', 'trade practices'
    ],
    'financial_and_securities': [
        'securities', 'investments', 
        'insider trading', 
        'market manipulation',
        'financial services'
    ],
    'administrative': [
        "judicial review", "government decisions", "statutory interpretation"
    ],
    "employment": ["workers comp"]
}

In [6]:
# Generate TF-IDF embeddings for the cleaned text
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['cleaned_text'])

# Ensure the `keywords` variable is accessible
if 'keywords' not in globals():
    raise NameError("The variable 'keywords' is not defined. Please ensure it is defined in a previous cell.")

# Perform K-Means clustering
num_clusters = len(keywords)  # Number of predefined categories
kmeans = KMeans(n_clusters=num_clusters, random_state=777)
df['cluster'] = kmeans.fit_predict(tfidf_matrix)

# Map clusters to predefined categories
cluster_to_category = {}
for cluster in range(num_clusters):
    # Get the documents in the current cluster
    cluster_docs = df[df['cluster'] == cluster]['cleaned_text']
    
    # Analyze the cluster's dominant keywords
    cluster_keywords = ' '.join(cluster_docs).split()
    for category, words in keywords.items():
        if any(word in cluster_keywords for word in words):
            cluster_to_category[cluster] = category
            break
    else:
        cluster_to_category[cluster] = 'other'  # Default if no match

# Assign categories based on the cluster mapping
df['case_category'] = df['cluster'].map(cluster_to_category)

# Display the clustered dataset
print(df[['cleaned_text', 'cluster', 'case_category']].head())

# Save the clustered dataset to a CSV file
output_file = 'legalData/clustered_legal_text.csv'
df.to_csv(output_file, index=False)

print(f"Clustered dataset saved to {output_file}")



                                        cleaned_text  cluster case_category
0  ordinarily discretion exercised cost follow ev...        1        family
1  general principle governing exercise discretio...        2        family
2  ordinarily discretion exercised cost follow ev...        1        family
3  general principle governing exercise discretio...        2        family
4  preceding general principle inform exercise di...        1        family
Clustered dataset saved to legalData/clustered_legal_text.csv


In [12]:
file_path = 'legalData/clustered_legal_text.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print("Loaded clustered dataset:")
print(df.head())

if 'cluster' not in df.columns:
	raise KeyError("'cluster' column is missing in the DataFrame. Ensure the clustering step has been executed successfully.")

silhouette_avg = silhouette_score(tfidf_matrix, df['cluster'])
print(f"Silhouette Score: {silhouette_avg:.2f}")

# Calculate Calinski-Harabasz Index
calinski_harabasz = calinski_harabasz_score(tfidf_matrix.toarray(), df['cluster'])
print(f"Calinski-Harabasz Index: {calinski_harabasz:.2f}")

Loaded clustered dataset:
  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  \
0  Ordinarily that discretion will be exercised s...   
1  The general principles governing the exercise ...   
2  Ordinarily that discretion will be exercised s...   
3  The general principles governing the exercise ...   
4  The preceding general principles inform the ex...   

                                        cleaned_text case_category  cluster  
0  ordinarily discretion exercised cost follow ev...        family        1  
1  general princip