In [30]:
# --- Import Libraries ---
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import os
import joblib

In [31]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# --- Load Dataset ---
file_path = r"C:\Users\yoshi\OneDrive\Desktop\CSMaster\CS439\FInalProj\legalData\legal_text_classification.csv"
df = pd.read_csv(file_path)

# Display dataset info
print("Dataset loaded:")
print(df.head())
print("Columns:", df.columns)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Dataset loaded:
  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  
0  Ordinarily that discretion will be exercised s...  
1  The general principles governing the exercise ...  
2  Ordinarily that discretion will be exercised s...  
3  The general principles governing the exercise ...  
4  The preceding general principles inform the ex...  
Columns: Index(['case_id', 'case_outcome', 'case_title', 'case_text'], dtype='object')


In [32]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = word_tokenize(text)
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(tokens)
    return ''

df['cleaned_text'] = df['case_text'].apply(preprocess_text)
print("Preprocessed text:")
print(df[['case_text', 'cleaned_text']].head())


Preprocessed text:
                                           case_text  \
0  Ordinarily that discretion will be exercised s...   
1  The general principles governing the exercise ...   
2  Ordinarily that discretion will be exercised s...   
3  The general principles governing the exercise ...   
4  The preceding general principles inform the ex...   

                                        cleaned_text  
0  ordinarily discretion exercised cost follow ev...  
1  general principle governing exercise discretio...  
2  ordinarily discretion exercised cost follow ev...  
3  general principle governing exercise discretio...  
4  preceding general principle inform exercise di...  


In [33]:
keywords = {
    'family': ['children', 'custody', 'divorce', 'marriage', 'adoption', 'parenting orders', 'child support', 'spousal maintenance', 'family violence', 'guardianship', 'prenuptial agreements'],
    'property': ['property', 'ownership', 'land', 'real estate', 'lease', 'easements', 'mortgages', 'foreclosure', 'zoning', 'landlord', 'tenant', 'eviction'],
    'criminal': ['theft', 'murder', 'assault', 'fraud', 'crime', 'sentencing', 'bail', 'parole', 'prosecution', 'homicide', 'robbery', 'drug offenses'],
    'business': ['contract', 'agreement', 'corporation', 'partnership', 'mergers', 'franchises', 'intellectual property', 'trade practices'],
    'financial_and_securities': ['securities', 'investments', 'insider trading', 'market manipulation', 'financial services'],
    'administrative': ['judicial review', 'government decisions', 'statutory interpretation'],
    'employment': ['workers comp']
}

def assign_category(text):
    for category, words in keywords.items():
        if any(word in text.lower() for word in words):
            return category
    return 'other'

df['case_category'] = df['cleaned_text'].apply(assign_category)
print("Assigned categories:")
print(df[['cleaned_text', 'case_category']].head())


Assigned categories:
                                        cleaned_text case_category
0  ordinarily discretion exercised cost follow ev...         other
1  general principle governing exercise discretio...         other
2  ordinarily discretion exercised cost follow ev...         other
3  general principle governing exercise discretio...         other
4  preceding general principle inform exercise di...         other


In [34]:
output_dir = 'legalData'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'cleaned_legal_text.csv')
df[['case_text', 'cleaned_text']].to_csv(output_file, index=False)
print(f"Cleaned text saved to {output_file}")

Cleaned text saved to legalData\cleaned_legal_text.csv


In [41]:
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['cleaned_text'])

num_clusters = len(keywords)
kmeans = KMeans(n_clusters=num_clusters, random_state=777)
df['cluster'] = kmeans.fit_predict(tfidf_matrix)

# Map clusters to categories
cluster_to_category = {}
for cluster in range(num_clusters):
    cluster_docs = df[df['cluster'] == cluster]['cleaned_text']
    cluster_keywords = ' '.join(cluster_docs).split()
    category_counts = {category: sum(word in cluster_keywords for word in words) for category, words in keywords.items()}
    cluster_to_category[cluster] = max(category_counts, key=category_counts.get) if category_counts else 'other'


df['case_category'] = df['cluster'].map(cluster_to_category)
print("Clustered dataset:")
print(df[['cleaned_text', 'cluster', 'case_category']].head(20))




Clustered dataset:
                                         cleaned_text  cluster case_category
0   ordinarily discretion exercised cost follow ev...        1      property
1   general principle governing exercise discretio...        2      criminal
2   ordinarily discretion exercised cost follow ev...        1      property
3   general principle governing exercise discretio...        2      criminal
4   preceding general principle inform exercise di...        1      property
5   accept making rolled offer inclusive cost inte...        2      criminal
6   preceding general principle inform exercise di...        1      property
7   question level unreasonableness necessary attr...        3      criminal
8   recent decision high court australian broadcas...        3      criminal
9   hexal australia pty ltd v roche therapeutic in...        2      criminal
10  hexal australia pty ltd v roche therapeutic in...        1      property
11  quia timet proceeding court regard degree prob...    

In [42]:
silhouette_avg = silhouette_score(tfidf_matrix, df['cluster'])
calinski_harabasz = calinski_harabasz_score(tfidf_matrix.toarray(), df['cluster'])
print(f"Silhouette Score: {silhouette_avg:.2f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz:.2f}")

Silhouette Score: 0.01
Calinski-Harabasz Index: 192.60


In [43]:
for cluster in range(len(keywords)):
    print(f"Cluster {cluster}:")
    print(df[df['cluster'] == cluster][['cleaned_text', 'case_category']].head())

Cluster 0:
                                          cleaned_text case_category
682  difference created aural comparison stress pro...      property
683  function trade mark give indication purchaser ...      property
684  australian cooperative food v norco cooperativ...      property
685  principle well established stated dixon mctier...      property
687  view health plus trade mark deceptively simila...      property
Cluster 1:
                                         cleaned_text case_category
0   ordinarily discretion exercised cost follow ev...      property
2   ordinarily discretion exercised cost follow ev...      property
4   preceding general principle inform exercise di...      property
6   preceding general principle inform exercise di...      property
10  hexal australia pty ltd v roche therapeutic in...      property
Cluster 2:
                                         cleaned_text case_category
1   general principle governing exercise discretio...      criminal
3   gener

In [44]:
output_file = 'legalData/clustered_legal_text.csv'
df.to_csv(output_file, index=False)
print(f"Clustered dataset saved to {output_file}")

Clustered dataset saved to legalData/clustered_legal_text.csv


In [45]:
model_file = 'legal_text_classification_model.pkl'
joblib.dump(pipeline, model_file)
print(f"Model saved to {model_file}")

Model saved to legal_text_classification_model.pkl
