In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('legal_text_classification.csv')

# Explore the dataset
print("Dataset head:\n", df.head())
print("\nDataset Info:\n", df.info())
print("\nUnique `case_outcome` classes:", df['case_outcome'].unique())

Dataset head:
   case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  
0  Ordinarily that discretion will be exercised s...  
1  The general principles governing the exercise ...  
2  Ordinarily that discretion will be exercised s...  
3  The general principles governing the exercise ...  
4  The preceding general principles inform the ex...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24985 entries, 0 to 24984
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 


In [4]:
import re
import string

def clean_text(text):
    # Ensure the input is a string
    text = str(text) if pd.notnull(text) else ''
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuation
    text = re.sub(f"[{string.punctuation}]", "", text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    return text

df['clean_text'] = df['case_text'].apply(clean_text)

In [5]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['clean_text'])
y = df['case_outcome']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

In [8]:
y_pred = log_reg.predict(X_test)
print("\nLogistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Logistic Regression
Accuracy: 0.5321192715629378

Classification Report:
                precision    recall  f1-score   support

     affirmed       0.50      0.06      0.11        32
      applied       0.39      0.07      0.12       515
     approved       0.00      0.00      0.00        19
        cited       0.55      0.92      0.68      2457
   considered       0.32      0.06      0.10       324
    discussed       0.39      0.06      0.11       205
distinguished       0.70      0.06      0.11       122
     followed       0.44      0.10      0.16       436
  referred to       0.50      0.32      0.39       859
      related       1.00      0.04      0.07        28

     accuracy                           0.53      4997
    macro avg       0.48      0.17      0.19      4997
 weighted avg       0.50      0.53      0.45      4997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Random Forest Classifier Training (Alternative Model)
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_clf.fit(X_train, y_train)

In [None]:
# Evaluate Random Forest
rf_pred = rf_clf.predict(X_test)
print("\nRandom Forest Classifier")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print("\nClassification Report:\n", classification_report(y_test, rf_pred))