In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline
import os


In [69]:
# Load the dataset
df = pd.read_csv(r"C:\Users\yoshi\OneDrive\Desktop\CSMaster\CS439\FInalProj\legalData\legal_text_classification.csv")

# Display the first few rows
print(df.head())

# Print the column names to verify
print("Columns in the dataset:", df.columns)

# Check for missing values
print(df.isnull().sum())

# Check the distribution of case outcomes
if 'case_outcome' in df.columns:
    print(df['case_outcome'].value_counts())
else:
    print("The 'case_outcome' column is not present in the dataset.") 
    

  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  
0  Ordinarily that discretion will be exercised s...  
1  The general principles governing the exercise ...  
2  Ordinarily that discretion will be exercised s...  
3  The general principles governing the exercise ...  
4  The preceding general principles inform the ex...  
Columns in the dataset: Index(['case_id', 'case_outcome', 'case_title', 'case_text'], dtype='object')
case_id           0
case_outcome      0
case_title        0
case_text       176
dtype: int64
case_outcome

In [70]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Tokenize text
        tokens = word_tokenize(text)
        # Remove stopwords and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        # Join tokens back into a string
        return ' '.join(tokens)
    else:
        return ''

# Apply preprocessing to the case_text column
df['cleaned_text'] = df['case_text'].apply(preprocess_text)

# Display the cleaned text
print(df[['case_text', 'cleaned_text']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                           case_text  \
0  Ordinarily that discretion will be exercised s...   
1  The general principles governing the exercise ...   
2  Ordinarily that discretion will be exercised s...   
3  The general principles governing the exercise ...   
4  The preceding general principles inform the ex...   

                                        cleaned_text  
0  ordinarily discretion exercised cost follow ev...  
1  general principle governing exercise discretio...  
2  ordinarily discretion exercised cost follow ev...  
3  general principle governing exercise discretio...  
4  preceding general principle inform exercise di...  


In [71]:
keywords = {
    'family': [
        'children', 'custody', 'divorce', 'marriage', 'adoption', 
        'parenting orders', 'child support', 'spousal maintenance',
        'family violence', 'guardianship', 'prenuptial agreements'
    ],
    'property': [
        'property', 'ownership', 'land', 'real estate', 'lease',
        'easements', 'mortgages', 'foreclosure', 'zoning',
        'landlord', 'tenant', 'eviction'
    ],
    'criminal': [
        'theft', 'murder', 'assault', 'fraud', 'crime',
        'sentencing', 'bail', 'parole', 'prosecution',
        'homicide', 'robbery', 'drug offenses'
    ],
    'business': [
        'contract', 'agreement', 'corporation', 
        'partnership', 'mergers', 'franchises',
        'intellectual property', 'trade practices'
    ],
    'financial_and_securities': [
        'securities', 'investments', 
        'insider trading', 
        'market manipulation',
        'financial services'
    ],
    'administrative': [
        "judicial review", "government decisions", "statutory interpretation"
    ],
    "employment": ["workers comp"]
}

def assign_category(text):
    for category, words in keywords.items():
        if any(word in text.lower() for word in words):
            return category
    return 'other'  # Default category if no keywords match

# Apply the function to the cleaned text column
df['case_category'] = df['cleaned_text'].apply(assign_category)

# Display the sorted dataset
print(df[['cleaned_text', 'case_category']].head())

# Save the sorted dataset to a CSV file
output_file = 'legalData/sorted_legal_text.csv'
df.to_csv(output_file, index=False)

print(f"Sorted dataset saved to {output_file}")

                                        cleaned_text case_category
0  ordinarily discretion exercised cost follow ev...         other
1  general principle governing exercise discretio...         other
2  ordinarily discretion exercised cost follow ev...         other
3  general principle governing exercise discretio...         other
4  preceding general principle inform exercise di...         other
Sorted dataset saved to legalData/sorted_legal_text.csv


In [72]:
output_dir = 'legalData'
os.makedirs(output_dir, exist_ok=True)

# Save the cleaned text to a CSV file
output_file = os.path.join(output_dir, 'cleaned_legal_text.csv')
df[['case_text', 'cleaned_text']].to_csv(output_file, index=False)

print(f"Cleaned text saved to {output_file}")

Cleaned text saved to legalData\cleaned_legal_text.csv


In [73]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'], df['case_category'], test_size=0.2, random_state=777
)

# Create a pipeline with TF-IDF vectorization, scaling, and logistic regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),  # Convert text to numerical features
    ('scaler', MaxAbsScaler()),                     # Scale the features
    ('logistic', LogisticRegression(max_iter=500, solver='liblinear', C=1.0, penalty='l2'))
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8601160696417851
Classification Report:
                           precision    recall  f1-score   support

          administrative       0.92      0.57      0.70       120
                business       0.87      0.75      0.81       800
                criminal       0.88      0.59      0.70       219
                  family       0.99      0.53      0.69       143
financial_and_securities       0.00      0.00      0.00         1
                   other       0.85      0.97      0.91      2604
                property       0.86      0.81      0.83      1110

                accuracy                           0.86      4997
               macro avg       0.77      0.60      0.66      4997
            weighted avg       0.86      0.86      0.85      4997



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [74]:
# Save the pipeline (model) and vectorizer
joblib.dump(pipeline, 'legal_text_classification_model.pkl')

['legal_text_classification_model.pkl']

In [75]:
import joblib

# Load the pre-trained model and TF-IDF vectorizer
model = joblib.load('legal_text_classification_model.pkl')
tfidf = joblib.load('tfidf_vectorizer.pkl')

# Define a function to predict the category of a legal document
def predict_category(text):
    # Preprocess the input text
    cleaned_text = preprocess_text(text)
    
    # Transform the text using the TF-IDF vectorizer
    text_tfidf = tfidf.transform([cleaned_text])
    
    # Make a prediction
    prediction = model.predict(text_tfidf)
    
    return prediction[0]
