In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline

In [3]:
# Load the dataset
df = pd.read_csv(r"C:\Users\yoshi\OneDrive\Desktop\CSMaster\CS439\FInalProj\legalData\legal_text_classification.csv")

# Display the first few rows
print(df.head())

# Print the column names to verify
print("Columns in the dataset:", df.columns)

# Check for missing values
print(df.isnull().sum())

# Check the distribution of case outcomes
if 'case_outcome' in df.columns:
    print(df['case_outcome'].value_counts())
else:
    print("The 'case_outcome' column is not present in the dataset.") 
    

  case_id case_outcome                                         case_title  \
0   Case1        cited  Alpine Hardwood (Aust) Pty Ltd v Hardys Pty Lt...   
1   Case2        cited  Black v Lipovac [1998] FCA 699 ; (1998) 217 AL...   
2   Case3        cited  Colgate Palmolive Co v Cussons Pty Ltd (1993) ...   
3   Case4        cited  Dais Studio Pty Ltd v Bullett Creative Pty Ltd...   
4   Case5        cited  Dr Martens Australia Pty Ltd v Figgins Holding...   

                                           case_text  
0  Ordinarily that discretion will be exercised s...  
1  The general principles governing the exercise ...  
2  Ordinarily that discretion will be exercised s...  
3  The general principles governing the exercise ...  
4  The preceding general principles inform the ex...  
Columns in the dataset: Index(['case_id', 'case_outcome', 'case_title', 'case_text'], dtype='object')
case_id           0
case_outcome      0
case_title        0
case_text       176
dtype: int64
case_outcome

In [4]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Tokenize text
        tokens = word_tokenize(text)
        # Remove stopwords and lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        # Join tokens back into a string
        return ' '.join(tokens)
    else:
        return ''

# Apply preprocessing to the case_text column
df['cleaned_text'] = df['case_text'].apply(preprocess_text)

# Display the cleaned text
print(df[['case_text', 'cleaned_text']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yoshi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                           case_text  \
0  Ordinarily that discretion will be exercised s...   
1  The general principles governing the exercise ...   
2  Ordinarily that discretion will be exercised s...   
3  The general principles governing the exercise ...   
4  The preceding general principles inform the ex...   

                                        cleaned_text  
0  ordinarily discretion exercised cost follow ev...  
1  general principle governing exercise discretio...  
2  ordinarily discretion exercised cost follow ev...  
3  general principle governing exercise discretio...  
4  preceding general principle inform exercise di...  


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'], df['case_outcome'], test_size=0.2, random_state=777
)

# Display the shapes of the resulting datasets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (19988,) (19988,)
Testing set shape: (4997,) (4997,)


In [6]:
# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf.transform(X_test)

In [7]:
# Create a pipeline with scaling and logistic regression
pipeline = Pipeline([
    ('scaler', MaxAbsScaler()),
    ('logistic', LogisticRegression(max_iter=500, solver='liblinear', C=1.0, penalty='l2'))
])

# Train the model
pipeline.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = pipeline.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5555333199919952
Classification Report:
                precision    recall  f1-score   support

     affirmed       0.60      0.33      0.43        18
      applied       0.35      0.18      0.24       488
     approved       0.33      0.05      0.08        22
        cited       0.61      0.84      0.71      2471
   considered       0.36      0.17      0.23       335
    discussed       0.41      0.12      0.19       237
distinguished       0.55      0.16      0.25       112
     followed       0.42      0.27      0.33       430
  referred to       0.48      0.44      0.46       857
      related       1.00      0.15      0.26        27

     accuracy                           0.56      4997
    macro avg       0.51      0.27      0.32      4997
 weighted avg       0.52      0.56      0.51      4997



In [10]:
# Save the pipeline (model) and vectorizer
joblib.dump(pipeline, 'legal_text_classification_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [11]:
# Load the model and vectorizer
model = joblib.load('legal_text_classification_model.pkl')
tfidf = joblib.load('tfidf_vectorizer.pkl')

# Preprocess new text
new_text = "This is a new legal case about intellectual property."
cleaned_text = preprocess_text(new_text)

# Transform the text using TF-IDF
new_text_tfidf = tfidf.transform([cleaned_text])

# Make a prediction
prediction = model.predict(new_text_tfidf)
print("Predicted Label:", prediction[0])

Predicted Label: referred to
