The goal of this project is to develop a system that can classify legal documents (such as court rulings, briefs, and statutes) based on their content and predict outcomes of legal cases. This project tackles a real-world problem by assisting law firms and legal professionals in analyzing large volumes of legal documents, predicting case outcomes, and making data-driven decisions.

In [281]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r'C:\Users\ayush\Downloads\justice.csv\justice.csv')  # Replace with your dataset path

# Check the first few rows of the dataset
print(df.head())

# Assuming the text data is in a column called 'text'
texts = df['text'].tolist()


   Unnamed: 0     ID                     name  \
0           0  50606              Roe v. Wade   
1           1  50613      Stanley v. Illinois   
2           2  50623  Giglio v. United States   
3           3  50632             Reed v. Reed   
4           4  50643     Miller v. California   

                                      href   docket  term  \
0    https://api.oyez.org/cases/1971/70-18    70-18  1971   
1  https://api.oyez.org/cases/1971/70-5014  70-5014  1971   
2    https://api.oyez.org/cases/1971/70-29    70-29  1971   
3     https://api.oyez.org/cases/1971/70-4     70-4  1971   
4    https://api.oyez.org/cases/1971/70-73    70-73  1971   

           first_party   second_party  \
0             Jane Roe     Henry Wade   
1  Peter Stanley, Sr.        Illinois   
2         John Giglio   United States   
3           Sally Reed     Cecil Reed   
4        Marvin Miller     California   

                                               facts  facts_len  \
0  <p>In 1970, Jane Roe 

KeyError: 'text'

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [239]:
import torch

class LegalDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset objects
train_dataset = LegalDataset(train_encodings, train_labels.tolist())
test_dataset = LegalDataset(test_encodings, test_labels.tolist())


filtered_words ['see', 'outisde', 'court', 'get', 'hell']


In [241]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load a pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(train_labels)))

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,                         # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

# Train the model
trainer.train()


In [243]:
# Evaluate the model
eval_results = trainer.evaluate()

print(f"Evaluation results: {eval_results}")


Stemmed Words: ['see', 'outisd', 'court', 'get', 'hell']


In [245]:
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['see', 'outisde', 'court', 'get', 'hell']


In [247]:
from nltk import pos_tag

In [249]:
pos_tags=nltk.pos_tag(words)
print("pos tagged words" , pos_tags)

pos tagged words [('Will', 'MD'), ('see', 'VB'), ('You', 'PRP'), ('outisde', 'VB'), ('court', 'NN'), ('get', 'NN'), ('to', 'TO'), ('hell', 'VB')]


In [251]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [253]:
corpus = [
    "The court ruled in favor of the defendant.",
    "The defendant was found guilty of the crime.",
    "The judge dismissed the case due to lack of evidence.",
    "The jury awarded damages to the plaintiff.",
    "The case was settled out of court."
]

In [255]:
tfidf=TfidfVectorizer()
tfidf_matrix=tfidf.fit_transform(corpus)
tfidf_df=pd.DataFrame(tfidf_matrix.toarray() , columns=tfidf.get_feature_names_out())

In [257]:
print(tfidf_df)

    awarded      case     court     crime   damages  defendant  dismissed  \
0  0.000000  0.000000  0.343162  0.000000  0.000000   0.343162   0.000000   
1  0.000000  0.000000  0.000000  0.425341  0.000000   0.343162   0.000000   
2  0.000000  0.294062  0.000000  0.000000  0.000000   0.000000   0.364482   
3  0.424127  0.000000  0.000000  0.000000  0.424127   0.000000   0.000000   
4  0.000000  0.380444  0.380444  0.000000  0.000000   0.000000   0.000000   

        due  evidence     favor  ...      jury      lack        of       out  \
0  0.000000  0.000000  0.425341  ...  0.000000  0.000000  0.239630  0.000000   
1  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.239630  0.000000   
2  0.364482  0.364482  0.000000  ...  0.000000  0.364482  0.205343  0.000000   
3  0.000000  0.000000  0.000000  ...  0.424127  0.000000  0.000000  0.000000   
4  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.265664  0.471551   

   plaintiff     ruled   settled       the        to    

In [259]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, labels, test_size=0.2, random_state=42)

In [261]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initialize the SVM classifier with a linear kernel
svm_classifier = SVC(kernel='linear', random_state=42)

# Train the classifier
svm_classifier.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred , zero_division=1))

Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

       civil       0.00      1.00      0.00       0.0
    criminal       1.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.50      0.50      0.00       1.0
weighted avg       1.00      0.00      0.00       1.0



In [214]:
from sklearn.model_selection import train_test_split

In [216]:
labels=['criminal', 'criminal' , 'civil' , 'civil' , 'civil']

In [218]:
print("Number of samples in X:", tfidf_df.shape[0])
print("Number of samples in y:", len(labels))


Number of samples in X: 5
Number of samples in y: 5


In [220]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, labels, test_size=0.2, random_state=42)

In [222]:
print("X_train shape:", X_train.shape)
print("y_train shape:", len(y_train))
print("X_test shape:", X_test.shape)
print("y_test shape:", len(y_test))

X_train shape: (4, 24)
y_train shape: 4
X_test shape: (1, 24)
y_test shape: 1


In [224]:
from sklearn.linear_model import LogisticRegression

In [226]:
lr_l1=LogisticRegression(penalty='l1' , solver='liblinear', random_state=42)
lr_l1.fit(X_train,y_train)
y_pred_l1=lr_l1.predict(X_test)

In [230]:
from sklearn.metrics import accuracy_score, classification_report
print("L1 Regularization - Accuracy:", accuracy_score(y_test, y_pred_l1))
print("L1 Regularization - Classification Report:\n", classification_report(y_test, y_pred_l1 ,zero_division=1 ))

L1 Regularization - Accuracy: 0.0
L1 Regularization - Classification Report:
               precision    recall  f1-score   support

       civil       0.00      1.00      0.00       0.0
    criminal       1.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.50      0.50      0.00       1.0
weighted avg       1.00      0.00      0.00       1.0



In [206]:
import numpy as np

# Count the occurrences of each class in the training data
class_counts = np.bincount(y_train)
print("Class Distribution in y_train:", class_counts)


ValueError: invalid literal for int() with base 10: 'civil'

In [232]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Initialize the GBM classifier with class weights
gbm_classifier = GradientBoostingClassifier(
    random_state=42,
    n_estimators=150,  # Increase the number of trees
    learning_rate=0.05,  # Reduce learning rate
    max_depth=5,  # Slightly deeper trees to allow more complex learning
    subsample=0.8  # Use only 80% of data for each tree to add randomness
)

# Train the model
gbm_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = gbm_classifier.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))


Accuracy: 0.0
Classification Report:
               precision    recall  f1-score   support

       civil       0.00      1.00      0.00       0.0
    criminal       1.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.50      0.50      0.00       1.0
weighted avg       1.00      0.00      0.00       1.0

