In [None]:
# --- Import Libraries ---
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import os
import joblib

In [None]:
# --- Download NLTK resources ---
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases, and 'omw-1.4' is required by the WordNet reader on some releases.
# Fetching them next to the original three keeps a fresh-kernel run working
# regardless of the installed NLTK version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# --- Load Dataset ---
# The CSV location can be overridden through the LEGAL_TEXT_CSV environment
# variable so the notebook is runnable on other machines; when the variable is
# not set, the original hard-coded location is used unchanged.
file_path = os.environ.get(
    'LEGAL_TEXT_CSV',
    r"C:\Users\yoshi\OneDrive\Desktop\CSMaster\CS439\FInalProj\legalData\legal_text_classification.csv",
)
df = pd.read_csv(file_path)

# The preprocessing cell reads df['case_text']; fail here with a clear message
# rather than several cells later if the file does not have that column.
if 'case_text' not in df.columns:
    raise KeyError(
        f"Expected a 'case_text' column in {file_path}; found columns: {list(df.columns)}"
    )

# Display dataset info
print("Dataset loaded:")
print(df.head())
print("Columns:", df.columns)


In [None]:
# Shared NLP resources for text cleaning: the English stop-word list held as a
# set for fast membership tests, and a single WordNet lemmatizer instance.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one document into a space-joined string of lemmatised tokens.

    Pipeline: lower-case the text, replace every character that is not an
    ASCII letter or whitespace with a space, tokenise, drop stop words, and
    lemmatise what remains.

    Non-letter characters are replaced by a space rather than deleted so that
    the words on either side of punctuation are not glued together
    ("lease/rent" -> "lease rent", not "leaserent") and so that contraction
    fragments ("don't" -> "don t") are removed by the stop-word filter instead
    of surviving as a new token ("dont").

    Args:
        text: Raw document text. Any value that is not a ``str`` (for example
            NaN from an empty CSV cell) is treated as an empty document.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when the input
        is not a string or no tokens survive filtering.
    """
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    # Substitute a space (not the empty string) to preserve word boundaries.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', lowered)
    kept_tokens = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in stop_words
    )
    return ' '.join(kept_tokens)

# Clean every case text and store the result alongside the raw column.
cleaned_series = df['case_text'].apply(preprocess_text)
df['cleaned_text'] = cleaned_series

print("Preprocessed text:")
preview_columns = ['case_text', 'cleaned_text']
print(df[preview_columns].head())


In [None]:
# Keyword lists per legal category. Dictionary order matters: category lookup
# in this notebook walks the dictionary in insertion order.
# NOTE(review): these terms are compared against lemmatised text elsewhere in
# the notebook, so plural entries (e.g. 'children', 'mortgages') may never
# match -- confirm and consider listing singular forms.
keywords = {
    'family': [
        'children',
        'custody',
        'divorce',
        'marriage',
        'adoption',
        'parenting orders',
        'child support',
        'spousal maintenance',
        'family violence',
        'guardianship',
        'prenuptial agreements',
    ],
    'property': [
        'property',
        'ownership',
        'land',
        'real estate',
        'lease',
        'easements',
        'mortgages',
        'foreclosure',
        'zoning',
        'landlord',
        'tenant',
        'eviction',
    ],
    'criminal': [
        'theft',
        'murder',
        'assault',
        'fraud',
        'crime',
        'sentencing',
        'bail',
        'parole',
        'prosecution',
        'homicide',
        'robbery',
        'drug offenses',
    ],
    'business': [
        'contract',
        'agreement',
        'corporation',
        'partnership',
        'mergers',
        'franchises',
        'intellectual property',
        'trade practices',
    ],
    'financial_and_securities': [
        'securities',
        'investments',
        'insider trading',
        'market manipulation',
        'financial services',
    ],
    'administrative': [
        'judicial review',
        'government decisions',
        'statutory interpretation',
    ],
    'employment': [
        'workers comp',
    ],
}

def assign_category(text, keyword_map=None):
    """Return the first category whose keywords occur in ``text`` as whole words.

    Categories are tried in the iteration order of ``keyword_map``; the first
    one with at least one keyword (or multi-word phrase) present wins.
    Matching is case-insensitive and anchored on word boundaries, so a keyword
    such as 'land' does not fire on 'england' and 'lease' does not fire on
    'please' or 'release'.

    Args:
        text: Document text to classify. Values that are not ``str`` are
            treated as having no keywords.
        keyword_map: Optional mapping of category name to a list of keywords.
            When omitted, the module-level ``keywords`` dictionary is used.

    Returns:
        str: The matching category name, or ``'other'`` when nothing matches.
    """
    if not isinstance(text, str):
        return 'other'
    if keyword_map is None:
        keyword_map = keywords

    lowered = text.lower()  # computed once instead of once per keyword
    for category, words in keyword_map.items():
        if not words:
            # An empty alternation would match the empty string everywhere.
            continue
        alternatives = '|'.join(re.escape(word.lower()) for word in words)
        if re.search(r'\b(?:' + alternatives + r')\b', lowered):
            return category
    return 'other'

# Label each cleaned document with the first keyword category it matches.
keyword_categories = df['cleaned_text'].apply(assign_category)
df['case_category'] = keyword_categories

print("Assigned categories:")
category_preview = df[['cleaned_text', 'case_category']]
print(category_preview.head())


In [None]:
# Persist the raw and cleaned text side by side in the output directory,
# creating the directory first if it does not exist yet.
output_dir = 'legalData'
output_file = os.path.join(output_dir, 'cleaned_legal_text.csv')
os.makedirs(output_dir, exist_ok=True)

text_columns = df[['case_text', 'cleaned_text']]
text_columns.to_csv(output_file, index=False)
print(f"Cleaned text saved to {output_file}")

In [None]:
# Vectorise the cleaned corpus with TF-IDF (max_features=5000).
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['cleaned_text'])

# Use one cluster per keyword category; the fixed random_state keeps the
# cluster assignment repeatable between runs.
num_clusters = len(keywords)
kmeans = KMeans(n_clusters=num_clusters, random_state=777)
cluster_labels = kmeans.fit_predict(tfidf_matrix)
df['cluster'] = cluster_labels

# Map clusters to categories
# Each cluster is labelled with the category whose keywords occur most often
# across the cluster's documents. Taking the first category with *any* hit
# would label almost every sizeable cluster with whichever category is listed
# first, because a single stray keyword anywhere in the cluster is enough.
# Counting with a word-boundary regex also lets multi-word phrases such as
# 'judicial review' contribute, which a token-list lookup cannot do.
category_patterns = {
    category: r'\b(?:' + '|'.join(re.escape(word) for word in words) + r')\b'
    for category, words in keywords.items()
    if words  # an empty alternation would match the empty string everywhere
}

# One column per category: number of keyword occurrences in each document.
keyword_hits = pd.DataFrame(
    {
        category: df['cleaned_text'].str.count(pattern)
        for category, pattern in category_patterns.items()
    },
    index=df.index,
)
# Rows: cluster id, columns: category, values: total keyword occurrences.
cluster_hits = keyword_hits.groupby(df['cluster']).sum()

cluster_to_category = {}
for cluster in range(num_clusters):
    has_hits = cluster in cluster_hits.index and cluster_hits.loc[cluster].max() > 0
    if has_hits:
        # idxmax returns the first maximum, so ties fall back to the order in
        # which the categories are declared in `keywords`.
        cluster_to_category[cluster] = cluster_hits.loc[cluster].idxmax()
    else:
        cluster_to_category[cluster] = 'other'

# NOTE: this replaces the keyword-based 'case_category' column with the
# cluster-derived label.
df['case_category'] = df['cluster'].map(cluster_to_category)
print("Clustered dataset:")
print(df[['cleaned_text', 'cluster', 'case_category']].head())


In [None]:
# Write the whole DataFrame (all current columns) to the output directory.
clustered_file = os.path.join(output_dir, 'clustered_legal_text.csv')
df.to_csv(path_or_buf=clustered_file, index=False)
print(f"Clustered dataset saved to {clustered_file}")

In [None]:
# Clustering quality metrics computed on the TF-IDF matrix and the cluster
# labels stored in df['cluster'].
assigned_clusters = df['cluster']
silhouette_avg = silhouette_score(tfidf_matrix, assigned_clusters)
# The matrix is converted with .toarray() before being passed to this metric.
calinski_harabasz = calinski_harabasz_score(tfidf_matrix.toarray(), assigned_clusters)

print(f"Silhouette Score: {silhouette_avg:.2f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz:.2f}")

In [None]:
# Hold out 20% of the documents for evaluation; the fixed seed keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['case_category'],
    test_size=0.2,
    random_state=777,
)

# TF-IDF features -> max-abs scaling -> L2-penalised logistic regression.
# NOTE(review): recent scikit-learn releases may warn about the liblinear
# solver when there are more than two classes -- confirm against the
# installed version.
vectorizer_step = ('tfidf', TfidfVectorizer(max_features=5000))
scaling_step = ('scaler', MaxAbsScaler())
classifier_step = (
    'logistic',
    LogisticRegression(max_iter=500, solver='liblinear', C=1.0, penalty='l2'),
)
pipeline = Pipeline([vectorizer_step, scaling_step, classifier_step])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report held-out performance.
print("Classification Results:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Serialise the pipeline object to disk with joblib.
model_file = 'legal_text_classification_model.pkl'
joblib.dump(value=pipeline, filename=model_file)
print(f"Model saved to {model_file}")