In [43]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import re
import time
import html
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

# Download necessary NLTK resources
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

In [None]:
# Create output directory for results
os.makedirs('model_results', exist_ok=True)


df_train = pd.read_csv('complete_cleaned_dataset.csv')
print(f"Dataset loaded with shape: {df_train.shape}")

Text preprocessing functions

In [15]:
custom_stopwords = [
    # Decision-specific terms
    'decision', 'decisions',
    'this decision', 'decision is', 'decision that', 'decision shall',
    # Regulation-specific terms
    'regulation', 'regulations', 'this regulation', 'regulation that', 'the regulation',
    'the directive', 
    # Directive-specific terms
    'directive', 'directives', 'this directive', 'the directive', 'directive is', 
    'directive shall', 'directive to'
]

def remove_punctuation(text):
    if isinstance(text, str):
        nopunc = [char for char in text if char not in string.punctuation]
        return ''.join(nopunc)
    return ""

def remove_stopwords(text):
    if isinstance(text, str):
        clean = [word for word in text.split() if word.lower() not in stopwords.words('english')]
        
        return ' '.join(clean)
    return ""

def clean_text(text):
    if isinstance(text, str):
      

        for term in custom_stopwords:
           text = re.sub(r'\b' + term + r'\b', '', text, flags=re.IGNORECASE)
        
        return text
    return ""

def stem_text(text):
    if isinstance(text, str):
        porter = nltk.PorterStemmer()
        stemmed_words = [porter.stem(word) for word in text.split()]
        return ' '.join(stemmed_words)
    return ""

In [16]:
df_train['combined_text'] = df_train['header'] +     " " + df_train['recitals'] 


In [17]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Text cleaning

In [19]:
from nltk.corpus import stopwords

# Load stopwords once
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    if isinstance(text, str):
        clean = [word for word in text.split() if word.lower() not in stop_words]
        return ' '.join(clean)
    return ""

# Apply the function
df_train['combined_text'] = df_train['combined_text'].apply(remove_stopwords)


In [20]:
X_train = df_train['combined_text']
y_train = df_train['type']


In [21]:
df_test = pd.read_csv('test_dataset.csv')

df_test['combined_text'] = df_test['header'] + " " + df_test['recitals'] + " " + df_test['main_body']

from nltk.corpus import stopwords

# Load stopwords once
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    if isinstance(text, str):
        clean = [word for word in text.split() if word.lower() not in stop_words]
        return ' '.join(clean)
    return ""

# Apply the function

df_test['combined_text'] = df_test['combined_text'].apply(remove_stopwords) 

X_test = df_test['combined_text']
y_test = df_test['type']



Encode target variable

In [23]:
print("\nEncoding target variable...")
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([y_train, y_test]))

label_encoder.fit(pd.concat([y_train, y_test]))



# Print encoded class mapping
print("\nClass encoding mapping:")
for i, class_name in enumerate(label_encoder.classes_):
    print(f"{i}: {class_name}")


Encoding target variable...

Class encoding mapping:
0: Decision
1: Directive
2: Regulation


In [27]:
#Split the data
print("\nSplitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
print(f"Data split into training set ({X_train.shape[0]} samples) and testing set ({X_test.shape[0]} samples)")



Splitting data into training and test sets...
Data split into training set (36000 samples) and testing set (9000 samples)


In [28]:
# Convert to encoded versions for models that need it
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [30]:
print("\nPerforming feature extraction...")
count_vect = CountVectorizer(max_features=2000)
X_train_count = count_vect.fit_transform(X_train)
X_test_count = count_vect.transform(X_test)



Performing feature extraction...


In [31]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
X_test_tfidf = tfidf_transformer.transform(X_test_count)

print(f"TF-IDF features shape: {X_train_tfidf.shape}")

TF-IDF features shape: (36000, 2000)


# Function to train and evaluate a model


In [33]:
def train_and_evaluate(model, name, X_train, X_test, y_train, y_test, needs_encoded=False):
    print(f"\n{'-'*20} Training {name} {'-'*20}")
    start = time.time()
    
    # Use encoded labels if needed
    train_y = y_train_encoded if needs_encoded else y_train
    test_y = y_test_encoded if needs_encoded else y_test
    
    # Train the model
    model.fit(X_train, train_y)
    train_time = time.time() - start
    
    # Make predictions
    if needs_encoded:
        y_pred_encoded = model.predict(X_test)
        y_pred = label_encoder.inverse_transform(y_pred_encoded)
    else:
        y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Print results
    print(f"{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Training time: {train_time:.2f} seconds")
    
    # Save detailed report to file
    with open(f"model_results/{name.replace(' ', '_').lower()}_report.txt", 'w') as f:
        f.write(f"{name} Results:\n")
        f.write(f"Accuracy: {accuracy:.4f}\n")
        f.write(f"Training time: {train_time:.2f} seconds\n\n")
        f.write("Classification Report:\n")
        f.write(classification_report(y_test, y_pred))
    
    # Create confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=label_encoder.classes_, 
                yticklabels=label_encoder.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix - {name}')
    plt.tight_layout()
    plt.savefig(f"model_results/{name.replace(' ', '_').lower()}_confusion_matrix.png")
    plt.close()
    
    # Save the model
    joblib.dump(model, f"model_results/{name.replace(' ', '_').lower()}_model.joblib")
    
    return {
        'name': name,
        'accuracy': accuracy,
        'training_time': train_time,
        'model': model
    }


In [34]:
results = []


# 1. Multinomial Naive Bayes

In [36]:
mnb_model = MultinomialNB()
mnb_results = train_and_evaluate(mnb_model, "Multinomial Naive Bayes", X_train_tfidf, X_test_tfidf, y_train, y_test)
results.append(mnb_results)


-------------------- Training Multinomial Naive Bayes --------------------
Multinomial Naive Bayes Results:
Accuracy: 0.9022
Training time: 0.38 seconds


In [23]:
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_results = train_and_evaluate(lr_model, "Logistic Regression", X_train_tfidf, X_test_tfidf, y_train, y_test)
results.append(lr_results)


-------------------- Training Logistic Regression --------------------
Logistic Regression Results:
Accuracy: 0.9911
Training time: 9.68 seconds


In [24]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear', class_weight='balanced')
svm_results = train_and_evaluate(svm_model, "Support Vector Machine", X_train_tfidf, X_test_tfidf, y_train, y_test)
results.append(svm_results)


-------------------- Training Support Vector Machine --------------------


KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=50, max_depth=20, min_samples_split=5, random_state=42)
rf_model.fit(X_train_tfidf, y_train)


y_pred = rf_model.predict(X_test_tfidf)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

# Print results
print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



NameError: name 'start' is not defined