In [24]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
import time
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

True

In [None]:
# Load datasets
df_train = pd.read_csv('../complete_cleaned_dataset.csv')
df_test = pd.read_csv('../test_dataset.csv')

# Sample data (adjust size as needed)
sample_train_size = 45000
sample_test_size = 6000

if len(df_train) > sample_train_size:
    df_train = df_train.sample(sample_train_size, random_state=42)

if len(df_test) > sample_test_size:
    df_test = df_test.sample(sample_test_size, random_state=42)
    
# Create balanced dataset
df_train_directive = df_train[df_train['type'] == 'Directive']
df_train_regulation = df_train[df_train['type'] == 'Regulation'].sample(frac=0.60, random_state=42)
df_train_decision = df_train[df_train['type'] == 'Decision'].sample(frac=0.60, random_state=42)

df_train = pd.concat([df_train_directive, df_train_regulation, df_train_decision])
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

# Do the same for test data
df_test_directive = df_test[df_test['type'] == 'Directive']
df_test_regulation = df_test[df_test['type'] == 'Regulation'].sample(frac=0.60, random_state=42)
df_test_decision = df_test[df_test['type'] == 'Decision'].sample(frac=0.60, random_state=42)

df_test = pd.concat([df_test_directive, df_test_regulation, df_test_decision])
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)

In [26]:
# Simple function to clean text
def clean_text(text):
    if not isinstance(text, str):
        return ""
    
    # Lowercase and remove non-alphanumeric
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Function to tokenize text
def tokenize_text(text):
    if not isinstance(text, str):
        return []
    
    # Tokenize and remove stopwords
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stopwords.words('english')]
    
    return tokens

# Process training data
df_train['header'] = df_train['header'].fillna('')
df_train['recitals'] = df_train['recitals'].fillna('')
df_train['main_body'] = df_train['main_body'].fillna('')
df_train['combined_text'] = df_train['header'] + " " + df_train['recitals'] + " " + df_train['main_body']
df_train['combined_text'] = df_train['combined_text'].apply(clean_text)

# Process test data
df_test['header'] = df_test['header'].fillna('')
df_test['recitals'] = df_test['recitals'].fillna('')
df_test['main_body'] = df_test['main_body'].fillna('')
df_test['combined_text'] = df_test['header'] + " " + df_test['recitals'] + " " + df_test['main_body']
df_test['combined_text'] = df_test['combined_text'].apply(clean_text)

# Prepare features and targets
X_train = df_train['combined_text']
y_train = df_train['type']
X_test = df_test['combined_text']
y_test = df_test['type']

# Encode target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [27]:
# Create N-gram features using CountVectorizer
ngram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),  # Use both unigrams and bigrams
    max_features=2000,   # Limit to top 2000 features
    min_df=5,            # Ignore terms that appear in less than 5 documents
    stop_words=stopwords.words('english')
)

# Fit and transform the training data
X_train_ngrams = ngram_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_ngrams = ngram_vectorizer.transform(X_test)

print(f"N-gram features shape for training: {X_train_ngrams.shape}")
print(f"N-gram features shape for testing: {X_test_ngrams.shape}")

# Get feature names (optional, for exploration)
feature_names = ngram_vectorizer.get_feature_names_out()
print(f"Top 10 N-gram features: {feature_names[:10]}")

N-gram features shape for training: (10418, 2000)
N-gram features shape for testing: (1392, 2000)
Top 10 N-gram features: ['00' '000' '000 tonnes' '10' '10 regulation' '10 thereof' '100'
 '100 kilograms' '11' '11 regulation']


In [28]:
# Train SVM model
svm_model = LinearSVC(C=1.0, dual=False, class_weight='balanced', max_iter=2000)
svm_model.fit(X_train_ngrams, y_train)

# Make predictions
y_pred_svm = svm_model.predict(X_test_ngrams)

# Calculate and print accuracy
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.4f}")

# Print classification report
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.9971

SVM Classification Report:
              precision    recall  f1-score   support

    Decision       1.00      0.99      0.99       359
   Directive       0.99      1.00      1.00       240
  Regulation       1.00      1.00      1.00       793

    accuracy                           1.00      1392
   macro avg       1.00      1.00      1.00      1392
weighted avg       1.00      1.00      1.00      1392



In [29]:
# Train Multinomial Naive Bayes
mnb_model = MultinomialNB()
mnb_model.fit(X_train_ngrams, y_train)

# Make predictions
y_pred_mnb = mnb_model.predict(X_test_ngrams)

# Calculate and print accuracy
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)
print(f"Multinomial Naive Bayes Accuracy: {accuracy_mnb:.4f}")

# Print classification report
print("\nMultinomial Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_mnb))

Multinomial Naive Bayes Accuracy: 0.9641

Multinomial Naive Bayes Classification Report:
              precision    recall  f1-score   support

    Decision       0.94      0.93      0.93       359
   Directive       0.90      0.95      0.93       240
  Regulation       0.99      0.98      0.99       793

    accuracy                           0.96      1392
   macro avg       0.95      0.96      0.95      1392
weighted avg       0.96      0.96      0.96      1392



In [30]:
# Convert sparse matrix to dense for Gaussian NB
X_train_dense = X_train_ngrams.toarray()
X_test_dense = X_test_ngrams.toarray()

# Train Gaussian Naive Bayes
gnb_model = GaussianNB()
gnb_model.fit(X_train_dense, y_train)

# Make predictions
y_pred_gnb = gnb_model.predict(X_test_dense)

# Calculate and print accuracy
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)
print(f"Gaussian Naive Bayes Accuracy: {accuracy_gnb:.4f}")

# Print classification report
print("\nGaussian Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_gnb))

Gaussian Naive Bayes Accuracy: 0.9935

Gaussian Naive Bayes Classification Report:
              precision    recall  f1-score   support

    Decision       0.99      0.98      0.99       359
   Directive       0.98      0.99      0.98       240
  Regulation       1.00      1.00      1.00       793

    accuracy                           0.99      1392
   macro avg       0.99      0.99      0.99      1392
weighted avg       0.99      0.99      0.99      1392



In [31]:
# Train Logistic Regression
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_model.fit(X_train_ngrams, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test_ngrams)

# Calculate and print accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")

# Print classification report
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.9950

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    Decision       1.00      0.99      0.99       359
   Directive       0.99      1.00      0.99       240
  Regulation       1.00      1.00      1.00       793

    accuracy                           0.99      1392
   macro avg       0.99      0.99      0.99      1392
weighted avg       0.99      0.99      0.99      1392



In [32]:
# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=5, random_state=42)
rf_model.fit(X_train_ngrams, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test_ngrams)

# Calculate and print accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

# Print classification report
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 1.0000

Random Forest Classification Report:
              precision    recall  f1-score   support

    Decision       1.00      1.00      1.00       359
   Directive       1.00      1.00      1.00       240
  Regulation       1.00      1.00      1.00       793

    accuracy                           1.00      1392
   macro avg       1.00      1.00      1.00      1392
weighted avg       1.00      1.00      1.00      1392



In [33]:
# Display top N-gram features for each class (for Multinomial Naive Bayes)
top_n = 10  # Number of top features to show

for i, class_name in enumerate(label_encoder.classes_):
    # Get feature importance from Multinomial NB
    feature_importance = mnb_model.feature_log_prob_[i]
    
    # Get indices of top features
    top_indices = feature_importance.argsort()[-top_n:][::-1]
    
    print(f"\nTop {top_n} features for class '{class_name}':")
    for idx in top_indices:
        print(f"  - {feature_names[idx]}")


Top 10 features for class 'Decision':
  - decision
  - article
  - european
  - commission
  - community
  - whereas
  - council
  - shall
  - regulation
  - member

Top 10 features for class 'Directive':
  - directive
  - shall
  - member
  - states
  - member states
  - article
  - commission
  - whereas
  - european
  - annex

Top 10 features for class 'Regulation':
  - regulation
  - shall
  - ec
  - article
  - regulation ec
  - commission
  - european
  - eec
  - whereas
  - regulation eec
