# REQUIRED LIBRARY

In [1]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install essential libraries
!pip install -q transformers torch torch-geometric scikit-learn pandas numpy

# Baseline Approach - TF-IDF + KMeans Clustering with Balanced Data Sampling Undersampling

TF-IDF + KMeans Clustering with Balanced Data Sampling via Undersampling: This approach aims to address class imbalance by sampling an equal number of data points from both classes. The rationale is that equal representation might allow the KMeans algorithm to form more distinct clusters corresponding to each class.

In [None]:
import json
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch_geometric.data import Data
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and Parse JSON Data with error handling
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSONDecodeError: {e}")
                continue
    return data


# Preprocess each row to concatenate cell values
def preprocess_row(row):
    return ' '.join([cell['value'] for cell in row['values']])

# Convert dataset into a structured format
def prepare_data(data):
    rows, labels = [], []
    for doc in data:
        for row in doc:
            text = preprocess_row(row)
            rows.append(text)
            labels.append(1 if row.get('type') == 'HEADERS' else 0)
    return rows, labels

# Split data into train and test with a smaller test size
def split_data(rows, labels):
    return train_test_split(rows, labels, test_size=0.05, random_state=42)

# Improved Baseline Approach - TF-IDF + KMeans Clustering with Balanced Data Sampling
def tfidf_kmeans(train_rows, train_labels, test_rows, test_labels):
    # Balance training data by undersampling
    train_data = pd.DataFrame({'text': train_rows, 'label': train_labels})
    headers = train_data[train_data['label'] == 1]
    non_headers = train_data[train_data['label'] == 0].sample(n=len(headers), random_state=42)
    balanced_data = pd.concat([headers, non_headers])

    vectorizer = TfidfVectorizer(max_features=500)
    balanced_features = vectorizer.fit_transform(balanced_data['text'])
    test_features = vectorizer.transform(test_rows)

    # Use KMeans clustering on balanced data
    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans.fit(balanced_features)

    # Predict and evaluate
    balanced_labels = balanced_data['label'].values
    predictions = kmeans.predict(test_features)
    predictions = [1 if pred == kmeans.labels_[0] else 0 for pred in predictions]  # Adjust cluster to label mapping

    print("Improved KMeans Clustering Results:")
    print(classification_report(test_labels, predictions))

# Load Data
file_path = '/content/drive/MyDrive/Dataset/document-standardization-training-dataset.txt'
data = load_data(file_path)
rows, labels = prepare_data(data)
train_rows, test_rows, train_labels, test_labels = split_data(rows, labels)

# Experiment with Improved Baseline Method
tfidf_kmeans(train_rows, train_labels, test_rows, test_labels)

Improved KMeans Clustering Results:
              precision    recall  f1-score   support

           0       1.00      0.20      0.34    122125
           1       0.04      1.00      0.07      3687

    accuracy                           0.23    125812
   macro avg       0.52      0.60      0.20    125812
weighted avg       0.97      0.23      0.33    125812



# Baseline Approach - TF-IDF + KMeans Clustering with Balanced Data Sampling using Oversampling

TF-IDF + KMeans Clustering with Balanced Data Sampling via Oversampling: Similar to the first method, this approach also addresses class imbalance but by oversampling the minority class (headers) to match the majority class (non-headers). The goal is to prevent the model from being biased toward the majority class.

In [None]:
import json
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.utils import resample
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch_geometric.data import Data
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and Parse JSON Data with error handling
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSONDecodeError: {e}")
                continue
    return data

# Preprocess each row to concatenate cell values
def preprocess_row(row):
    return ' '.join([cell['value'] for cell in row['values']])

# Convert dataset into a structured format
def prepare_data(data):
    rows, labels = [], []
    for doc in data:
        for row in doc:
            text = preprocess_row(row)
            rows.append(text)
            labels.append(1 if row.get('type') == 'HEADERS' else 0)
    return rows, labels

# Split data into train and test with a smaller test size
def split_data(rows, labels):
    return train_test_split(rows, labels, test_size=0.05, random_state=42)

# Improved Baseline Approach - TF-IDF + KMeans Clustering with Balanced Data Sampling using Oversampling
def tfidf_kmeans_oversampling(train_rows, train_labels, test_rows, test_labels):
    # Balance training data by oversampling
    train_data = pd.DataFrame({'text': train_rows, 'label': train_labels})
    headers = train_data[train_data['label'] == 1]
    non_headers = train_data[train_data['label'] == 0]

    # Oversample the minority class (headers) to match the number of majority class samples
    if len(headers) < len(non_headers):
        headers = resample(headers, replace=True, n_samples=len(non_headers), random_state=42)
    else:
        non_headers = resample(non_headers, replace=True, n_samples=len(headers), random_state=42)

    balanced_data = pd.concat([headers, non_headers])

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=500)
    balanced_features = vectorizer.fit_transform(balanced_data['text'])
    test_features = vectorizer.transform(test_rows)

    # Use KMeans clustering on balanced data
    kmeans = KMeans(n_clusters=2, random_state=42)
    kmeans.fit(balanced_features)

    # Predict and evaluate
    balanced_labels = balanced_data['label'].values
    predictions = kmeans.predict(test_features)
    predictions = [1 if pred == kmeans.labels_[0] else 0 for pred in predictions]  # Adjust cluster to label mapping

    print("Improved KMeans Clustering Results with Oversampling:")
    print(classification_report(test_labels, predictions))

# Load Data
file_path = '/content/drive/MyDrive/Dataset/document-standardization-training-dataset.txt'
data = load_data(file_path)
rows, labels = prepare_data(data)
train_rows, test_rows, train_labels, test_labels = split_data(rows, labels)

# Experiment with Improved Baseline Method using Oversampling
tfidf_kmeans_oversampling(train_rows, train_labels, test_rows, test_labels)


Improved KMeans Clustering Results with Oversampling:
              precision    recall  f1-score   support

           0       0.07      0.00      0.00    122125
           1       0.02      0.74      0.04      3687

    accuracy                           0.02    125812
   macro avg       0.05      0.37      0.02    125812
weighted avg       0.07      0.02      0.00    125812



# Enhanced Approach with TF-IDF + SMOTE + RandomForest

Enhanced RandomForest with SMOTE:


TF-IDF Vectorization: This converts text data into a numerical format suitable for machine learning algorithms, capturing the importance of each word within the dataset documents.


Feature Selection using Chi-Squared Test: This helps in selecting the most relevant features for the classifier, reducing dimensionality and potentially improving model performance.


SMOTE (Synthetic Minority Over-sampling Technique): This technique helps to overcome the issue of class imbalance by generating synthetic examples rather than over-sampling with replacement.


RandomForest Classifier: An ensemble learning method used for classification that operates by constructing multiple decision trees during training and outputs the class that is the mode of the classes of the individual trees.

In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2

# Load and Parse JSON Data with error handling
def load_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSONDecodeError: {e}")
                continue
    return data

# Preprocess each row to concatenate cell values
def preprocess_row(row):
    return ' '.join([cell['value'] for cell in row['values']])

# Convert dataset into a structured format
def prepare_data(data):
    rows, labels = [], []
    for doc in data:
        for row in doc:
            text = preprocess_row(row)
            rows.append(text)
            labels.append(1 if row.get('type') == 'HEADERS' else 0)
    return rows, labels

# Split data into train and test with a smaller test size
def split_data(rows, labels):
    return train_test_split(rows, labels, test_size=0.05, random_state=42)

# Enhanced Approach with TF-IDF + SMOTE + RandomForest
def tfidf_smote_rf(train_rows, train_labels, test_rows, test_labels):
    # TF-IDF Vectorization with Feature Selection
    vectorizer = TfidfVectorizer(max_features=500)
    selector = SelectKBest(chi2, k=300)  # Select top 300 features

    # SMOTE for balancing the classes
    smote = SMOTE(random_state=42)

    # Classifier
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # Create Pipeline
    pipeline = Pipeline([
        ('tfidf', vectorizer),
        ('feature_selection', selector),
        ('smote', smote),
        ('classifier', classifier)
    ])

    # Transform train_rows to match SMOTE input requirements
    train_tfidf = vectorizer.fit_transform(train_rows)
    train_tfidf = selector.fit_transform(train_tfidf, train_labels)
    train_rows_balanced, train_labels_balanced = smote.fit_resample(train_tfidf, train_labels)

    # Train the model
    classifier.fit(train_rows_balanced, train_labels_balanced)

    # Evaluate on the test data
    test_tfidf = vectorizer.transform(test_rows)
    test_tfidf = selector.transform(test_tfidf)
    predictions = classifier.predict(test_tfidf)

    # Classification Report
    print("Enhanced RandomForest Results with SMOTE:")
    print(classification_report(test_labels, predictions))

# Load Data
file_path = '/content/drive/MyDrive/Dataset/document-standardization-training-dataset.txt'
data = load_data(file_path)
rows, labels = prepare_data(data)
train_rows, test_rows, train_labels, test_labels = split_data(rows, labels)

# Experiment with Enhanced Method
tfidf_smote_rf(train_rows, train_labels, test_rows, test_labels)


Enhanced RandomForest Results with SMOTE:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96    122125
           1       0.30      0.99      0.46      3687

    accuracy                           0.93    125812
   macro avg       0.65      0.96      0.71    125812
weighted avg       0.98      0.93      0.95    125812



# Transformer Model Training with MiniLM and Mixed Precision

MiniLM Training with Mixed Precision:


Tokenization: The use of MiniLM tokenizer to convert text into a format suitable for input into the MiniLM model.


MiniLM for Sequence Classification: A smaller, faster variant of the BERT model optimized for lower memory consumption and faster performance without substantial loss in model efficacy.


Mixed Precision Training: Utilizes both 32-bit and 16-bit floating-point types during training to lower memory usage and speed up the training process without losing model accuracy.


DataLoader and Batch Processing: Efficiently handles data in batches during training and evaluation, crucial for processing large datasets like this.

In [None]:
import json
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Set device for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and Parse JSON Data
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]
    return data

# Preprocess each row to concatenate cell values
def preprocess_row(row):
    return ' '.join([cell['value'] for cell in row['values']])

# Convert dataset into a structured format
def prepare_data(data):
    rows, labels = [], []
    for doc in data:
        for row in doc:
            text = preprocess_row(row)
            rows.append(text)
            labels.append(1 if row.get('type') == 'HEADERS' else 0)
    return rows, labels

# Split data into train and test with a smaller test size
def split_data(rows, labels):
    return train_test_split(rows, labels, test_size=0.05, random_state=42)

# Custom Dataset for Transformer Models
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Faster Transformer Model Training with MiniLM and Mixed Precision
def train_transformer_model(train_rows, train_labels, test_rows, test_labels):
    tokenizer = AutoTokenizer.from_pretrained("microsoft/MiniLM-L12-H384-uncased")
    model = AutoModelForSequenceClassification.from_pretrained("microsoft/MiniLM-L12-H384-uncased", num_labels=2)
    model = model.to(device)

    # Mixed precision for faster training
    scaler = torch.cuda.amp.GradScaler()

    # Dataset and DataLoader
    train_dataset = TextDataset(train_rows, train_labels, tokenizer)
    test_dataset = TextDataset(test_rows, test_labels, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Adjust batch size based on GPU memory
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    # Training Loop
    model.train()
    epochs = 3
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        for batch in tqdm(train_loader, desc="Training"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Mixed precision forward pass
            with torch.cuda.amp.autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()  # Scales loss for mixed precision
            scaler.step(optimizer)  # Optimizer step
            scaler.update()  # Update scaler for mixed precision

        print(f"Loss: {loss.item()}")

    # Evaluation
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print("MiniLM Classification Results:")
    print(classification_report(true_labels, predictions))

# Load Data
file_path = '/content/drive/MyDrive/Dataset/document-standardization-training-dataset.txt'
data = load_data(file_path)
rows, labels = prepare_data(data)
train_rows, test_rows, train_labels, test_labels = split_data(rows, labels)

# Run MiniLM model with mixed precision for faster results
train_transformer_model(train_rows, train_labels, test_rows, test_labels)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/MiniLM-L12-H384-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler()


Epoch 1/3


  with torch.cuda.amp.autocast():
Training: 100%|██████████| 149401/149401 [2:10:17<00:00, 19.11it/s]


Loss: 0.0006440639263018966
Epoch 2/3


Training: 100%|██████████| 149401/149401 [2:11:06<00:00, 18.99it/s]


Loss: 0.0003695249615702778
Epoch 3/3


Training: 100%|██████████| 149401/149401 [2:11:33<00:00, 18.93it/s]


Loss: 0.0018455982208251953


Evaluating: 100%|██████████| 7864/7864 [04:36<00:00, 28.42it/s]


MiniLM Classification Results:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    122125
           1       0.86      0.88      0.87      3687

    accuracy                           0.99    125812
   macro avg       0.93      0.94      0.93    125812
weighted avg       0.99      0.99      0.99    125812

