In [21]:
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from flask import Flask, request, jsonify


In [30]:
def load_data(filepath):
    try:
        with open(filepath, 'r') as file:
            data = pd.read_json(file)
        print("Data loaded successfully. Columns available: ", data.columns)
        return data
    except Exception as e:
        print(f"Failed to load data: {e}")

data = load_data('idmanual.json')
if data is not None:
    print(data.head())  # Display first few rows of the DataFrame to inspect it
else:
    print("Data is None. Check file path and contents.")


Data loaded successfully. Columns available:  Index(['id_tx', 'class_id', 'description', 'status'], dtype='object')
      id_tx class_id                                        description status
0  009-4140      009  Bank note acceptors for separating good bank n...      A
1  009-4136      009                                Fingerprint imagers      A
2  009-4133      009          Laboratory swabs [laboratory instruments]      A
3  009-4131      009                               Ear plugs for divers      A
4  009-4130      009                                      DVD recorders      A


In [31]:
def preprocess_data(data):
    try:
        label_encoder = LabelEncoder()
        data['encoded_class'] = label_encoder.fit_transform(data['class_id'])
        print("Preprocessing successful.")
        return data, label_encoder
    except KeyError as e:
        print(f"KeyError during preprocessing: {e}")
    except Exception as e:
        print(f"Unexpected error during preprocessing: {e}")

processed_data, label_encoder = preprocess_data(data)
if processed_data is not None:
    print(processed_data.head())
else:
    print("Processed data is None. Check data preparation steps.")


Preprocessing successful.
      id_tx class_id                                        description  \
0  009-4140      009  Bank note acceptors for separating good bank n...   
1  009-4136      009                                Fingerprint imagers   
2  009-4133      009          Laboratory swabs [laboratory instruments]   
3  009-4131      009                               Ear plugs for divers   
4  009-4130      009                                      DVD recorders   

  status  encoded_class  
0      A              9  
1      A              9  
2      A              9  
3      A              9  
4      A              9  


In [33]:
def encode_texts(tokenizer, texts, labels):
    try:
        # Convert pandas series to list if not already in list format
        if isinstance(texts, pd.Series):
            texts = texts.tolist()
        if isinstance(labels, pd.Series):
            labels = labels.to_numpy()

        # Tokenize text
        encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

        # Create tensors
        input_ids = torch.tensor(encodings['input_ids'])
        attention_mask = torch.tensor(encodings['attention_mask'])
        labels = torch.tensor(labels, dtype=torch.long)  # Ensure labels are long type for classification

        # Create and return tensor dataset
        dataset = TensorDataset(input_ids, attention_mask, labels)
        return dataset
    except KeyError as e:
        print(f"KeyError in encode_texts: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error in encode_texts: {e}")
        return None


In [34]:
train_dataset = encode_texts(tokenizer, train_texts, train_labels)
val_dataset = encode_texts(tokenizer, val_texts, val_labels)

if train_dataset is not None and val_dataset is not None:
    print("Datasets created successfully.")
    print("Train dataset size:", len(train_dataset))
    print("Validation dataset size:", len(val_dataset))
else:
    print("Failed to create datasets.")


Datasets created successfully.
Train dataset size: 46961
Validation dataset size: 11741


In [36]:
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    try:
        for batch in data_loader:
            # Unpack batch correctly
            inputs, masks, labels = batch
            inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

            # Perform model training
            model.zero_grad()
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

    except ValueError as e:
        print(f"ValueError in train_epoch: {e}")
        print(f"Batch content: {[x.shape for x in batch]}")  # Debugging line
    except Exception as e:
        print(f"Unexpected error in train_epoch: {e}")

    return total_loss / len(data_loader)


In [37]:
# Create the datasets again with the debugged function
train_dataset = encode_texts(tokenizer, train_texts, train_labels)
val_dataset = encode_texts(tokenizer, val_texts, val_labels)

# Create DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Check DataLoader output
for batch in train_loader:
    try:
        inputs, masks, labels = batch
        print("Batch unpacked successfully.")
    except ValueError as e:
        print(f"Error unpacking batch: {e}")
        print(f"Batch content: {[x.shape for x in batch]}")
        break  # Stop after the first error to avoid flooding output


Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked successfully.
Batch unpacked

In [None]:
def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in data_loader:
        inputs, masks, labels = batch
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(data_loader)
    return average_loss

# Training the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

epoch_count = 3
for epoch in range(epoch_count):
    train_loss = train_epoch(model, train_loader, optimizer, device)
    print(f'Epoch {epoch + 1}, Training Loss: {train_loss}')


In [None]:
def validate_epoch(model, data_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            inputs, masks, labels = batch
            inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

    average_loss = total_loss / len(data_loader)
    return average_loss

val_loss = validate_epoch(model, val_loader, device)
print(f'Validation Loss: {val_loss}')
