# Working Paper: Who is Covered by Unemployment Insurance in China?
# Complementary Code File 3 for Appendix 2
# Neural Networks - Transformers (Chinese MacBERT-large)

This file complements Table A4.2 and was created for execution in Google Colab with GPU support. The test metrics shown here differ slightly from those reported in the paper because the model was re-fit for publication.

## 1. Packages

Ensure you have the required packages installed:

In [1]:
# install packages
!pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
# packages
import os
import pandas as pd
import numpy as np
from sklearn.utils import resample

import torch
import transformers
from transformers import AdamW
from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

## 2. Load and prepare the data

In [3]:
# Mount Google Drive at '/content/drive' so the data files stored there can be read (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Y - UI
# Labels for the training and test sentences, read from the 'coverage_broad' column
y_train = pd.read_csv('/content/drive/My Drive/data_ui/y_broad_train.csv')['coverage_broad']
y_test = pd.read_csv('/content/drive/My Drive/data_ui/y_broad_test.csv')['coverage_broad']
# Display the class counts of both splits
y_train.value_counts(), y_test.value_counts()

(coverage_broad
 0    4184
 1     299
 Name: count, dtype: int64,
 coverage_broad
 0    2790
 1     199
 Name: count, dtype: int64)

In [8]:
# X - UI
# Sentence data for the training and test splits; the first CSV row is used as the header
x_train = pd.read_csv('/content/drive/My Drive/data_ui/X_sen_train.csv', header=0)
x_test = pd.read_csv('/content/drive/My Drive/data_ui/X_sen_test.csv', header=0)
# Display (rows, columns) of both frames
x_train.shape, x_test.shape

((4483, 1), (2989, 1))

In [9]:
# Preview the first rows of the test sentences
x_test.head()

Unnamed: 0,sentences
0,购买培训服务和评价服务按政府采购规定实施
1,１６．第２０行指社会保险机构违反国家有关规定向种类经济活动担保，或以基金管理单位的资产、...
2,用人单位提供资料不完整的，失业保险经办机构应当一次性书面告知申请单位需要补齐的全部资料；根据...
3,自治区人力资源和社会保障厅自治区财政厅自治区国家税务局自治区地方税务局2015年5月25日
4,二、因劳动者本人原因中断就业：按相关规定中断就业和劳动者本人自愿中断就业的情形


Split the training data into a training set and a validation set.

In [10]:
# split off validation data
# 30% of the labelled training data is held out for validation. stratify=y_train
# keeps the class proportions in both parts; random_state fixes the split.
from sklearn.model_selection import train_test_split
x_train2, x_val, y_train2, y_val = train_test_split(x_train, y_train,
                                                    test_size=0.3, stratify=y_train, random_state=42)

Upsample the training set to facilitate model training.

In [None]:
# Rows of the minority class (label 1) and their labels
minority_class_samples = x_train2.loc[y_train2 == 1]
minority_class_labels = y_train2.loc[y_train2 == 1]

# Draw minority rows with replacement until there are as many of them as
# there are majority-class (label 0) rows; random_state fixes the draw.
X_upsampled, y_upsampled = resample(
    minority_class_samples,
    minority_class_labels,
    replace=True,
    n_samples=len(x_train2.loc[y_train2 == 0]),
    random_state=123,
)

# Stack the majority rows on top of the resampled minority rows;
# both results are NumPy arrays (features 2-D, labels 1-D).
X_bal = np.vstack([x_train2.loc[y_train2 == 0], X_upsampled])
y_bal = np.hstack([y_train2.loc[y_train2 == 0], y_upsampled])
X_bal.shape, y_bal.shape

((5858, 1), (5858,))

In [None]:
# Check the dimensions of the validation features (rows, columns)
x_val.shape

(1345, 1)

In [15]:
# Convert the balanced sentence array to a string dtype; the bare name below displays it.
# Note: this rebinds X_bal, so the cell depends on the upsampling cell having run first.
X_bal = X_bal.astype(str)
X_bal

array([['\u3000\u3000四、加强市级统筹工作的组织领导\u3000\u3000各地政府要高度重视市级统筹工作，并把实行市级统筹作为完善社会保障体系建设的重要内容，切实加强领导，组织专门的人员和班子开展工作'],
       ['该项政策执行至2015年底'],
       ['\u3000\u3000十一、将第十八条改为第二十条，并将该条第二款修改为：“按前款规定标准计算的失业保险金，高于或者等于本省一类地区规定的职工最低月工资标准的，按照一类地区最低月工资标准的98%发放；低于或者等于海口市城市居民最低生活保障标准的150%，按照海口市城市居民最低生活保障标准的150％发放'],
       ...,
       ['\u3000第四章\u3000养老保险工作的管理与基金的管理和监督\u3000\u3000第十一条\u3000省人事厅负责研究制定全省机关、事业单位工作人员养老保险的改革方案和政策法规并组织实施，综合管理省级机关、事业单位的社会保险工作；省社保中心为全额拨款的事业单位，具体承办省级机关、事业单位社会保险业务工作，负责养老保险基金的筹集、给付、管理，并指导地县相关保险业务，协同做好退休人员社会化管理服务工作；省级各单位包括养老、失业、工伤在内的社会保险工作由各单位人事（干部）部门承担，并有专职人员负责本单位的基金收缴'],
       ['\u3000\u3000应发丧葬抚恤补助、农民合同制工人生活补助支出＝各市州前三年享受丧葬抚恤补助及农民合同制工人生活补助占全年失业保险金支出的平均比例×当年失业保险金支出'],
       ['\u3000\u3000第十条\u3000企业缴交的失业保险费在成本中列支；国家机关、事业单位及社会团体缴交的失业保险费在行政事业经费或自有资金中列支']],
      dtype='<U1348')

## 3. Load Model and Tokenizer

Other checkpoints noted as options:

- Lightweight: "hfl/albert-chinese-tiny"
- Chinese RoBERTa: "hfl/chinese-roberta-wwm-ext"


In [None]:
# Chinese MacBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_name = "hfl/chinese-macbert-large"  # Hugging Face Hub identifier of the checkpoint
# Tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification model with a two-label output (num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Tokenize the data

Adjust max_length if needed.

In [16]:
import torch
import pandas as pd
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Texts and labels are copied into plain lists, so items are always looked
    up by position. This makes the class safe to use with pandas Series whose
    index is not 0..n-1 (for example after a shuffled train/validation split),
    where ``series[idx]`` would be a label-based lookup.
    """

    def __init__(self, texts, labels, tokenizer, max_length=300):
        """
        Args:
            texts (sequence): Text samples (X); any iterable with one entry per sample.
            labels (sequence): Labels (y), one per text, convertible with ``int()``.
            tokenizer: Hugging Face tokenizer (any callable with the same call signature).
            max_length (int): Maximum token length; shorter texts are padded to it
                and longer ones are truncated.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # list(...) iterates over the values, dropping any pandas index
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return a dict with the keys
        ``"input_ids"``, ``"attention_mask"`` and ``"labels"``."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        # Tokenization: every sample is padded/truncated to max_length so that
        # the default DataLoader collation can stack the tensors.
        encoding = self.tokenizer(
            text, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt"
        )

        return {
            # return_tensors="pt" adds a leading batch dimension of 1; drop it
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }



In [None]:
# Create datasets
# Texts and labels are passed as flat 1-D NumPy arrays so that TextDataset can
# index them by position. All three datasets use the same tokenizer and the
# default max_length of TextDataset.
train_dataset = TextDataset(X_bal.flatten(), np.array(y_bal).flatten(), tokenizer)
val_dataset = TextDataset(np.array(x_val['sentences']).flatten(), np.array(y_val).flatten(), tokenizer)
test_dataset = TextDataset(np.array(x_test['sentences']).flatten(), np.array(y_test).flatten(), tokenizer)

## 5. Create a data loader

Use torch.utils.data.DataLoader to handle batch processing.
Adjust batch size if needed.

In [25]:
# Number of samples per batch, shared by all three loaders
batch_size = 16

# Only the training data is reshuffled; validation and test data are iterated in dataset order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## 6. Model Training

Use AdamW optimizer, cross-entropy loss, and GPU acceleration (if available).

In [27]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function.
# torch.optim.AdamW is used because the AdamW class shipped with transformers is
# deprecated in favour of the PyTorch implementation. eps and weight_decay are set
# explicitly to the defaults of the deprecated class (PyTorch's own defaults are
# 1e-8 and 0.01), so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# The training loop uses the loss returned by the model itself (outputs.loss);
# this criterion object is kept only for reference.
criterion = torch.nn.CrossEntropyLoss()



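Training loop with standard backpropagation and early stopping: training stops once the validation loss has not improved for 3 consecutive epochs, and the weights with the lowest validation loss are saved to `best_model.pth`.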

In [28]:
# Training with early stopping.
# Each epoch: one pass over train_loader with gradient updates, then one pass over
# val_loader without gradients. The state_dict with the lowest average validation
# loss so far is written to "best_model.pth".

from tqdm import tqdm

# Early stopping parameters
patience = 3  # Number of epochs without improvement after which training stops
best_val_loss = np.inf  # Start at infinity so the first epoch always counts as an improvement
patience_counter = 0  # Number of consecutive epochs without improvement

epochs = 10  # Maximum number of epochs; adjust based on dataset size

# Training Loop
for epoch in range(epochs):
    model.train() # set the model to training mode
    total_loss = 0 # running sum of the per-batch training losses

    # create progress bar (wraps the loader and shows progress output)
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    # Loop over the batches yielded by the progress bar
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device) # move the batch to the selected device (CPU/GPU)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad() # reset the gradients from the previous step

        # Forward pass (make predictions; passing labels makes the model return the loss)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass (calculate gradients, update parameters)
        loss.backward()
        optimizer.step()

        progress_bar.set_postfix(loss=loss.item()) # display loss value of current batch

    # Calculate and print average loss for the epoch (mean of the per-batch losses)
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")


    # Validation
    model.eval()  # Set model to eval mode
    val_loss = 0

    with torch.no_grad():  # No need to compute gradients
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    # Mean of the per-batch validation losses
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")



    # Early Stopping Check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0  # Reset counter if loss improves
        torch.save(model.state_dict(), "best_model.pth")  # Save best model (overwrites the previous best)
    else:
        patience_counter += 1  # Increment counter if no improvement
        print(f"Early Stopping Counter: {patience_counter}/{patience}")

    if patience_counter >= patience:
        print("Early stopping triggered. Training stopped.")
        break  # Stop training once the patience limit is reached


Epoch 1: 100%|██████████| 367/367 [15:40<00:00,  2.56s/it, loss=0.0023]


Epoch 1 completed. Average Loss: 0.2393
Epoch 1 Validation Loss: 0.1378


Epoch 2: 100%|██████████| 367/367 [15:43<00:00,  2.57s/it, loss=0.298]


Epoch 2 completed. Average Loss: 0.2827
Epoch 2 Validation Loss: 0.3955
Early Stopping Counter: 1/3


Epoch 3: 100%|██████████| 367/367 [15:42<00:00,  2.57s/it, loss=0.132]


Epoch 3 completed. Average Loss: 0.4461
Epoch 3 Validation Loss: 0.2590
Early Stopping Counter: 2/3


Epoch 4: 100%|██████████| 367/367 [15:42<00:00,  2.57s/it, loss=0.78]


Epoch 4 completed. Average Loss: 0.6322
Epoch 4 Validation Loss: 0.6490
Early Stopping Counter: 3/3
Early stopping triggered. Training stopped.


In [29]:
# Load the best model (the state_dict saved at the epoch with the lowest validation loss).
# weights_only=True restricts unpickling to tensors and plain containers, which is all a
# state_dict contains, and avoids the torch.load FutureWarning. map_location=device makes
# the load work on whichever device is active, e.g. a CPU-only session.
model.load_state_dict(torch.load("best_model.pth", map_location=device, weights_only=True))

  model.load_state_dict(torch.load("best_model.pth"))


<All keys matched successfully>

In [None]:
# Save the weights currently held by `model` (its state_dict) to Google Drive
torch.save(model.state_dict(), "/content/drive/My Drive/UI_MacBERT_len300_10ep.pth")

## 7. Validation

Evaluate the model on the validation set.

In [30]:
# Evaluation: Accuracy & Sensitivity
from sklearn.metrics import accuracy_score

#def evaluate(model, dataloader, device="cuda"):
def evaluate(model, dataloader, device):
    """Compute accuracy and sensitivity (recall of class 1) over a dataloader.

    Args:
        model: Classification model. It is called as
            ``model(input_ids, attention_mask=...)`` and must return an object
            with a ``logits`` attribute of shape (batch, num_classes).
        dataloader: Iterable of batches; each batch is a dict with the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        device: Device (or device string) the batch tensors are moved to.

    Returns:
        tuple[float, float]: ``(accuracy, sensitivity)`` as plain Python floats.
        Accuracy is 0.0 if the dataloader yields no samples; sensitivity is 0.0
        if there are no positive (label 1) samples.
    """
    model.eval()  # Set model to evaluation mode
    total_correct = 0
    total_samples = 0
    true_positives = 0   # predicted 1, label 1
    false_negatives = 0  # predicted 0, label 1

    with torch.no_grad():  # Disable gradient computation
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions = torch.argmax(logits, dim=1)  # Get predicted class

            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)

            # Count on the tensors directly; .item() yields plain Python ints
            is_positive = labels == 1
            true_positives += (is_positive & (predictions == 1)).sum().item()
            false_negatives += (is_positive & (predictions == 0)).sum().item()

    # Guard both ratios against an empty denominator
    accuracy = total_correct / total_samples if total_samples > 0 else 0.0
    actual_positives = true_positives + false_negatives
    sensitivity = true_positives / actual_positives if actual_positives > 0 else 0.0

    return accuracy, sensitivity


In [31]:
# Accuracy and sensitivity of the current model on the validation loader
val_accuracy, val_sensitivity = evaluate(model, val_loader, device)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Sensitivity (Recall for Positive Class): {val_sensitivity:.4f}")

Validation Accuracy: 0.9584
Validation Sensitivity (Recall for Positive Class): 0.8222


## 8. Test the Model

After training, we use the test set for final evaluation.

In [32]:
# Accuracy and sensitivity of the current model on the test loader
test_accuracy, test_sensitivity = evaluate(model, test_loader, device)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Sensitivity (Recall for Positive Class): {test_sensitivity:.4f}")


Test Accuracy: 0.9485
Test Sensitivity (Recall for Positive Class): 0.8040


## 9. Make Predictions on New Text

After training, you can use the model to classify new sentences.

In [33]:
# Load the unlabelled data that the model will classify (first CSV row is the header)
x_unlabelled = pd.read_csv('/content/drive/My Drive/data_ui/X_sen_unlabelled.csv', header=0)

In [None]:
# define a prediction function
def classify_text_batch_with_manual_batching(texts, model, tokenizer, device, batch_size=32, max_length=300):
    """Classify texts in fixed-size batches and return one string label per text.

    Args:
        texts: Sliceable sequence of texts; every entry is converted with ``str()``.
        model: Classification model. It is called as
            ``model(input_ids, attention_mask=...)`` and must return an object
            with a ``logits`` attribute.
        tokenizer: Tokenizer called on a list of strings; must return
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        device: Device the encoded tensors are moved to.
        batch_size (int): Number of texts processed per forward pass.
        max_length (int): Maximum token length; longer texts are truncated.

    Returns:
        list[str]: ``"Positive"`` where the predicted class is 1, otherwise
        ``"Negative"``, in the order of ``texts``.
    """
    model.eval()  # evaluation mode
    class_ids = []  # predicted class index for every text, in input order

    for start in range(0, len(texts), batch_size):
        # Slice out the next chunk and make sure every entry is a string
        chunk = list(map(str, texts[start : start + batch_size]))

        # padding=True pads only up to the longest text in this chunk
        encoded = tokenizer(
            chunk,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        ids_on_device = encoded["input_ids"].to(device)
        mask_on_device = encoded["attention_mask"].to(device)

        with torch.no_grad():
            batch_logits = model(ids_on_device, attention_mask=mask_on_device).logits
            batch_classes = torch.argmax(batch_logits, dim=1).cpu().numpy()

        class_ids.extend(batch_classes)

    # Translate class indices into string labels
    labels = []
    for class_id in class_ids:
        labels.append("Positive" if class_id == 1 else "Negative")
    return labels

In [None]:
# Sentences to classify, taken from the 'sentences' column of x_unlabelled as a plain list
sentences = x_unlabelled['sentences'].tolist()

In [None]:
# Make predictions: one "Positive"/"Negative" label per sentence, with the same
# max_length (300) that is the default of TextDataset
predicted_labels = classify_text_batch_with_manual_batching(sentences, model, tokenizer, device, max_length=300)

In [None]:
# Add the predictions to the data as a new column (this modifies x_unlabelled in place)
x_unlabelled['predicted_label'] = predicted_labels

In [None]:
# Check the predictions: number of sentences per predicted label
x_unlabelled['predicted_label'].value_counts()

Unnamed: 0_level_0,count
predicted_label,Unnamed: 1_level_1
Negative,43975
Positive,4171


In [None]:
# Save the data, including the predicted labels, to Google Drive; the row index is not written
x_unlabelled.to_csv('/content/drive/My Drive/data_ui/X_sen_unlabelled_predicted.csv', index=False)