In [1]:
from google.cloud import bigquery
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


In [2]:
# read the (original) data from the bigquery
def read_bq(project_id, dataset_id, table_id):

    query = f"""
        SELECT *
        FROM {project_id}.{dataset_id}.{table_id}
    """

    query_job = bigquery_client.query(query)

    # Convert the result into a Pandas DataFrame
    df = query_job.to_dataframe()

    return df

In [3]:
PROJECT_ID, DATASET_ID, TABLE_ID = "intern-project-415606", "Criminal_Dataset", "criminal_data_self"
bigquery_client = bigquery.Client(project=PROJECT_ID)

In [4]:
dataset = read_bq(PROJECT_ID, DATASET_ID, TABLE_ID)
dataset = dataset.sort_values(by='extract_id')
dataset.head()

Unnamed: 0,extract_id,text,ner_underthesea,tag_underthesea,self_label
469932,0,:,O,CH,other
469936,0,;,O,CH,other
469935,0,:,O,CH,other
469934,0,;,O,CH,other
469933,0,:,O,CH,other


## Import Libraries

In [5]:
%pip install conlleval
%pip install datasets

Collecting conlleval
  Downloading conlleval-0.2-py3-none-any.whl (5.4 kB)
Installing collected packages: conlleval
Successfully installed conlleval-0.2
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.

In [6]:
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import os
import keras
import numpy as np
import tensorflow as tf
from keras import layers
from collections import Counter
import pickle

## Transformer block layer

In [13]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.ffn = keras.Sequential(
            [
                keras.layers.Dense(ff_dim, activation="relu"),
                keras.layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.att(inputs, inputs)
        print('attn_output1:', attn_output)
        attn_output = self.dropout1(attn_output, training=training)
        print('attn_output2:', attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        print('out1:', out1)
        ffn_output = self.ffn(out1)
        print('ffn_output:', ffn_output)
        ffn_output = self.dropout2(ffn_output, training=training)
        print('ffn_output:', ffn_output)
        return self.layernorm2(out1 + ffn_output)


## Token and Position Embedding Layer

In [8]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, inputs):
        maxlen = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        position_embeddings = self.pos_emb(positions)
        token_embeddings = self.token_emb(inputs)
        return token_embeddings + position_embeddings

## NER model class

In [9]:
class NERModel(keras.Model):
    def __init__(
        self, num_tags, vocab_size, maxlen=3136, embed_dim=32, num_heads=2, ff_dim=32
    ):
        super().__init__()
        self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
        self.dropout1 = layers.Dropout(0.1)
        self.ff = layers.Dense(ff_dim, activation="relu")
        self.dropout2 = layers.Dropout(0.1)
        self.ff_final = layers.Dense(num_tags, activation="softmax")

    def call(self, inputs, training=False):
        x = self.embedding_layer(inputs)
        x = self.transformer_block(x)
        x = self.dropout1(x, training=training)
        x = self.ff(x)
        x = self.dropout2(x, training=training)
        x = self.ff_final(x)
        return x

## Data preprocessing

In [10]:
dataset

Unnamed: 0,extract_id,text,ner_underthesea,tag_underthesea,self_label
469932,0,:,O,CH,other
469936,0,;,O,CH,other
469935,0,:,O,CH,other
469934,0,;,O,CH,other
469933,0,:,O,CH,other
...,...,...,...,...,...
159789,2440,quốc tịch,O,N,other
159788,2440,Lớp 9/12,O,N,other
159787,2440,văn hóa,O,N,other
521781,2440,:,O,CH,other


In [11]:
# eliminate all the CH (punctuation)
dataset = dataset[dataset['tag_underthesea'] != 'CH']
# Define the mapping
label_map = {"N": 0, "M": 1, "other": 2}

# Replace numeric labels with NER labels
dataset['self_label'] = dataset['self_label'].map(label_map)

dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['self_label'] = dataset['self_label'].map(label_map)


Unnamed: 0,extract_id,text,ner_underthesea,tag_underthesea,self_label
5777,0,tự do,O,A,2
111148,0,thôn,B-LOC,N,2
111147,0,9/12 Nơi,O,N,2
111146,0,văn hóa,O,N,2
111145,0,Trình độ,O,N,2
...,...,...,...,...,...
159790,2440,dân tộc,O,N,2
159789,2440,quốc tịch,O,N,2
159788,2440,Lớp 9/12,O,N,2
159787,2440,văn hóa,O,N,2


In [12]:
dataset_group = {'id':[], 'token':[], 'ner_tag':[], 'pos_tag':[], 'self_tag':[]}

for index, row in dataset.iterrows():
    extract_id = row['extract_id']
    if extract_id in dataset_group['id']:
        group_index = dataset_group['id'].index(extract_id)
        dataset_group['token'][group_index].append(row['text'])
        dataset_group['ner_tag'][group_index].append(row['ner_underthesea'])
        dataset_group['pos_tag'][group_index].append(row['tag_underthesea'])
        dataset_group['self_tag'][group_index].append(row['self_label'])
    else:
        dataset_group['id'].append(extract_id)
        dataset_group['token'].append([row['text']])
        dataset_group['ner_tag'].append([row['ner_underthesea']])
        dataset_group['pos_tag'].append([row['tag_underthesea']])
        dataset_group['self_tag'].append([row['self_label']])

for group in dataset_group['id']:
    group_index = dataset_group['id'].index(group)
    print('Group ID:', group)
    print('Token:', dataset_group['token'][group_index])
    print('NER Tag:', dataset_group['ner_tag'][group_index])
    print('POS Tag:', dataset_group['pos_tag'][group_index])
    print('Self Tag:', dataset_group['self_tag'][group_index])
    print()
    break

Group ID: 0
Token: ['tự do', 'thôn', '9/12 Nơi', 'văn hóa', 'Trình độ', 'Thiên chúa giáo', 'Tôn giáo', 'Dân tộc', 'phiên', 'bà', 'Con', 'Lao động', 'Nghề nghiệp', 'tỉnh', 'huyện', 'ngày', 'xã', 'Giới tính', '25/01/1994', 'Nguyễn Văn C', 'Họ', 'Không', 'tên', '12', 'tại', 'cư trú', 'Tiền án', 'ông', 'Có mặt', 'họp', 'tiền sự', 'Nguyễn Thị Hà', 'Nam Sinh', 'Kinh', 'T', 'và', 'N', 'Nguyễn Văn Trung', 'A', 'con']
NER Tag: ['O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'B-PER', 'B-PER', 'I-LOC', 'O', 'I-LOC', 'B-PER', 'I-LOC', 'O']
POS Tag: ['A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Np', 'P', 'R', 'N', 'M', 'E', 'V', 'V', 'Nc', 'V', 'V', 'V', 'Np', 'Np', 'Np', 'Np', 'C', 'Np', 'Np', 'Np', 'Nc']
Self Tag: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2

In [13]:
from sklearn.model_selection import train_test_split

from datasets import Dataset, DatasetDict

# Assuming dataset_group is already constructed as in your previous code

# Convert dataset_group to a DataFrame
import pandas as pd
dataset_df = pd.DataFrame(dataset_group)

# Split data into train and test sets
train_df, test_df = train_test_split(dataset_df, test_size=0.2, random_state=42)

# Create train and test datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Construct DatasetDict
dataset_dict = DatasetDict({'train': train_dataset, 'test': test_dataset})

# Print DatasetDict
print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['id', 'token', 'ner_tag', 'pos_tag', 'self_tag', '__index_level_0__'],
        num_rows: 1952
    })
    test: Dataset({
        features: ['id', 'token', 'ner_tag', 'pos_tag', 'self_tag', '__index_level_0__'],
        num_rows: 489
    })
})


In [14]:
print(type(dataset_dict['train']['self_tag'][0][0]))

<class 'int'>


## Turn into tensorflow acceptable **datatype**

In [15]:
def export_to_file(export_file_path, data):
    with open(export_file_path, "w") as f:
        for record in data:
            ner_tags = record["self_tag"]
            tokens = record["token"]
            if len(tokens) > 0:
                f.write(
                    str(len(tokens))
                    + "\t"
                    + "\t".join(tokens)
                    + "\t"
                    + "\t".join(map(str, ner_tags))
                    + "\n"
                )
try:
    os.mkdir('./data')
except:
    print('directory exist')
export_to_file("./data/crime_train.txt", dataset_dict['train'])
export_to_file("./data/crime_val.txt", dataset_dict['test'])

directory exist


In [16]:
def make_tag_lookup_table():
    ner_labels = ["[PAD]", "N", "M", "other"]
    return dict(zip(range(len(ner_labels)), ner_labels))

mapping = make_tag_lookup_table()
print(mapping)

{0: '[PAD]', 1: 'N', 2: 'M', 3: 'other'}


In [17]:
all_tokens = sum(dataset_dict["train"]["token"], [])
all_tokens_array = np.array(all_tokens)

counter = Counter(all_tokens_array)
print(len(counter))

num_tags = len(mapping)
vocab_size = 25000

# We only take (vocab_size - 2) most common words from the training data since
# the `StringLookup` class uses 2 additional tokens - one denoting an unknown
# token and another one denoting a masking token
vocabulary = [token for token, count in counter.most_common(vocab_size - 2)]

# Save the list to a file
with open('./data/vocabulary.pkl', 'wb') as f:
    pickle.dump(vocabulary, f)


# The StringLookup class will convert tokens to token IDs
lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary)


22093


In [18]:
train_data = tf.data.TextLineDataset("./data/crime_train.txt")
val_data = tf.data.TextLineDataset("./data/crime_val.txt")

In [19]:
print(list(train_data.take(1).as_numpy_iterator()))

[b'135\t08/10/2020\tch\xe1\xba\xa5p h\xc3\xa0nh\tn\xe1\xbb\x99p\tph\xe1\xba\xa1t\tb\xe1\xbb\x8b\tb\xe1\xba\xaft\tgi\xe1\xbb\xaf\tthay \xc4\x91\xe1\xbb\x95i\tc\xc3\xb3 m\xe1\xba\xb7t\tc\xe1\xba\xa5m\t\xc4\x91i\tkh\xe1\xbb\x8fi\tc\xc6\xb0 tr\xc3\xba\t\xc4\x91\xc3\xa1nh b\xe1\xba\xa1c\tx\xc3\xa3\thuy\xe1\xbb\x87n\tt\xe1\xbb\x89nh\tgi\xe1\xbb\xaf\tph\xe1\xba\xa1t\tC\xc3\xb3\tvi ph\xe1\xba\xa1m\tt\xc3\xaan\tt\xc3\xaan\tng\xc3\xa0y\t02/3/1971\tg\xe1\xbb\x8di\tsinh\tc\xc6\xb0 tr\xc3\xba\tch\xe1\xba\xbft\tph\xe1\xba\xa1t\ttr\xc3\xba\tC\xc3\xb3\tsinh\tsinh\tti\xe1\xbb\x81n \xc3\xa1n\tn\xc6\xa1i\tngh\xe1\xbb\x81 nghi\xe1\xbb\x87p\tti\xe1\xbb\x81n s\xe1\xbb\xb1\tb\xe1\xbb\x8b\tsinh\tLao \xc4\x91\xe1\xbb\x99ng\tQu\xe1\xbb\x91c t\xe1\xbb\x8bch\th\xe1\xbb\x8dc v\xe1\xba\xa5n\tO x\xe1\xbb\xad ph\xe1\xba\xa1t\th\xc3\xa0nh ch\xc3\xadnh\th\xc3\xacnh th\xe1\xbb\xa9c\tti\xe1\xbb\x81n\tm\xe1\xbb\xa9c\th\xc3\xa0nh vi\tB\xe1\xbb\x8b c\xc3\xa1o\tng\xc3\xa0y\tng\xc3\xa0y\tbi\xe1\xbb\x87n ph\xc3\xa1p\tbi\xe1\xb

In [20]:
def map_record_to_training_data(record):
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    tags += 1
    return tokens, tags


def convert_to_ids(tokens):
    return lookup_layer(tokens)

# We use `padded_batch` here because each record in the dataset has a
# different length.
batch_size = 16

train_dataset = (
    train_data.map(map_record_to_training_data)
    .map(lambda x, y: (convert_to_ids(x), y))
    .padded_batch(batch_size)
)
val_dataset = (
    val_data.map(map_record_to_training_data)
    .map(lambda x, y: (convert_to_ids(x), y))
    .padded_batch(batch_size)
)

ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64)

In [21]:
train_dataset

<_PaddedBatchDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorSpec(shape=(None, None), dtype=tf.int64, name=None))>

In [35]:
for a, b in train_dataset:
    print(a)
    print(len(b[0]))
    break

tf.Tensor(
[[1756  102  249 ...    0    0    0]
 [   7    7    7 ...    0    0    0]
 [ 859  125  682 ...    0    0    0]
 ...
 [ 145   12    7 ...    0    0    0]
 [  73   70   44 ...    0    0    0]
 [ 160  101  259 ...    0    0    0]], shape=(16, 687), dtype=int64)
687


## custom loss function

In [23]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, reduction=keras.losses.Reduction.AUTO, name="custom_ner_loss"):
        super().__init__(reduction=reduction, name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=False, reduction=self.reduction  # Pass the reduction argument here
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)

loss = CustomNonPaddingTokenLoss()

## Compile and fit the model

In [25]:
from keras.losses import SparseCategoricalCrossentropy

# Compile the model with a specified loss function and optimizer
ner_model.compile(optimizer="adam", loss=SparseCategoricalCrossentropy(reduction='auto'))
#ner_model.compile(optimizer="adam", loss=loss)
ner_model.fit(train_dataset, epochs=10)

def tokenize_and_convert_to_ids(text):
    tokens = text.split()
    return lowercase_and_convert_to_ids(tokens)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
ner_model.save("./data/ner_model")

In [37]:
def prediction_and_casting(dataset):
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        if len(x[0]) > 3135:
            continue
        output = ner_model.predict(x, verbose=0)
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])

        true_tag_ids = np.reshape(y, [-1])

        mask = (true_tag_ids > 0) & (predictions > 0)
        true_tag_ids = true_tag_ids[mask]
        predicted_tag_ids = predictions[mask]

        all_true_tag_ids.append(true_tag_ids)
        all_predicted_tag_ids.append(predicted_tag_ids)

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    print(len(predicted_tags), len(predicted_tags))
    return predicted_tags, real_tags
    #evaluate(real_tags, predicted_tags)


y_pred, y_val = prediction_and_casting(val_dataset)

109252 109252


In [38]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate

def calculate_metrics(y_true, y_pred, labels):
    # Mapping labels to numeric indices
    label_to_index = {label: idx for idx, label in enumerate(labels)}
    index_to_label = {idx: label for label, idx in label_to_index.items()}

    y_true_mapped = [label_to_index[label] for label in y_true]
    y_pred_mapped = [label_to_index[label] for label in y_pred]

    # Overall metrics
    accuracy = accuracy_score(y_true_mapped, y_pred_mapped)
    precision = precision_score(y_true_mapped, y_pred_mapped, average='weighted')
    recall = recall_score(y_true_mapped, y_pred_mapped, average='weighted', zero_division=0)  # Set zero_division to 0
    f1 = f1_score(y_true_mapped, y_pred_mapped, average='weighted')

    # Per-label metrics
    per_label_accuracy = {}
    per_label_precision = {}
    per_label_recall = {}
    per_label_f1 = {}

    for label in labels:
        label_index = label_to_index[label]
        label_indices = [i for i, true_label in enumerate(y_true) if true_label == label]
        if len(label_indices) > 0:
            per_label_accuracy[label] = accuracy_score([y_true_mapped[i] for i in label_indices], [y_pred_mapped[i] for i in label_indices])
            per_label_precision[label] = precision_score([y_true_mapped[i] for i in label_indices], [y_pred_mapped[i] for i in label_indices], average='weighted')
            per_label_recall[label] = recall_score([y_true_mapped[i] for i in label_indices], [y_pred_mapped[i] for i in label_indices], average='weighted', zero_division=0)  # Set zero_division to 0
            per_label_f1[label] = f1_score([y_true_mapped[i] for i in label_indices], [y_pred_mapped[i] for i in label_indices], average='weighted')
        else:
            per_label_accuracy[label] = 0
            per_label_precision[label] = 0
            per_label_recall[label] = 0
            per_label_f1[label] = 0

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'per_label_accuracy': per_label_accuracy,
        'per_label_precision': per_label_precision,
        'per_label_recall': per_label_recall,
        'per_label_f1': per_label_f1,
        'index_to_label': index_to_label
    }

#labels
labels = ['M', 'N', 'other']

metrics = calculate_metrics(y_val, y_pred, labels)

# Constructing table
table_data = []
for label in labels:
    table_data.append([label,
                       metrics['per_label_accuracy'][label],
                       metrics['per_label_precision'][label],
                       metrics['per_label_recall'][label],
                       metrics['per_label_f1'][label]])
overall_accuracy = calculate_metrics(y_val, y_pred, labels)
print("Overall Accuracy:".ljust(20), overall_accuracy['accuracy'])
print("Overall Precision:".ljust(20), overall_accuracy['precision'])
print("Overall Recall:".ljust(20), overall_accuracy['recall'])
print("Overall F1-score:".ljust(20), overall_accuracy['f1'])
print(tabulate(table_data, headers=["Label", "Accuracy", "Precision", "Recall", "F1-score"], tablefmt="grid"))


Overall Accuracy:    0.9819682934866181
Overall Precision:   0.9818736961365914
Overall Recall:      0.9819682934866181
Overall F1-score:    0.981840185697705
+---------+------------+-------------+----------+------------+
| Label   |   Accuracy |   Precision |   Recall |   F1-score |
| M       |   0.887909 |           1 | 0.887909 |   0.940627 |
+---------+------------+-------------+----------+------------+
| N       |   0.320872 |           1 | 0.320872 |   0.485849 |
+---------+------------+-------------+----------+------------+
| other   |   0.992874 |           1 | 0.992874 |   0.996424 |
+---------+------------+-------------+----------+------------+
