# This file is modified from the original source code at https://colab.research.google.com/drive/1CWamaQH1Lgd7mSZ0UZ4jx2AUMAGDpsfq?usp=sharing

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Directory that holds the input data files.
root = ' '  # TODO: replace this placeholder with the path to your data

In [None]:
# Paths of the two input files expected inside `root`.
# NOTE(review): path_ne_csv is not referenced anywhere else in the visible notebook.
path_ne_csv = os.path.join(root,'ne_sample_submission.csv')
# Raw test text; it is read and split on newlines further down.
path_ne_txt = os.path.join(root,'ne_test.txt')

In [None]:
# Load the test file and turn it into a list with one entry per line.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes the file is UTF-8 — TODO confirm).
with open(path_ne_txt, encoding='utf-8') as f:
    contents = f.read()
words = contents.split("\n")
wordss = words[:69561]   # keep the first 69561 entries (drops the 2 trailing empty ones)

wordsss = np.array(wordss)

# Replace empty entries with a single space so that no entry is an empty string.
word_list = [' ' if word == '' else word for word in wordsss]

# NER for Thai

## load dataset

### import pre-trained model

In [None]:
!pip -q install torch==1.5.0 torchtext==0.4.0 torchvision==0.6.0
!pip -q install transformers==3.5.0 thai2transformers==0.1.2

In [None]:
! pip install -q datasets transformers[sentencepiece] simpletransformers

Load your dataset, such as LST20.

In [None]:
from datasets import load_dataset
lst20 = load_dataset("lst20", data_dir="LST20_Corpus")

In [None]:
lst20

## import data

In [None]:
import pandas as pd
# Materialise each LST20 split as its own pandas DataFrame.
train_df, validation_df, test_df = (
    pd.DataFrame(lst20[split_name])
    for split_name in ('train', 'validation', 'test')
)

In [None]:
train_df.head()

In [None]:
validation_df.head()

In [None]:
test_df.head()

In [None]:
# Tag vocabularies for LST20. A tag's position in its list is the integer id
# used to look it up (e.g. _NER_TAGS[x] for each x in a row's "ner_tags").
# See tags from https://github.com/huggingface/datasets/blob/master/datasets/lst20/lst20.py
_POS_TAGS = "NN VV PU CC PS AX AV FX NU AJ CL PR NG PA XX IJ".split()

# "O" comes first, followed by every entity type with the B_ prefix, then all
# of them with I_, then all of them with E_ (31 tags in total).
_NER_TAGS = ["O"] + [
    f"{prefix}_{entity}"
    for prefix in ("B", "I", "E")
    for entity in ("BRN", "DES", "DTM", "LOC", "MEA", "NUM", "ORG", "PER", "TRM", "TTL")
]

# Clause tags follow the same layout with the single type CLS.
_CLAUSE_TAGS = ["O"] + [f"{prefix}_CLS" for prefix in ("B", "I", "E")]

In [None]:
list(map(lambda x: _NER_TAGS[x], train_df["ner_tags"][0]))

## preprocessing

In [None]:
def convert_to_simple_transformer_format(df, field_name, tags, max_rows=1000):
  """Flatten sentence-level rows into a DataFrame with one row per token.

  Args:
    df: DataFrame with a 'tokens' column (a sequence of tokens per row) and a
      `field_name` column holding one integer tag id per token.
    field_name: name of the column that holds the tag ids, e.g. "ner_tags".
    tags: sequence mapping a tag id to its label string.
    max_rows: number of leading rows of `df` to convert. Defaults to 1000,
      the limit that used to be hard-coded for speed; pass None to convert
      every row.

  Returns:
    DataFrame with the columns "sentence_id" (index label of the source
    row), "words" and "labels", one row per token.

  Raises:
    IndexError: if a row has fewer tag ids than tokens, or a tag id is
      outside the range of `tags`.
  """
  sentence_id = []
  words = []
  labels = []

  for idx, row in df[:max_rows].iterrows():
    tag_ids = row[field_name]
    for position, token in enumerate(row['tokens']):
      sentence_id.append(idx)
      words.append(token)
      # Translate the integer tag id of this token into its label string.
      labels.append(tags[tag_ids[position]])

  return pd.DataFrame(
      {"sentence_id": sentence_id, "words": words, "labels": labels}
  )


In [None]:
train_ = convert_to_simple_transformer_format(train_df, "ner_tags", _NER_TAGS)
train_.head()

In [None]:
validation_ = convert_to_simple_transformer_format(validation_df, "ner_tags", _NER_TAGS)
validation_.head()

In [None]:
test_ = convert_to_simple_transformer_format(test_df, "ner_tags", _NER_TAGS)
test_.head()

## padded_batch

In [None]:
train_[['words','labels'][:]]

In [None]:
# Pair every word with its label as a [word, label] list.
# The two columns are iterated directly: Series.iteritems() was deprecated in
# pandas 1.5 and removed in pandas 2.0, so calling it fails on current pandas.
train_list = [
    [word, label]
    for word, label in zip(train_['words'], train_['labels'])
]

In [None]:
train_list

In [None]:
train_list = tf.data.Dataset.from_tensor_slices(train_list)

In [None]:
# Number of dataset elements grouped into each padded batch.
batch_size = 16

# Batch the [word, label] dataset built above.
train_dataset = train_list.padded_batch(batch_size)

## build model

## airesearch/wangchanberta-base-att-spm-uncased

In [None]:
import torch
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs
from transformers import AutoTokenizer, AutoModelForMaskedLM
#transformers
from transformers import (
    CamembertTokenizer,
    AutoTokenizer,
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)


#thai2transformers
import thai2transformers
from thai2transformers.preprocess import process_transformers
from thai2transformers.metrics import (
    classification_metrics, 
    multilabel_classification_metrics,
)
from thai2transformers.tokenizers import (
    ThaiRobertaTokenizer,
    ThaiWordsNewmmTokenizer,
    ThaiWordsSyllableTokenizer,
    FakeSefrCutTokenizer,
    SEFR_SPLIT_TOKEN
)



# Configure the model
# Training options are collected on a simpletransformers NERArgs object.
ner_args = NERArgs()
ner_args.train_batch_size = 16
# presumably runs evaluation on the eval_data passed to train_model while training — confirm against the simpletransformers docs
ner_args.evaluate_during_training = True
ner_args.overwrite_output_dir = True
ner_args.num_train_epochs = 5

# NOTE(review): this tokenizer is not referenced again in the visible notebook.
tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")

# Build the NER model from the WangchanBERTa checkpoint with _NER_TAGS as its
# label set; CUDA is used only when torch reports it as available.
model = NERModel(
    "camembert", "airesearch/wangchanberta-base-att-spm-uncased", 
    # "bert", "monsoon-nlp/bert-base-thai", 
    args = ner_args, 
    use_cuda = torch.cuda.is_available(), 
    labels = _NER_TAGS
    # model_args.lazy_loading = True

)

# Train the model
# train_ is the training frame and validation_ is passed as the evaluation data.
model.train_model(train_, eval_data=validation_, verbose=True)

## eval test

In [None]:
# The simpletransformers NERModel wrapper has no save_pretrained() of its own;
# that method lives on the wrapped Hugging Face model and tokenizer, so both
# are saved to the same directory.
save_dir = '/content/drive/MyDrive/Colab Notebooks/Model'
model.model.save_pretrained(save_dir)
model.tokenizer.save_pretrained(save_dir)

In [None]:
# Evaluate the model
result, model_outputs, preds_list = model.eval_model(test_)

In [None]:
result

In [None]:
test__ = list(test_['words'])
test__

In [None]:
predictions, raw_outputs = model.predict(test__)

In [None]:
predictions

In [None]:
for i, s in enumerate(predictions):
  print(i,s)

In [None]:
# Run the model on the entries of word_list loaded from the test file.
# model.predict() returns two values (predictions, raw_outputs), exactly as it
# is unpacked for test__ earlier; unpacking into three names raised ValueError
# and would also have overwritten `result` from eval_model.
predictions, raw_outputs = model.predict(word_list)

## BI-LSTM

# submission

In [None]:
# Build one [Id, tag] row per prediction; Ids start at 1.
labels = []
for row_id, p in enumerate(predictions, start=1):
  # Start from 'O' on every iteration so that an empty prediction, or a first
  # entry without any value, never reuses the tag of the previous row.
  tag = 'O'
  if len(p) > 0:
    # Only the first entry of the prediction is used; the last of its values
    # is kept, as before.
    for v in p[0].values():
      tag = v

  labels.append([row_id, tag])
print(len(labels) , labels[:10])

In [None]:
sub = pd.DataFrame(labels, columns=['Id', 'Predicted'])
sub

In [None]:
sub.to_csv('submission.csv', columns=None, header=True, index=False)

In [None]:
csv_sub = pd.read_csv('submission.csv')
csv_sub