# Load data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
!wget https://storage.googleapis.com/ss4-exp-datasource/legalAct/legal-act-classification.zip

In [None]:
!mkdir dataset
!unzip /content/legal-act-classification.zip -d dataset

In [None]:
df_train = pd.read_csv("/content/dataset/train.csv")
df_train.head()

In [None]:
df_test= pd.read_csv("/content/dataset/test.csv")
df_test.head()

In [None]:
df_raw = pd.read_csv('dataset/raw.csv')
df_raw.head()

In [None]:
df_committee = pd.read_csv('dataset/committee.csv')
df_committee.head()

# EDA

## Train set

In [None]:
# Column names of the training frame.
df_train.columns

In [None]:
# Dtypes and non-null counts for every column.
df_train.info()

In [None]:
# Rows where `legal_act` is missing.
df_train[df_train['legal_act'].isnull()]

In [None]:
# How often each distinct `context` value occurs.
df_train['context'].value_counts()

In [None]:
# How often each distinct `pattern` value occurs.
df_train['pattern'].value_counts()

In [None]:
# How often each distinct `legal_act` value occurs
# (value_counts leaves NaN out by default).
df_train['legal_act'].value_counts()

In [None]:
# How often each distinct `condition` value occurs.
df_train['condition'].value_counts()

## Test set

In [None]:
# Dtypes and non-null counts for every column of the test frame.
df_test.info()

In [None]:
# How often each distinct `context` value occurs.
df_test['context'].value_counts()

In [None]:
# How often each distinct `pattern` value occurs.
df_test['pattern'].value_counts()

In [None]:
# How often each distinct `legal_act` value occurs
# (value_counts leaves NaN out by default).
df_test['legal_act'].value_counts()

In [None]:
# How often each distinct `condition` value occurs.
df_test['condition'].value_counts()

# Modeling

## Process

In [None]:
# Replace missing `condition` values in the training frame with a Thai
# placeholder string (roughly "no condition").
df_train['condition'] = df_train['condition'].fillna("ไม่มีเงื่อนไข")

In [None]:
# Replace missing `legal_act` values in the training frame with a Thai
# placeholder string (roughly "no action").
df_train['legal_act'] = df_train['legal_act'].fillna("ไม่มีการกระทำ")

In [None]:
# Same `condition` placeholder for the test frame.
# NOTE(review): `legal_act` is filled for train but not for test; neither column
# is passed to the model in the dataset-building cells below, so this currently
# has no effect on predictions — confirm before using these columns as features.
df_test['condition'] = df_test['condition'].fillna("ไม่มีเงื่อนไข")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the training rows for validation. Stratifying on the
# `answer` column and fixing the seed keeps the split repeatable across runs.
train, validation = train_test_split(
    df_train,
    stratify=df_train["answer"],
    test_size=0.25,
    random_state=888,
)
(train.shape, validation.shape)

In [None]:
!pip install datasets

In [None]:
import datasets

# Columns handed to the model. `legal_act` and `condition` are currently
# excluded; add them here to carry them into the datasets as well.
MODEL_COLUMNS = ("question", "context", "answer")


def _frame_to_dataset(frame):
    """Build a `datasets.Dataset` from the MODEL_COLUMNS of a pandas frame."""
    return datasets.Dataset.from_dict({name: frame[name] for name in MODEL_COLUMNS})


train_set = _frame_to_dataset(train)
val_set = _frame_to_dataset(validation)

(train_set, val_set)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier both start from the same pretrained checkpoint.
# NOTE(review): no `num_labels` is passed, so the classification head takes its
# size from the checkpoint/config default — confirm that this matches the number
# of distinct `answer` classes.
BASE_CHECKPOINT = "studio-ousia/mluke-base"

tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(BASE_CHECKPOINT)

In [None]:
# Inspect the first converted training example.
train_set[0]

In [None]:
# Scratch check: apply `str.strip()` to one sample sentence, the same call that
# preprocess_function applies to every context.
[q.strip() for q in ["นางสาวนภัสกร แซ่เนี้ยว ลงลายมือชื่อ เว้นแต่การทำธุรกรรมทางการเงินให้นางสาวนภัสกร แซ่เนี้ยว และ นายอภิวิชษ์ สายภู่ ลงลายมือชื่อร่วมกัน"]]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of (context, question) pairs and attach their labels.

    Args:
        examples: batch mapping with "context", "question" and "answer" columns.

    Returns:
        The output of the globally bound `tokenizer` for the pairs, padded or
        truncated to 512 tokens, with `examples["answer"]` stored under "labels".
    """
    # Only the context is whitespace-trimmed; questions are passed through as-is.
    trimmed_contexts = [text.strip() for text in examples["context"]]
    tokenizer_options = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
    }
    encoded = tokenizer(trimmed_contexts, examples["question"], **tokenizer_options)
    encoded["labels"] = examples["answer"]
    return encoded

# Tokenize both splits batch-wise (train first, then validation). The source
# text columns are dropped so only the tokenizer outputs and `labels` remain.
tokenized_train, tokenized_validation = (
    split.map(preprocess_function, batched=True, remove_columns=split.column_names)
    for split in (train_set, val_set)
)
tokenized_train

In [None]:
from transformers import DefaultDataCollator

# Collator handed to the Trainer below.
# NOTE(review): presumably no dynamic padding is needed here because
# preprocess_function already pads every example to max_length — confirm.
data_collator = DefaultDataCollator()
data_collator

## Training

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the training cell below sets
# push_to_hub=True and a later cell calls trainer.push_to_hub().
notebook_login()

In [None]:
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# Newer transformers releases renamed TrainingArguments' `evaluation_strategy`
# to `eval_strategy` and Trainer's `tokenizer` to `processing_class`. Because
# transformers is installed unpinned in this notebook, use whichever keyword the
# installed version actually accepts instead of hard-coding one of them.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

training_args = TrainingArguments(
    output_dir="mruk_law",
    gradient_accumulation_steps=4,  # 8 samples x 4 steps per optimizer update, per device
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    weight_decay=0.01,
    push_to_hub=True,
    logging_dir="logs",
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},  # evaluate on the validation split after each epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

trainer.train()

In [None]:
# Upload the trained model through the Trainer (requires the Hub login above).
trainer.push_to_hub()

# Evaluation

In [None]:
!pip install transformers[torch]
!pip install accelerate -U

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fine-tuned checkpoint used for evaluation and prediction.
# NOTE(review): the training cell writes to output_dir "mruk_law", while this
# loads "nattawatWe/legal_exp" — confirm this is the intended checkpoint.
tokenizer = AutoTokenizer.from_pretrained("nattawatWe/legal_exp")

# Use the first GPU when one is available and fall back to CPU otherwise, so the
# cell does not fail on a CPU-only runtime.
model_device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained("nattawatWe/legal_exp", device_map=model_device)

In [None]:
# Inspect the first validation example before tokenizing.
val_set[0]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of (context, question) pairs and attach their labels.

    Redefined in the Evaluation section; it reads the module-level `tokenizer`
    at call time, so it uses whichever tokenizer is currently bound.

    Args:
        examples: batch mapping with "context", "question" and "answer" columns.

    Returns:
        The tokenizer output for the pairs, padded or truncated to 512 tokens,
        with `examples["answer"]` stored under "labels".
    """
    # Only the context is whitespace-trimmed; questions are passed through as-is.
    trimmed_contexts = [text.strip() for text in examples["context"]]
    tokenizer_options = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
    }
    encoded = tokenizer(trimmed_contexts, examples["question"], **tokenizer_options)
    encoded["labels"] = examples["answer"]
    return encoded

# Re-tokenize the validation split with the tokenizer that is currently loaded;
# the source text columns are dropped from the result.
validation_text_columns = val_set.column_names
tokenized_validation = val_set.map(
    preprocess_function,
    remove_columns=validation_text_columns,
    batched=True,
)
tokenized_validation

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator
from tqdm.auto import tqdm

data_collator = default_data_collator
validation_dataloader = DataLoader(
    tokenized_validation,
    batch_size=8,
    collate_fn=data_collator,
)

# Collect one predicted class index per validation example, in dataset order.
predictions = []
for batch in tqdm(validation_dataloader):
    # Move every tensor of the batch (labels included) onto the model's device.
    inputs = {name: tensor.to(model.device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Index of the highest logit along the class dimension.
    predicted_class = outputs.logits.argmax(dim=1)
    predictions += predicted_class.tolist()

print(predictions)

In [None]:
def micro_precision_recall_f1(y_true, y_pred):
    """Return (TP, FP, FN, precision, recall, F1) using micro-averaged counting.

    Each correct prediction counts as one true positive. Each wrong prediction
    counts as one false positive and one false negative, so FP always equals FN
    and precision, recall and F1 all equal plain accuracy.

    Args:
        y_true: sequence of reference labels.
        y_pred: sequence of predicted labels, aligned with `y_true`.

    Returns:
        Tuple `(tp, fp, fn, precision, recall, f1)`. The three ratios are 0.0
        when their denominator is zero (empty input or no correct prediction)
        instead of raising ZeroDivisionError.

    Raises:
        ValueError: if the two sequences differ in length, e.g. because
            `predictions` was overwritten with the test-set predictions.
    """
    y_true, y_pred = list(y_true), list(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"label/prediction length mismatch: {len(y_true)} vs {len(y_pred)}"
        )

    tp = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    fp = fn = len(y_true) - tp

    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator else 0.0
    return tp, fp, fn, precision, recall, f1


# Read the labels from the same Dataset that produced `predictions`, as a whole
# column rather than row by row.
TP, FP, FN, precision, recall, f1_score = micro_precision_recall_f1(
    val_set["answer"], predictions
)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

# Predict

In [None]:
import datasets

# Only the two text fields the model reads are packed; no label column is included.
test = datasets.Dataset.from_dict(
    {column: df_test[column] for column in ("question", "context")}
)

test

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator
from tqdm.auto import tqdm

def preprocess_function(examples):
    """Tokenize a batch of (context, question) pairs for prediction.

    Unlike the training/evaluation versions of this function, no "labels" entry
    is added to the result.

    Args:
        examples: batch mapping with "context" and "question" columns.

    Returns:
        The output of the globally bound `tokenizer` for the pairs, padded or
        truncated to 512 tokens.
    """
    # Only the context is whitespace-trimmed; questions are passed through as-is.
    trimmed_contexts = [text.strip() for text in examples["context"]]
    tokenizer_options = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(trimmed_contexts, examples["question"], **tokenizer_options)

# Tokenize the test split batch-wise; the source text columns are dropped so
# only the tokenizer outputs remain.
test_text_columns = test.column_names
tokenized_test = test.map(
    preprocess_function,
    remove_columns=test_text_columns,
    batched=True,
)
tokenized_test

In [None]:
# Report whether a CUDA device is available.
# NOTE(review): `device` is only displayed; the prediction loop below moves
# tensors to `model.device` and does not read this variable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
data_collator = default_data_collator
test_dataloader = DataLoader(
    tokenized_test,
    batch_size=64,
    collate_fn=data_collator,
)

# Collect one predicted class index per test example, in dataset order.
# Note that this rebinds `predictions`, replacing the validation predictions.
predictions = []
for batch in tqdm(test_dataloader):
    # Move every tensor of the batch onto the model's device.
    inputs = {name: tensor.to(model.device) for name, tensor in batch.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Index of the highest logit along the class dimension.
    predicted_class = outputs.logits.argmax(dim=1)
    predictions += predicted_class.tolist()

print(predictions)

In [None]:
# Submission template shipped with the dataset. The path is relative to the
# working directory (the "dataset/" folder the archive was extracted into)
# rather than the Colab-only /content prefix.
submission = pd.read_csv('dataset/sample_submission.csv')
submission

In [None]:
# Write the predictions into the `answer` column with a single positional
# .iloc assignment. The previous chained form (`submission['answer'][3:] = ...`)
# assigns through an intermediate Series, which triggers
# SettingWithCopyWarning and does not update the frame under pandas
# Copy-on-Write.
# NOTE(review): rows 0-2 keep the values from the sample file and the first
# three predictions are discarded — confirm this offset is intentional.
answer_column = submission.columns.get_loc('answer')
submission.iloc[3:, answer_column] = predictions[3:]
submission

In [None]:
# Cast the answers to integers first ...
submission['answer'] = submission['answer'].astype(int)

In [None]:
# ... then to strings, so the column holds the text form of those integers.
submission['answer'] = submission['answer'].astype(str)

In [None]:
# Final look at the frame before it is written out.
submission

In [None]:
# Write the submission file into the working directory, without the index column.
submission.to_csv('the_art_of_state.csv',index = False)