In [None]:
!pip install pandas

In [None]:
import os
from tqdm import tqdm
import pandas as pd
import re

## 데이터 로드

In [None]:
# Read the profile data; the file uses a backtick (`) as its field separator.
file_path = 'rocket_profile.csv'
train = pd.read_csv(
    file_path,
    sep='`',
    encoding='utf-8',
)
# Print the column names, dtypes and non-null counts of the loaded frame.
train.info()

## 데이터 전처리

In [None]:
# Build the working frame without the '이름' column.
# A new frame is created instead of dropping in place on an alias of `train`:
# `train` stays untouched, and the cell can be re-run safely (the in-place
# version raises KeyError on a second run because the column is already gone).
df = train.drop(columns=['이름'])
df.head()

In [None]:
def normalize_text(text):
    """Return ``text`` with characters that cannot be encoded as UTF-8 removed.

    The string is encoded to UTF-8 with the ``ignore`` error handler, which
    silently drops unencodable code points, and the bytes are decoded back
    into a ``str``.
    """
    utf8_bytes = text.encode("utf-8", errors="ignore")
    return utf8_bytes.decode()

In [None]:
# Convert every cell to str and strip characters that cannot be UTF-8 encoded.
# A column-wise map replaces the per-cell `.iat` loop: it avoids one Python-level
# setitem per cell, and it does not write str values into columns that may still
# have a numeric dtype (an implicit upcast that recent pandas versions deprecate).
df = df.apply(lambda column: column.map(lambda value: normalize_text(str(value))))
df.tail()

In [None]:
!pip install datasets
!pip install -U scikit-learn

In [None]:
import datasets
from datasets import load_metric, Dataset, load_dataset

In [None]:
# Split into train / validation sets: 10% of the rows are held out for validation,
# and the fixed random_state keeps the split reproducible between runs.
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(df, test_size=0.1, random_state=210)

train_df.head()

In [None]:
# Convert both splits to `Dataset` objects and group them in a DatasetDict
# under the keys "train" and "valid".
# NOTE(review): `Dataset.from_dict` is handed a DataFrame here; `Dataset.from_pandas`
# looks like the intended entry point for DataFrames - confirm before changing.
train_dataset = Dataset.from_dict(train_df)
valid_dataset = Dataset.from_dict(valid_df)
dataset = datasets.DatasetDict(
    {
        "train": train_dataset,
        "valid": valid_dataset,
    }
)
dataset

## 토큰화

In [None]:
!pip install transformers
!pip install 'git+https://git@github.com/SKTBrain/KoBERT.git@master'
!pip install 'git+https://github.com/SKTBrain/KOBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
import numpy as np
import pickle
import torch,gc
from kobert_tokenizer import KoBERTTokenizer
from kobert import get_pytorch_kobert_model
from transformers import RobertaModel, TrainingArguments, Trainer

In [None]:
# Environment check: report the installed Selenium webdriver version.
# NOTE(review): nothing else in the visible notebook uses selenium - confirm this cell is still needed.
from selenium import webdriver

print(f"Selenium webdriver Version: {webdriver.__version__}")

In [None]:
# Tokenization / training hyperparameters.
MAX_LEN = 128    # max_length handed to the tokenizer
batch_size = 16  # per-device batch size for both training and evaluation
epoch_num = 8    # number of training epochs

# Evaluation metric; the same name is used to select the best checkpoint.
# NOTE(review): `load_metric` has been deprecated upstream in favour of the separate
# `evaluate` package - confirm against the installed `datasets` version.
metric_name = "accuracy"
metric = load_metric(metric_name)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` is unpacked into a ``(predictions, labels)`` pair. The
    predicted class of each row is the argmax over axis 1 of the predictions,
    and the result of ``metric.compute`` for those classes against the
    labels is returned.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Load the pretrained KoBERT tokenizer.
tokenizer = KoBERTTokenizer.from_pretrained("skt/kobert-base-v1")
# Over-long inputs are truncated from the left, i.e. the end of the text is kept.
tokenizer.truncation_side = "left"

In [None]:
def tokenize_function(sample):
    """Tokenize the '프로젝트' field of ``sample`` with the module-level ``tokenizer``.

    Inputs are padded (``padding=True``) and truncated to at most ``MAX_LEN``
    tokens. Token type ids and the attention mask are not requested, so they
    are left out of the returned encoding.
    """
    tokenizer_options = dict(
        padding=True,
        max_length=MAX_LEN,
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=False,
    )
    return tokenizer(sample['프로젝트'], **tokenizer_options)

In [None]:
# Smoke-test the tokenizer wrapper on the first five training rows.
sample_batch = dataset["train"][:5]
tokenize_function(sample_batch)

In [None]:
# Tokenize every split in batches; the raw text columns are removed afterwards so
# only the tokenizer output remains.
encoded_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['관심분야', '소개', '경력', '활동분야', '프로젝트'],
)

In [None]:
# Persist the tokenized dataset to the file "encoded_dataset" with pickle.
with open("encoded_dataset", mode="wb") as file:
    pickle.dump(encoded_dataset, file)

In [None]:
# Reload the pickled tokenized dataset and print its summary.
# NOTE: pickle.load executes arbitrary code from the file - only load files this notebook wrote.
with open("encoded_dataset", mode="rb") as file:
    encoded_dataset = pickle.load(file)
    print(encoded_dataset)

## 모델 구성

In [None]:
# Load the pretrained KoBERT model together with its vocabulary.
# NOTE(review): this model is handed to Trainer unchanged further down - confirm it
# exposes the classification head / loss that the accuracy metric presumes.
model, vocab = get_pytorch_kobert_model()

In [None]:
# Training configuration. Checkpoints and logs are written under "test_koBERT".
args = TrainingArguments(
    output_dir="test_koBERT",
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=epoch_num,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    do_train=True,
    do_eval=True,
    # Reload the checkpoint with the best `metric_name` score when training ends.
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Keep every dataset column instead of letting Trainer drop unused ones.
    remove_unused_columns=False,
)

In [None]:
# Wire model, arguments, tokenized splits, tokenizer and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Free memory before training: run the Python garbage collector first, then
# release the cached CUDA memory held by PyTorch.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run training, then save the resulting model under "koBERT_model1".
trainer.train()
trainer.save_model(output_dir="koBERT_model1")

## TEST

In [None]:
# Read the test data; like the training file it is backtick-delimited.
file_path = "test.csv"
test = pd.read_csv(
    file_path,
    sep='`',
    encoding='utf-8',
)
# Print the column names, dtypes and non-null counts of the loaded frame.
test.info()

In [None]:
# Preprocess test.csv the same way as the training data.
# The '이름' column is dropped without `inplace=True`; errors='ignore' keeps the cell
# re-runnable once the column is already gone (the in-place drop raised KeyError on
# a second run).
test = test.drop(columns=['이름'], errors='ignore')

# Convert every cell to str and strip characters that cannot be UTF-8 encoded.
# A column-wise map replaces the per-cell `.iat` loop and avoids writing str values
# into columns that may still have a numeric dtype.
test = test.apply(lambda column: column.map(lambda value: normalize_text(str(value))))
test

In [None]:
# Write the cleaned test frame to disk without the index column.
test.to_csv("preprocessed_test.csv", index=False)

In [None]:
# Load the preprocessed CSV through `datasets`; the data ends up under the "train" key.
TEST = "preprocessed_test.csv"
test_dataset = load_dataset("csv", data_files=TEST)['train']
# Tokenize each test row and remove the raw text columns.
test_dataset = test_dataset.map(
    tokenize_function,
    remove_columns=['관심분야', '소개', '경력', '활동분야', '프로젝트'],
)

In [None]:
# Run inference on the test set.
predictions = trainer.predict(test_dataset)

# `trainer.predict` returns a PredictionOutput object, not a str, so passing it
# straight to `write()` raises TypeError. The raw scores are reduced to class ids
# the same way `compute_metrics` does (argmax over axis 1) and written one per line.
# NOTE(review): assumes `predictions.predictions` is a 2-D score array - confirm for this model.
predicted_labels = np.argmax(predictions.predictions, axis=1)
with open("test_result.txt", "w", encoding="utf-8") as fi:
    fi.write("\n".join(str(label) for label in predicted_labels))