### 【 D0127_Mini-Project_이준기 】

- 민원 처리 (NLP)
- 텍스트 -> Task 분류

In [16]:
## 모듈 로딩
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

import pandas as pd

In [None]:
## 딥러닝 모듈 로딩
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
from   torch.optim.lr_scheduler import ReduceLROnPlateau

from   torch.utils.data import Dataset, TensorDataset, DataLoader
from   torch.utils.data import random_split
from   sklearn.model_selection import train_test_split

import util_func as uf

import pandas as pd

In [40]:
## 데이터 준비
DATA_FILE = '../Project/PrData/train.csv'

df = pd.read_csv(DATA_FILE)

df.nunique()

NUM_CLASSES = df['label'].nunique()

In [18]:
## 전처리 함수 생성
from konlpy.tag import Okt
import re

okt = Okt()

stopwords = ['합니다', '바랍니다', '부탁', '요청', '제발', '주세요', '하십시오']

def preprocessing(text):
    text = re.sub('[^가-힣 ]', ' ', text) # 한글 + 공백만 남기기

    nouns = okt.nouns(text)

    nouns = [word for word in nouns if word not in stopwords and len(word) > 1]

    return ' '.join(nouns)

In [19]:
## 전처리 잘 되는지 임시 텍스트로 테스트해보기
# text = "초등학교 바로 근처에 피부샵 간판에 '브라질리언'이라고 써있으니 초등생 아이가 그게 뭐냐고 물어봅니다. 선뜻 대답하기 어려웠고 요즘 아이들은 거의 다 스마트폰이 있으니 검색해 보고 음란 사진에 접할까 봐 염려되니 제재 바랍니다."

# text = preprocessing(text)

# print(text)

In [20]:
## 전처리 적용
df['clean_text'] = df['text'].apply(preprocessing)

In [41]:
df_s = df.iloc[:, 1:]
df_s

Unnamed: 0,label
0,건축허가
1,건축허가
2,건축허가
3,건축허가
4,건축허가
...,...
799995,환경미화
799996,환경미화
799997,환경미화
799998,환경미화


In [42]:
print(df_s.shape)

(800000, 1)


In [43]:
## ----------------------------------------------------------
## 비율 확인
## ----------------------------------------------------------
df_s['clean_text'].value_counts(normalize=True)

KeyError: 'clean_text'

In [None]:
## 타겟 컬럼 라벨 인코딩
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_s['label_idx'] = le.fit_transform(df_s['label'])


18


In [None]:
from collections import Counter

counter = Counter()
for text in df_s['clean_text']:
    counter.update(text.split())

vocab = {word: idx+2 for idx, (word, _) in enumerate(counter.most_common())}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

VOCAB_SIZE = len(vocab)
print("VOCAB SIZE:", VOCAB_SIZE)


In [None]:
MAX_LEN = 50

def text_to_seq(text):
    return [vocab.get(word, 1) for word in text.split()]

def pad_seq(seq, max_len=MAX_LEN):
    return seq[:max_len] + [0] * (max_len - len(seq))

df_s['seq'] = df_s['clean_text'].apply(text_to_seq)
df_s['seq_pad'] = df_s['seq'].apply(pad_seq)


In [None]:
## ----------------------------------------------------------
## 데이터 나누기
## ----------------------------------------------------------
featureDF = pd.DataFrame(df_s['seq_pad'].to_list())
targetDF  = df_s[['label_idx']]

In [None]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

BS      = 64
EPOCHS = 10
LR     = 1e-3
MAX_LEN = 50

In [None]:
class TextDataset(Dataset):

    def __init__(self, featureDF, targetDF):
        self.xTS = torch.tensor(featureDF.values, dtype=torch.long)
        self.yTS = torch.tensor(targetDF.values.squeeze(), dtype=torch.long)
        self.length = featureDF.shape[0]

    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        return self.xTS[idx], self.yTS[idx]


In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(
    featureDF,
    targetDF,
    test_size=0.2,
    random_state=10,
    stratify=targetDF
)

trainDS = TextDataset(x_train, y_train)
validDS = TextDataset(x_valid, y_valid)

trainDL = DataLoader(trainDS, batch_size=BS, shuffle=True)
validDL = DataLoader(validDS, batch_size=BS)


In [None]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)     # (B, T, E)
        _, (h, _) = self.lstm(x)
        out = self.fc(h[-1])      # (B, C)
        return out


In [None]:
model = TextClassifier(
    vocab_size=VOCAB_SIZE,
    embed_dim=128,
    hidden_dim=128,
    num_classes=NUM_CLASSES
).to(DEVICE)

loss_fn   = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)


In [None]:
history = {
    "train_loss": [],
    "val_loss": [],
    "train_f1": [],
    "val_f1": []
}


In [None]:
BEST_F1 = 0.0

for epoch in range(EPOCHS):

    # ======================
    # Train
    # ======================
    model.train()
    train_loss = 0
    y_true, y_pred = [], []

    for x, y in trainDL:
        x, y = x.to(DEVICE), y.to(DEVICE)

        optimizer.zero_grad()
        logits = model(x)
        loss = loss_fn(logits, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * x.size(0)

        preds = torch.argmax(logits, dim=1)
        y_true.extend(y.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

    train_loss /= len(trainDL.dataset)
    train_f1 = f1_score(y_true, y_pred, average="macro")

    # ======================
    # Validation
    # ======================
    model.eval()
    valid_loss = 0
    y_true, y_pred = [], []

    with torch.no_grad():
        for x, y in validDL:
            x, y = x.to(DEVICE), y.to(DEVICE)
            logits = model(x)
            loss = loss_fn(logits, y)

            valid_loss += loss.item() * x.size(0)

            preds = torch.argmax(logits, dim=1)
            y_true.extend(y.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())

    valid_loss /= len(validDL.dataset)
    valid_f1 = f1_score(y_true, y_pred, average="macro")

    history["train_loss"].append(train_loss)
    history["val_loss"].append(valid_loss)
    history["train_f1"].append(train_f1)
    history["val_f1"].append(valid_f1)

    print(
        f"[{epoch+1:02d}] "
        f"LOSS train/valid = {train_loss:.4f} / {valid_loss:.4f} | "
        f"F1 train/valid = {train_f1:.4f} / {valid_f1:.4f}"
    )

    if valid_f1 > BEST_F1:
        BEST_F1 = valid_f1
        torch.save(model.state_dict(), "best_text_model.pth")
