In [1]:
import os
import pandas as pd
import torch
import RNNutils as utils

In [2]:
DATA_PATH='data/'
file_list=os.listdir(DATA_PATH)[:-1]
print(file_list)

['0', '1', '2', '3', '4', '5', '6', '7']


In [3]:
texts=[]
labels=[]
for label in file_list:
    text_list=os.listdir(f'{DATA_PATH}/{label}')
    for tt in text_list:
        with open(f'{DATA_PATH}/{label}/{tt}') as f:
            a=f.read()
            texts.append(a)
            labels.append(int(label))

In [4]:
print(f'texts number : {len(texts)}')
print(f'labels number : {len(labels)}')

texts number : 1600
labels number : 1600


In [5]:
data=pd.DataFrame()
data['text']=texts
data['label']=labels
data

Unnamed: 0,text,label
0,"동남아 담당' 北 최희철 부상 베이징 도착…싱가포르행 주목\t최 부상, 행선지·방문...",0
1,"예결위, 추경 막바지 심사 진통…여야 충돌\t(서울=연합뉴스) 김남권 기자 = 국회...",0
2,외압 논란·항명 사태…산 넘고 물 건넌 권성동 영장 청구\t안미현 검사 외압 폭로 ...,0
3,"친문 홍영표, 문빠에 찍혔다…특검 합의에 문자폭탄 공격\t대표적인 친(親)문재인계인...",0
4,"北, 연일 南비난…韓美정상회담 전 경고성 메시지 발신\t南, 맥스선더·태영호 등 불...",0
...,...,...
1595,"단일팀 추진' 대한카누연맹, 데상트코리아와 5년 후원 협약\t[스포티비뉴스=조형애 ...",7
1596,"올림픽 성공 뒷이야기... 서울대, 16일 이희범 평창 조직위원장 초청 특강\t[O...",7
1597,21일 개막 호치민3쿠션월드컵에 149명 참가 확정\t韓선수 46명 참가…1차 예선...,7
1598,"스포츠안전재단, 대축전에 안전필요성 알려\t[스타뉴스 채준 기자]\n\n\n스포츠안...",7


In [6]:
# 데이터셋 분리

train=data.sample(frac=0.9,random_state=609)
test=data.drop(train.index)
print(f'train shape {train.shape}')
print(f'test shape {test.shape}')

train shape (1440, 2)
test shape (160, 2)


In [7]:
# 토큰화, 불용어 제거
from konlpy.tag import Okt

tokenizer=Okt()

traintoken=utils.getToken(train.text,tokenizer)
testtoken=utils.getToken(test.text,tokenizer)

In [8]:
# 단어사전 생성 및 넘버링
vocab=utils.get_vocab(traintoken,n_vocab=10000)

token_to_idx={token:idx for idx,token in enumerate(vocab)}
idx_to_token={idx:token for idx,token in enumerate(vocab)}

print(token_to_idx)

{'<pad>': 0, '<oov>': 1, '일': 2, '있다': 3, '이': 4, '하다': 5, '년': 6, '돼다': 7, '않다': 8, '것': 9, '으로': 10, '말': 11, '전': 12, '월': 13, '없다': 14, '받다': 15, '보다': 16, '기자': 17, '의': 18, '명': 19, '에': 20, '한국': 21, '수': 22, '을': 23, '밝히다': 24, '서울': 25, '은': 26, '늘다': 27, '가다': 28, '오다': 29, '위': 30, '지난': 31, '이라고': 32, '따르다': 33, '이다': 34, '는': 35, '에는': 36, '날': 37, '인': 38, '아니다': 39, '되다': 40, '북한': 41, '시간': 42, '고': 43, '중': 44, '같다': 45, '대해': 46, '원': 47, '대한': 48, '미국': 49, '성': 50, '당': 51, '를': 52, '대통령': 53, '통해': 54, '이번': 55, '분': 56, '중국': 57, '관련': 58, '때문': 59, '크다': 60, '사람': 61, '경찰': 62, '나오다': 63, '가': 64, '보이다': 65, '라고': 66, '이후': 67, '장': 68, '정부': 69, '그': 70, '대표': 71, '조사': 72, '많다': 73, '사실': 74, '문제': 75, '후보': 76, '함께': 77, '대회': 78, '경우': 79, '상황': 80, '관계자': 81, '김': 82, '라며': 83, '점': 84, '지난해': 85, '대다': 86, '들다': 87, '혐의': 88, '진행': 89, '의원': 90, '최근': 91, '연': 92, '현재': 93, '확인': 94, '시장': 95, '좋다': 96, '도': 97, '자신': 98, '주장': 99, '달': 100, '열리다': 101, '예정

In [9]:
trainnum=utils.padding_vectorize(traintoken,token_to_idx,pad_length=100)
testnum=utils.padding_vectorize(testtoken,token_to_idx,pad_length=100)

In [10]:
import torch

train_text=torch.tensor(trainnum)
train_label=torch.FloatTensor(train.label.values)

test_text=torch.tensor(testnum)
test_label=torch.FloatTensor(test.label.values)


In [11]:
from torch.utils.data import TensorDataset,DataLoader

trainDS=TensorDataset(train_text,train_label)
testDS=TensorDataset(test_text,test_label)

trainDL=DataLoader(trainDS,batch_size=16)
testDL=DataLoader(testDS,batch_size=len(testDS))

In [18]:
model=utils.SentenceClassifier(n_vocab=len(token_to_idx),hidden_dim=128,embedding_dim=64,n_layers=4,feature_n=len(file_list),dropout=0.75)

import torch.optim as optim
import torch.nn as nn
from torchmetrics.classification import MulticlassF1Score
import torch.optim.lr_scheduler as lr_scheduler

optimizer=optim.Adam(model.parameters(),lr=0.001)
lossfun=nn.CrossEntropyLoss()
scorefun=MulticlassF1Score(num_classes=len(file_list))
scheduler=lr_scheduler.ReduceLROnPlateau(optimizer,patience=10,factor=0.2,mode='max')


In [None]:
EPOCH=100
TV=utils.Train_val(trainDL,testDL,model,optimizer,lossfun,scorefun)
train=TV.train(EPOCH,scheduler,1)

[1/100]
train loss 2.084336611959669, train score 0.06729557654923862
test loss 2.0833353996276855, test score 0.02777777798473835
scheduler.num_bad_epochs 0/10
[2/100]
train loss 2.0732486804326373, train score 0.05688813890640934
test loss 2.115917444229126, test score 0.05732716619968414
scheduler.num_bad_epochs 0/10
[3/100]
train loss 2.0617363651593528, train score 0.07423974575681819
test loss 2.0982398986816406, test score 0.06480655074119568
scheduler.num_bad_epochs 0/10
[4/100]
train loss 2.0380484647221033, train score 0.08565807797842556
test loss 2.048877716064453, test score 0.07866300642490387
scheduler.num_bad_epochs 0/10
[5/100]
train loss 1.9888842317793105, train score 0.10941580581582255
test loss 2.0010218620300293, test score 0.1257525533437729
scheduler.num_bad_epochs 0/10
[6/100]
train loss 1.8976804044511584, train score 0.14168372895154688
test loss 1.9744577407836914, test score 0.13449358940124512
scheduler.num_bad_epochs 0/10
[7/100]
train loss 1.85296386082

[20/100]
train loss 0.29009385150339867, train score 0.873984322945277  
test loss 2.666813373565674, test score 0.5284493565559387  
scheduler.num_bad_epochs 0/10  

2번  
[41/100]
train loss 0.0016216536575585552, train score 0.9968253976768917  
test loss 3.6139883995056152, test score 0.4708749055862427  
scheduler.num_bad_epochs 0/10  