### 알파벳 기반 언어 식별 모델
- 데이터셋 : lang.zip 사용해서 생성 -> csv,json....
- 학습방벙 : 지도학습 - 분류 : 다중분류
- 알고리즘 : 다양하게

[1] 모듈 로딩 및 데이터 준비

In [2663]:
# 모듈로딩
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
import torch.optim as optim
from torchmetrics.classification import MulticlassF1Score,F1Score
from torchinfo import summary
import utils

In [2664]:
# 알파벳 분해 함수 생성
def file_make(train):
    if os.path.exists(train):

        files=os.listdir(f'{train}/')
        alphabetDF=pd.DataFrame(columns=[chr(i) for i in range(ord('a'),ord('z')+1)])
        target=[]
        for file in files:
            with open(f'{train}/{file}',mode='r') as f:
                data=f.read()

                # 소문자로
                data=data.lower()

                data=data.replace('\n','')

                for cha in data:
                    if ord('a')>ord(cha) or ord(cha)>ord('z'):
                        data=data.replace(cha,'')
                values=pd.Series(list(data)).value_counts()
                alphabet_prob=pd.DataFrame(values/values.sum())
                alphabet_prob.columns=[file]
                alphabetDF=pd.concat([alphabetDF,alphabet_prob.T])
                label=file[:2]
                target.append(label)
        alphabetDF['target']=target
        return alphabetDF.to_csv(f'{train}.csv')

    else:
        print(f'{train} 폴더가 존재하지 않습니다.')


In [2665]:
# # csv 파일생성
# file_make('train')
# file_make('test')

In [2666]:
# 파일을 불러서 feature와 target분리후 인코딩
def dataload(filename):
    dataDF=pd.read_csv(filename,index_col=[0])
    targetDF=dataDF[dataDF.columns[-1]]
    featureDF=dataDF[dataDF.columns[:-1]]
    return featureDF,targetDF

In [2667]:
train_featureDF,train_targetDF=dataload('train.csv')
test_featureDF,test_targetDF=dataload('test.csv')
train_featureDF=train_featureDF.fillna(0)
test_featureDF=test_featureDF.fillna(0)
print(f'train_feature : {train_featureDF.shape}, train_target : {train_targetDF.shape}')
print(f'test_feature : {test_featureDF.shape}, test_target : {test_targetDF.shape}')

train_feature : (20, 26), train_target : (20,)
test_feature : (8, 26), test_target : (8,)


In [2668]:
# 라벨인코딩
encoder=LabelEncoder()
encoder.fit(train_targetDF)
train_target=encoder.transform(train_targetDF)
test_target=encoder.transform(test_targetDF)
train_target

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])

In [2669]:
test_target

array([0, 0, 1, 1, 2, 2, 3, 3])

In [2670]:
# target 레이블수 확인
target_num=len(pd.Series(train_target).unique())
target_num

4

In [2671]:
train_featureTS=torch.FloatTensor(train_featureDF.values)
train_targetTS=torch.FloatTensor(train_target)
test_featureTS=torch.FloatTensor(test_featureDF.values)
test_targetTS=torch.FloatTensor(test_target)

In [2672]:
model=utils.AlphaMCFModel(train_featureTS.shape[1],target_num,300,200)

In [2673]:
summary(model,input_size=train_featureTS.shape)

Layer (type:depth-idx)                   Output Shape              Param #
AlphaMCFModel                            [20, 4]                   --
├─Linear: 1-1                            [20, 300]                 8,100
├─Linear: 1-2                            [20, 200]                 60,200
├─Linear: 1-3                            [20, 4]                   804
Total params: 69,104
Trainable params: 69,104
Non-trainable params: 0
Total mult-adds (M): 1.38
Input size (MB): 0.00
Forward/backward pass size (MB): 0.08
Params size (MB): 0.28
Estimated Total Size (MB): 0.36

In [2674]:
optimizer=optim.Adam(model.parameters(),lr=0.01)

In [2675]:
EPOCH=100

In [2676]:
scorefu=MulticlassF1Score(num_classes=target_num)

In [2677]:
loss_history=[]
score_history=[]
breaknum=0
for epoch in range(EPOCH):
    model.train()

    pre_y=model(train_featureTS)

    # loss 값
    loss=nn.CrossEntropyLoss()(pre_y,train_targetTS.reshape(-1).long())
    # score 값
    score=scorefu(pre_y,train_targetTS.reshape(-1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_history.append(loss.item())
    score_history.append(score.item())
    print(f'[{epoch+1}/{EPOCH}] loss : {loss}, score :{score}')
    if len(score_history)==1:
        torch.save(model,'best_model.pth')
        torch.save(model.state_dict(),'best_weight.pth')

    else:
        if score_history[-1] <= score_history[-2]:
            breaknum+=1
        elif score_history[-1]>=max(score_history):
            torch.save(model,'best_model.pth')
            torch.save(model.state_dict(),'best_weight.pth')

        
        if breaknum==5:
            break


[1/100] loss : 1.387414574623108, score :0.10000000149011612
[2/100] loss : 1.3735729455947876, score :0.33936652541160583
[3/100] loss : 1.349753737449646, score :0.42627960443496704
[4/100] loss : 1.314959168434143, score :0.3333333432674408
[5/100] loss : 1.2596139907836914, score :0.5784721970558167
[6/100] loss : 1.1850720643997192, score :0.3333333432674408
[7/100] loss : 1.0914340019226074, score :0.5018315315246582
[8/100] loss : 0.9860490560531616, score :0.5625
[9/100] loss : 0.881621241569519, score :0.3333333432674408
[10/100] loss : 0.7878174185752869, score :0.3333333432674408
[11/100] loss : 0.7126022577285767, score :0.8351648449897766
[12/100] loss : 0.6520923376083374, score :0.9494949579238892
[13/100] loss : 0.6063429713249207, score :0.9494949579238892


In [2678]:
BestModel=torch.load('best_model.pth',weights_only=False)
test_y=BestModel(test_featureTS)
scorefu(test_y,test_targetTS.reshape(-1))

tensor(0.8667)