### 【 D0122_work_이준기 】

[26_01_22_과제]
- 알파벳을 사용하는 언어는 알파벳 빈도의 차이로 언어를 식별할 수 있습니다.
- 해당 데이터셋을 활용해서 언어 식별 모델을 생성하세요.
- 데이터셋
  * train 폴더 =>  나라영문2글자-숫자.txt
  * test 폴더  =>  나라영문2글자-숫자.txt

- 데이터셋이 부족하면 Wikipedia에서 텍스트를 추가로 수집해 사용할 수 있습니다.

우선 txt 데이터 파일을 CSV 파일로 변환해야 합니다. 각 문서에서 알파벳 26자(a~z)의 비율을 계산해 CSV로 저장한 뒤, 이후 단계에서 그 CSV를 불러와 사용합니다.

In [None]:
import os                           # 폴더 안 파일 목록 가져오기 위한 운영체제
import csv                          # CSV 파일을 표 형식으로 저장
from collections import Counter     # 리스트 안 값의 빈도 계산 전용
import string                       # 

# Turn one document into the relative frequency of each of the 26 letters a-z
def get_alphabet_ratios(text):
    """Return the share of each letter a-z among the ASCII letters of *text*.

    Case is ignored. Only the 52 ASCII letters are counted; digits,
    punctuation, whitespace and non-ASCII letters (accented characters,
    CJK, ...) are skipped entirely. They therefore affect neither the
    per-letter counts nor the denominator, so the ratios of any document
    that contains at least one ASCII letter sum to 1 (up to rounding).

    Args:
        text: Document contents as a string.

    Returns:
        A dict mapping every letter in ``string.ascii_lowercase`` to its
        ratio, rounded to 4 decimal places. All ratios are 0.0 when the
        text contains no ASCII letters.
    """
    # str.isalpha() is true for every Unicode letter, which would put
    # characters into the denominator that can never be counted as a-z.
    ascii_letters = frozenset(string.ascii_letters)
    frequency = Counter(char.lower() for char in text if char in ascii_letters)
    total = sum(frequency.values())

    # Nothing to count: avoid dividing by zero.
    if not total:
        return {letter: 0.0 for letter in string.ascii_lowercase}

    # Counter returns 0 for letters that never occurred.
    return {letter: round(frequency[letter] / total, 4)
            for letter in string.ascii_lowercase}


# Build a CSV of per-file letter ratios from a folder of .txt documents
def create_frequency_csv(folder_path, output_csv):
    """Write one CSV row of a-z ratios for every ``.txt`` file in a folder.

    Files are visited in sorted name order and read as UTF-8. Each row holds
    the file name in the ``filename`` column followed by the 26 ratios
    returned by ``get_alphabet_ratios``. A confirmation line is printed once
    the CSV has been written.

    Args:
        folder_path: Directory that is scanned (non-recursively) for
            ``.txt`` files.
        output_csv: Path of the CSV file to create or overwrite.
    """
    header = ['filename', *string.ascii_lowercase]
    records = []

    for name in sorted(os.listdir(folder_path)):
        # Anything that is not a .txt document is ignored.
        if not name.endswith('.txt'):
            continue

        with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as source:
            content = source.read()

        record = {'filename': name}
        record.update(get_alphabet_ratios(content))
        records.append(record)

    # newline='' lets the csv module control line endings itself.
    with open(output_csv, 'w', newline='', encoding='utf-8') as target:
        table = csv.DictWriter(target, fieldnames=header)
        table.writeheader()
        table.writerows(records)

    print(f"생성 완료: {output_csv}")

# Run: build one frequency CSV per dataset split (train first, then test)
for folder_path, output_csv in (
    ('./dataset/train/', 'train_frequency.csv'),
    ('./dataset/test/', 'test_frequency.csv'),
):
    create_frequency_csv(folder_path, output_csv)


생성 완료: train_frequency.csv
생성 완료: test_frequency.csv


In [None]:
## 사용해야 할 모듈 로딩
import pandas as pd                                 # 데이터프레임 생성
from sklearn.preprocessing import LabelEncoder      # 파일명 라벨인코딩
import torch                                        # 텐서 및 수치, 기본 함수용 모듈
import torch.nn as nn                               # 인공신경망 관련 모듈
import torch.nn.functional as F                     # 인공신경망 함수 관련 모듈

In [None]:
## Load the train and test letter-frequency CSV files into DataFrames
trainDF, testDF = (
    pd.read_csv(csv_path)
    for csv_path in ('train_frequency.csv', 'test_frequency.csv')
)

In [23]:
## Keep only the first two characters of each file name (the language code,
## e.g. en, fr, id, tl); the 'filename' column is the class label from here on
for frame in (trainDF, testDF):
    frame['filename'] = frame['filename'].str[:2]
trainDF.head()

Unnamed: 0,filename,a,b,c,d,e,f,g,h,i,...,q,r,s,t,u,v,w,x,y,z
0,en,0.076,0.0128,0.0457,0.0461,0.1053,0.0157,0.0192,0.0437,0.074,...,0.0,0.0777,0.0614,0.0805,0.0259,0.0098,0.0141,0.0007,0.02,0.0004
1,en,0.084,0.0199,0.0303,0.0388,0.1367,0.0174,0.0312,0.0274,0.0752,...,0.0055,0.0899,0.0715,0.0776,0.0306,0.0137,0.0139,0.002,0.0107,0.0006
2,en,0.0716,0.0122,0.0456,0.0326,0.1201,0.0147,0.0252,0.0235,0.0946,...,0.0017,0.0539,0.088,0.0811,0.029,0.0188,0.0119,0.0006,0.018,0.0006
3,en,0.072,0.0276,0.0299,0.0395,0.1207,0.0167,0.0235,0.0588,0.065,...,0.0004,0.059,0.0731,0.0934,0.0242,0.0051,0.0195,0.006,0.0175,0.0017
4,en,0.0738,0.0204,0.0311,0.0396,0.1413,0.0204,0.0204,0.0569,0.065,...,0.0004,0.0725,0.0596,0.0955,0.025,0.0107,0.0239,0.0031,0.0149,0.0007


In [24]:
## Separate features (columns 'a' through 'z') from labels ('filename')
def _split_features_labels(frame):
    """Return (feature values, label values) taken from one DataFrame."""
    return frame.loc[:, 'a':'z'].values, frame['filename'].values


X_train, y_train = _split_features_labels(trainDF)
X_test, y_test = _split_features_labels(testDF)


In [None]:
## Convert the string labels to integer class ids; the encoder is fitted on
## the training labels only and then reused for the test labels
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

In [None]:
## Convert to tensors: float32 features, int64 (long) class ids.
## Note that X_train/X_test and y_train/y_test are rebound to tensors here.
X_train, X_test = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train, X_test)
)
y_train, y_test = (
    torch.tensor(class_ids, dtype=torch.long)
    for class_ids in (y_train_enc, y_test_enc)
)

In [None]:
## Model definition
class LangModel(nn.Module):
    """Feed-forward classifier over letter-frequency vectors.

    Two ReLU hidden layers (64 and 32 units) are followed by a linear layer
    that emits one raw logit per class. No softmax is applied here, so use a
    loss that expects logits (e.g. ``nn.CrossEntropyLoss``).

    Args:
        num_classes: Number of output classes (length of the logit vector).
        in_features: Length of each input feature vector. Defaults to 26,
            i.e. one ratio per letter a-z.
    """

    def __init__(self, num_classes, in_features=26):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 64)
        self.fc2 = nn.Linear(64, 32)
        self.out = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return the logits for a batch ``x`` whose last dim is ``in_features``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)


In [None]:
## Model, loss function and optimizer
num_languages = len(le.classes_)  # one output logit per encoded label
model = LangModel(num_classes=num_languages)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [30]:
EPOCHS = 1000
LOG_INTERVAL = 100  # print the loss once every this many epochs

# Full-batch training: every epoch is a single update on all of X_train
for epoch in range(EPOCHS):
    # Clear gradients accumulated by the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    logits = model(X_train)
    loss = loss_fn(logits, y_train)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    if not epoch % LOG_INTERVAL:
        print(f"[{epoch}] loss: {loss.item():.6f}")


[0] loss: 1.388657
[100] loss: 1.202458
[200] loss: 0.695503
[300] loss: 0.482858
[400] loss: 0.222412
[500] loss: 0.080891
[600] loss: 0.030978
[700] loss: 0.014881
[800] loss: 0.008250
[900] loss: 0.005367


In [31]:
def _predict_and_score(inputs, targets):
    """Return (predicted class ids, mean accuracy) of ``model`` on one split."""
    predictions = model(inputs).argmax(dim=1)
    return predictions, (predictions == targets).float().mean()


# Gradients are not needed for evaluation
with torch.no_grad():
    pred_train, train_acc = _predict_and_score(X_train, y_train)
    pred_test, test_acc = _predict_and_score(X_test, y_test)

print(f"Train Acc: {train_acc:.4f}")
print(f"Test  Acc: {test_acc:.4f}")


Train Acc: 1.0000
Test  Acc: 0.8750
