In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Input CSV read with pd.read_csv, and the folder the train/valid/test CSVs are written to.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
raw_data_path = '/ntdev/youngri/YR_LAB/pytorch_textclassification/dataset/news.csv'
destination_folder = '/ntdev/youngri/YR_LAB/pytorch_textclassification/destination'

# Passed as train_size to the first train_test_split: the share of each class kept for
# train+valid; the remainder becomes the test set.
# NOTE(review): 0.1 keeps only 10% of the rows for train+valid — confirm this is intended.
train_test_ratio = 0.1
# Passed as train_size to the second train_test_split: the share of the train+valid pool
# that is used for training (the rest is validation).
train_valid_ratio = 0.8

# Number of leading words trim_string keeps from each text.
first_n_words = 200

# Helper that truncates text so training runs faster
def trim_string(x, n_words=None):
    """Return ``x`` reduced to its first ``n_words`` whitespace-separated words.

    Args:
        x: The string to truncate.
        n_words: Maximum number of words to keep. When ``None`` (the default)
            the module-level ``first_n_words`` setting is used, which is the
            behaviour existing one-argument callers rely on.

    Returns:
        The kept words joined by single spaces (runs of whitespace in the
        original string are therefore collapsed).
    """
    if n_words is None:
        n_words = first_n_words
    # maxsplit stops splitting once enough words were found, so very long
    # documents are not tokenised in full; the slice drops the unsplit tail.
    words = x.split(maxsplit=n_words)
    return ' '.join(words[:n_words])



In [2]:
# Raw data: load the source CSV from raw_data_path
df_raw = pd.read_csv(raw_data_path)
# Bare expression so the notebook displays the loaded frame
df_raw

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [3]:
# Column cleanup. A new frame is derived instead of overwriting df_raw: the label
# mapping only works on the original 'REAL'/'FAKE' strings, so mutating df_raw in
# place made a second run of this cell relabel every row as 0 and then fail in
# train_test_split on the empty FAKE frame.
df_clean = df_raw.assign(
    label=(df_raw['label'] == 'FAKE').astype('int'),  # REAL to 0 and FAKE to 1
    titletext=df_raw['title'] + "." + df_raw['text'],
).reindex(columns=['label', 'title', 'text', 'titletext'])

# Drop rows whose text is (almost) empty, i.e. shorter than 5 characters
df_clean = df_clean[~(df_clean['text'].str.len() < 5)].copy()

# Apply the trim function
df_clean['text'] = df_clean['text'].apply(trim_string)
df_clean['titletext'] = df_clean['titletext'].apply(trim_string)

# Split by label so both classes are divided with the same ratios
df_real = df_clean[df_clean['label'] == 0]
df_fake = df_clean[df_clean['label'] == 1]

# (train + valid) / test split
# NOTE(review): train_test_ratio is passed as train_size, so with 0.1 only 10% of each
# class goes to train+valid and 90% to test — confirm this is intended.
df_real_full_train, df_real_test = train_test_split(df_real, train_size=train_test_ratio, random_state=1)
df_fake_full_train, df_fake_test = train_test_split(df_fake, train_size=train_test_ratio, random_state=1)

# train / valid split
df_real_train, df_real_valid = train_test_split(df_real_full_train, train_size=train_valid_ratio, random_state=1)
df_fake_train, df_fake_valid = train_test_split(df_fake_full_train, train_size=train_valid_ratio, random_state=1)

# Concatenate both classes for train / valid / test
df_train = pd.concat([df_real_train, df_fake_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_real_valid, df_fake_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_real_test, df_fake_test], ignore_index=True, sort=False)

# Save the three splits as CSV files
df_train.to_csv(destination_folder + '/train.csv', index=False)
df_valid.to_csv(destination_folder + '/valid.csv', index=False)
df_test.to_csv(destination_folder + '/test.csv', index=False)

print('-- done --')

-- done --


In [4]:
# Peek at the first two training rows
df_train.head(2)

Unnamed: 0,label,title,text,titletext
0,0,Obamacare's unlikely No. 1 city,"Killing Obama administration rules, dismantlin...",Obamacare's unlikely No. 1 city.Killing Obama ...
1,0,New York restores order for 2016 front-runners,Hillary Clinton and Donald Trump scored resoun...,New York restores order for 2016 front-runners...


In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
 
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import spacy

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
# Use the GPU when one is available, otherwise fall back to the CPU so that every
# later .to(device) / map_location=device call still works on a CPU-only machine.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import torch.optim as optim

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [9]:
# Declare how each CSV column is parsed with Fields, then load the train/valid/test files

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
# The label column comes first; the three text columns all share text_field
fields = [('label', label_field)] + [(column, text_field) for column in ('title', 'text', 'titletext')]

# Load the three CSV files as tabular datasets (the header row is skipped)
train, valid, test = TabularDataset.splits(
    path=destination_folder,
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    format='CSV',
    fields=fields,
    skip_header=True,
)

# Create the iterators used for batch learning. Examples of similar length are put
# into the same batch to minimise padding.
train_iter, valid_iter, test_iter = [
    BucketIterator(
        split,
        batch_size=32,
        sort_key=lambda example: len(example.text),
        device=device,
        sort=True,
        sort_within_batch=True,
    )
    for split in (train, valid, test)
]

# Build the vocabulary (only tokens that occur at least 3 times)
text_field.build_vocab(train, min_freq=3)

In [10]:
# Model definition

class LSTM(nn.Module):
    """Single-layer bidirectional LSTM that scores a token-id sequence with one probability.

    Args:
        dimension: Hidden size of each LSTM direction.
        vocab_size: Number of rows in the embedding table. When ``None`` (the
            default) it is read from ``len(text_field.vocab)``, so the
            module-level ``text_field`` vocabulary must already be built.
        embedding_dim: Size of each token embedding (300 by default).
    """

    def __init__(self, dimension=128, vocab_size=None, embedding_dim=300):
        super(LSTM, self).__init__()

        if vocab_size is None:
            vocab_size = len(text_field.vocab)

        self.embedding = nn.Embedding(vocab_size, embedding_dim)   # embed tokens into embedding_dim (300 by default)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True) # run in both directions

        self.drop = nn.Dropout(p=0.5)
        self.fc = nn.Linear(2*dimension, 1) # map the concatenated forward/backward features to a single value

    def forward(self, text, text_len):
        """Return one sigmoid probability per sequence in the batch.

        Args:
            text: Padded token ids, batch first (batch, sequence).
            text_len: Length of each sequence in ``text`` before padding.

        Returns:
            A 1-D tensor with one value in (0, 1) per batch element.
        """
        text_emb = self.embedding(text)
        # pack_padded_sequence lets the LSTM skip the padded positions; the lengths
        # must live on the CPU, and enforce_sorted=False accepts any batch order.
        # Reference: https://simonjisu.github.io/nlp/2018/07/05/packedsequence.html
        packed_input = pack_padded_sequence(text_emb,
                                            text_len.cpu(),
                                            batch_first=True,
                                            enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Forward direction: state at the last real token of every sequence.
        out_forward = output[range(len(output)), text_len - 1, :self.dimension]
        # Backward direction: state at the first token (where the reverse pass ends).
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        text_fea = self.fc(text_fea)
        text_fea = torch.squeeze(text_fea, 1)
        # Fixed: the original read `text.fea`, an attribute lookup on the input
        # tensor that raised AttributeError on every forward pass.
        text_out = torch.sigmoid(text_fea)

        return text_out

In [9]:
# Helper functions needed before training

def save_checkpoint(save_path, model, optimizer, valid_loss):
    """Write the model state, optimizer state and validation loss to ``save_path``.

    Does nothing when ``save_path`` is ``None``.
    """
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model, optimizer):
    """Restore ``model`` and ``optimizer`` in place from a checkpoint file.

    The file is expected to hold the keys written by ``save_checkpoint``
    ('model_state_dict', 'optimizer_state_dict', 'valid_loss'); tensors are
    mapped onto the module-level ``device``.

    Returns:
        The validation loss stored in the checkpoint, or ``None`` when
        ``load_path`` is ``None`` (nothing is loaded in that case).
    """
    if load_path is None:
        return

    state_dict = torch.load(load_path, map_location=device)
    print(f'Model load from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])

    return state_dict['valid_loss']

def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Write the recorded loss curves and their step numbers to ``save_path``.

    Does nothing when ``save_path`` is ``None``.
    """
    if save_path is None:
        return

    state_dict = {'train_loss_list':train_loss_list,
                  'valid_loss_list':valid_loss_list,
                  'global_steps_list':global_steps_list}

    torch.save(state_dict, save_path)
    # The original message said "Model saved", which made metric saves look
    # like model checkpoints in the training log.
    print(f'Metrics saved to ==> {save_path}')

def load_metrics(load_path):
    """Read back the loss curves stored by ``save_metrics``.

    Returns:
        A ``(train_loss_list, valid_loss_list, global_steps_list)`` tuple, or
        ``None`` when ``load_path`` is ``None``.
    """
    if load_path is None:
        return

    state_dict = torch.load(load_path, map_location=device)
    # The original message said "Model loaded" although this file holds metrics.
    print(f'Metrics loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [10]:
# Training function

def train(
    model,
    optimizer,
    criterion = nn.BCELoss(), # Loss for the binary case: computes only the cross entropy, no softmax included.
    train_loader = train_iter,
    valid_loader = valid_iter,
    num_epochs = 5,
    eval_every = len(train_iter) // 2,
    file_path = destination_folder,
    best_valid_loss = float('Inf')
    ):
    """Train ``model`` on the 'titletext' column and validate it periodically.

    Every ``eval_every`` optimisation steps the model is evaluated on
    ``valid_loader``; the averaged train/valid losses are recorded and printed.
    When the validation loss improves, the model is checkpointed to
    ``file_path + '/model.pt'`` and the loss history to
    ``file_path + '/metrics.pt'``. The loss history is written once more when
    training ends.

    Args:
        model: Module called as ``model(titletext, titletext_len)``.
        optimizer: Optimizer that updates ``model``.
        criterion: Loss applied to ``(output, labels)``.
        train_loader: Iterable of training batches.
        valid_loader: Iterable of validation batches.
        num_epochs: Number of passes over ``train_loader``.
        eval_every: Number of training steps between two validation runs.
        file_path: Folder that receives ``model.pt`` and ``metrics.pt``.
        best_valid_loss: Validation loss a checkpoint has to beat.

    Note:
        The default loaders, ``eval_every`` and ``file_path`` are evaluated once,
        when this ``def`` is executed, so the cell must be re-run after the
        iterators are rebuilt.
    """
    # A very small training set makes len(train_iter) // 2 equal 0, which would
    # raise ZeroDivisionError in the modulo below; evaluate every step instead.
    eval_every = max(1, eval_every)

    model_path = file_path + '/model.pt'
    metrics_path = file_path + '/metrics.pt'

    # initialise running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list, valid_loss_list, global_step_list = [], [], []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (labels, (title, title_len), (text, text_len), (titletext, titletext_len)), _ in train_loader:
            labels = labels.to(device)
            titletext = titletext.to(device)
            titletext_len = titletext_len.to(device)
            output = model(titletext, titletext_len)

            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():
                    # validation loop
                    for (labels, (title, title_len), (text, text_len), (titletext, titletext_len)), _ in valid_loader:
                        labels = labels.to(device)
                        titletext = titletext.to(device)
                        titletext_len = titletext_len.to(device)
                        output = model(titletext, titletext_len)

                        loss = criterion(output, labels)
                        valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_step_list.append(global_step)

                # resetting running values
                running_loss = 0.0
                valid_running_loss = 0.0
                model.train()

                # print
                print('Epoch [{}/{}], Step [{}/{}], Train Loss {:.4f}, Valid Loss {:.4f}'\
                    .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader), average_train_loss, average_valid_loss))

                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(model_path, model, optimizer, best_valid_loss)
                    # Fixed: the folder and '/metrics.pt' were passed as two separate
                    # arguments, which raised TypeError (save_metrics takes one path).
                    save_metrics(metrics_path, train_loss_list, valid_loss_list, global_step_list)

    save_metrics(metrics_path, train_loss_list, valid_loss_list, global_step_list)
    print('--Done--')

In [11]:
# model = LSTM().to(device)
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# train(model=model, optimizer=optimizer, num_epochs=1)