# 불러오기

In [1]:
from collections import defaultdict
import numpy as np


def read_txt(path_to_file):
    txt_ls = []
    label_ls = []

    with open(path_to_file) as f:
        for i, line in enumerate(f.readlines()[1:]):
            id_num, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label.replace('\n','')))
    return txt_ls, label_ls


In [2]:
# 데이터 불러오기
x_train, y_train = read_txt('../ratings_train.txt')
x_test, y_test = read_txt('../ratings_test.txt')

# 비어있는 리뷰 제거

In [3]:
def remove_empty_review(X, Y):
    empty_idx_ls = []
    
    for idx, review in enumerate(X):
        if len(review) == 0:
            empty_idx_ls.append(idx)
    
    # idx 값이 큰 것부터 제거 (앞으로 밀리는 것을 방지)
    empty_idx_ls = sorted(empty_idx_ls, reverse = True)
    
    for empty_idx in empty_idx_ls:
        del X[empty_idx], Y[empty_idx]
    
    return X, Y

In [4]:
x_train, y_train = remove_empty_review(x_train, y_train)
x_test, y_test = remove_empty_review(x_test, y_test)

# embedding with FastText

In [5]:
from gensim.models import FastText
embed_size = 100

x_total = x_train + x_test
x_total = [review.split() for review in x_total]

embedding_model = FastText(
    x_total, 
    size=embed_size,
    window=5, 
    min_count=1,
    min_n=2,
    workers=4, 
    sg=1, 
    )

In [8]:
embedding_model.wv.most_similar('')

MemoryError: 

# CBOW

In [7]:
X_train = []
for sent in x_train:
    temp_cbow = [embedding_model.wv.__getitem__(token) if embedding_model.wv.__contains__(token) else np.zeros((1,embed_size)) for token in sent]
    X_train.append(np.sum(temp_cbow, axis = 0))

In [8]:
X_test = []
for sent in x_test:
    temp_cbow = [embedding_model.wv.__getitem__(token) if embedding_model.wv.__contains__(token) else np.zeros((1,embed_size)) for token in sent]
    X_test.append(np.sum(temp_cbow, axis=0))

# Y 더미화

from collections import defaultdict

l2i_dict = defaultdict(lambda : len(l2i_dict))

y_train = [l2i_dict[y] for y in y_train]
y_test = [l2i_dict[y] for y in y_test]

## **Logistic Regression**

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [10]:
clf = LogisticRegression(solver = 'sag',
                         multi_class = 'multinomial')


clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print('Accuracy : ', accuracy_score(y_pred, y_test))

Accuracy :  0.7477648658919536


# MLP 

In [11]:
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F

## 전처리 

In [12]:
# torch Variable로 변환
def convert_to_long_variable(w2i_ls):
    
    var = Variable(torch.LongTensor(w2i_ls))
    return var

In [13]:
# torch Variable로 변환
def convert_to_float_variable(w2i_ls):
    
    var = Variable(torch.FloatTensor(w2i_ls))
    return var

In [14]:
X_train = convert_to_float_variable(X_train)
X_test = convert_to_float_variable(X_test)

y_train = convert_to_long_variable(y_train)
y_test = convert_to_long_variable(y_test)

In [15]:
class MLP(nn.Module):
    
    def __init__(self, embed_size, hid_size, dropout):
        super(MLP, self).__init__()
        
        self.embed_size = embed_size
        self.hid_size = hid_size
        self.dropout = dropout
        
        self.mlp = nn.Sequential(
            nn.Linear(embed_size, hid_size), nn.Tanh(), nn.Dropout(),
            nn.Linear(hid_size, hid_size), nn.Tanh(), nn.Dropout(),
            nn.Linear(hid_size, hid_size), nn.Tanh(), nn.Dropout(),
            nn.Linear(hid_size, hid_size), nn.Tanh(), nn.Dropout(),
            nn.Linear(hid_size, 1)
        )    
        return
    
    def forward(self, x):
        logit = self.mlp(x)
        sigmoid = torch.sigmoid(logit)
        return sigmoid

In [16]:
EMBED_SIZE = embed_size
HID_SIZE = 100
DROPOUT = 0.5


model = MLP(EMBED_SIZE, HID_SIZE, DROPOUT)

lr = 0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr)

In [17]:
model

MLP(
  (mlp): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=100, out_features=100, bias=True)
    (4): Tanh()
    (5): Dropout(p=0.5)
    (6): Linear(in_features=100, out_features=100, bias=True)
    (7): Tanh()
    (8): Dropout(p=0.5)
    (9): Linear(in_features=100, out_features=100, bias=True)
    (10): Tanh()
    (11): Dropout(p=0.5)
    (12): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [18]:
import random 

epochs = 25
train_size = X_train.size(0)
train_idx = np.arange(train_size)

val_size = 10000
val_idx = np.arange(val_size)

batch_size = 1000
loss_ls = []

for epoch in range(epochs):
    
    # 학습 데이터 셔플링
    random.shuffle(train_idx)
    X_train = X_train[train_idx]
    y_train = y_train[train_idx]
    
    train_loss = 0
    
    for start_idx, end_idx in zip(range(0, train_size, batch_size),
                                  range(batch_size, train_size +1, batch_size)):
        
        X_batch = X_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx]
        
        sigmoid = model(X_batch)
        predict = sigmoid.ge(0.5).squeeze(1).long()
        
        acc = (predict == y_batch).sum().item() / batch_size
        
        loss = criterion(sigmoid.squeeze(1), y_batch.float())
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch+1, train_loss / batch_size, acc))
    print('=================================================================================================')
    
    loss_ls.append(train_loss)
    
    if (epoch+1) % 5 == 0:
        model.eval()
        X_val = X_test[val_idx]
        y_val = y_test[val_idx]
        
        sigmoid = model(X_val)
        predict = sigmoid.ge(0.5).squeeze(1).long()
        
        acc = (predict == y_val).sum().item() / val_size
        loss = criterion(sigmoid.squeeze(1), y_val.float())

        print('*************************************************************************************************')
        print('*************************************************************************************************')
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch+1, loss.item(), acc))
        print('*************************************************************************************************')
        print('*************************************************************************************************')

    

Train epoch : 1,  loss : 0.08104757231473923,  accuracy :0.748
Train epoch : 2,  loss : 0.07757835146784782,  accuracy :0.741
Train epoch : 3,  loss : 0.07705132654309273,  accuracy :0.712
Train epoch : 4,  loss : 0.07629942080378532,  accuracy :0.773
Train epoch : 5,  loss : 0.07631950217485428,  accuracy :0.733
*************************************************************************************************
*************************************************************************************************
Test Epoch : 5, Test Loss : 0.509 , Test Accuracy : 0.744
*************************************************************************************************
*************************************************************************************************
Train epoch : 6,  loss : 0.07254600870609283,  accuracy :0.759
Train epoch : 7,  loss : 0.07138375705480575,  accuracy :0.780
Train epoch : 8,  loss : 0.07097809028625489,  accuracy :0.764
Train epoch : 9,  loss : 0.07041480082273484, 