In [1]:
import numpy as np
import pandas as pd

In [2]:
# Directory holding the JSON splits; the trailing slash lets file names be
# appended directly.
data_path = 'data/'

# Read the train and test files, in that order.
train_df, test_df = (
    pd.read_json(data_path + file_name)
    for file_name in ('train.json', 'test.json')
)

In [3]:
def merge_title_text(df):
    """Return a copy of ``df`` with a ``merged`` column: ``title + ' ' + text``.

    Args:
        df: DataFrame with ``title`` and ``text`` columns.

    Returns:
        A new DataFrame with every original column plus ``merged``. The
        input frame is not modified.

    Missing titles or texts (None/NaN) are treated as empty strings, so a
    single incomplete row does not abort the merge with a TypeError.
    """
    # Vectorised string concatenation instead of a row-by-row Python loop.
    title = df['title'].fillna('').astype(str)
    text = df['text'].fillna('').astype(str)

    return df.assign(merged=title + ' ' + text)

In [4]:
# Add the combined title+text column to both splits (train first, then test).
train_df_merged, test_df_merged = map(merge_title_text, (train_df, test_df))

In [5]:
# Merged title+text strings as plain Python lists.
train_list = train_df_merged['merged'].tolist()
test_list = test_df_merged['merged'].tolist()

# Shift every rating down by one; the prediction cell adds the 1 back before
# writing the submission.
train_rating = [stars - 1 for stars in train_df_merged['rating']]

In [6]:
from sklearn.model_selection import train_test_split 
# Hold out 10% of the labelled data for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_list,
    train_rating,
    test_size=0.1,
    random_state=31,
)

In [7]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from collections import Counter

In [8]:
def encode(text, word2index, N, tokenizer=None):
    """Turn a sentence into a fixed-length list of vocabulary indices.

    Args:
        text: Sentence to encode.
        word2index: Dict mapping words to their integer indices.
        N: Length of the returned list. Longer inputs are truncated and
            shorter ones are right-padded with 0.
        tokenizer: Optional callable that splits ``text`` into tokens.
            Defaults to ``word_tokenize``.

    Returns:
        A list of exactly ``N`` integers.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    # A plain dict.get(word) returns None for out-of-vocabulary words, which
    # produces an object array that cannot be converted to a tensor. Map such
    # words to the 'unk' entry instead (0 if the vocabulary has none).
    unk_index = word2index.get('unk', 0)
    indices = [word2index.get(token, unk_index)
               for token in list(tokenizer(text))[:N]]

    # Right-pad with zeros up to the requested length.
    return indices + [0] * (N - len(indices))

In [9]:
# Count every token that occurs in the training and validation texts.
# NOTE(review): the vocabulary includes validation tokens, so the validation
# set never contains an out-of-vocabulary word.
counts = Counter()
for split in (X_train, X_val):
    for text in split:
        counts.update(word_tokenize(text))

# Index 0 is reserved for 'unk' (and is also the padding value used by
# encode); real tokens are numbered from 1 in first-seen order.
word2index = {'unk': 0}
word2index.update((word, index) for index, word in enumerate(counts, start=1))

# Every review is truncated or padded to this many tokens.
max_len = 12

train_encoded = [encode(text, word2index, max_len) for text in X_train]
val_encoded = [encode(text, word2index, max_len) for text in X_val]

train_x = np.array(train_encoded)
train_y = np.array(y_train)
val_x = np.array(val_encoded)
val_y = np.array(y_val)

batch_size = 32

train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
val_ds = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))

train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
# Validation must score every held-out sample: no shuffling, and the final
# partial batch is kept rather than dropped.
val_dl = DataLoader(val_ds, shuffle=False, batch_size=batch_size, drop_last=False)

In [10]:
# Hyperparameters read by the model definition and the optimizer.

# Embedding
src_vocab_size = len(word2index)   # one embedding row per vocabulary entry
dimension_model = 32               # embedding width, also the LSTM input size

# Recurrent stack
hidden_size = 30                   # LSTM hidden state width
num_layers = 5                     # number of stacked LSTM layers
dropout = 0.2                      # dropout passed to torch.nn.LSTM

# Classifier head
linear_hidden_size = 10            # width of the intermediate linear layer
classes = 5                        # size of the output layer

# Optimisation
lr = 0.001                         # learning rate handed to Adam

# LSTM classifier: embedding -> stacked LSTM -> two linear layers.
class LSTM(torch.nn.Module):
    """Sequence classifier built from an embedding, an LSTM and a linear head.

    Every constructor argument is optional. An argument left as ``None``
    falls back to the notebook-level hyperparameter of the corresponding
    role, so ``LSTM()`` builds exactly the network it always did, while
    explicit arguments allow a different configuration without touching
    module globals.

    Args:
        vocab_size: Number of embedding rows (default ``src_vocab_size``).
        embedding_dim: Embedding width (default ``dimension_model``).
        lstm_hidden_size: LSTM hidden width (default ``hidden_size``).
        lstm_layers: Number of stacked LSTM layers (default ``num_layers``).
        head_hidden_size: Width of the first linear layer
            (default ``linear_hidden_size``).
        num_classes: Number of output scores (default ``classes``).
        dropout_p: Dropout between LSTM layers (default ``dropout``).
    """

    def __init__(self, vocab_size=None, embedding_dim=None, lstm_hidden_size=None,
                 lstm_layers=None, head_hidden_size=None, num_classes=None,
                 dropout_p=None):
        super().__init__()

        # Resolve defaults lazily so that the globals are only needed when
        # the caller does not supply a value.
        if vocab_size is None:
            vocab_size = src_vocab_size
        if embedding_dim is None:
            embedding_dim = dimension_model
        if lstm_hidden_size is None:
            lstm_hidden_size = hidden_size
        if lstm_layers is None:
            lstm_layers = num_layers
        if head_hidden_size is None:
            head_hidden_size = linear_hidden_size
        if num_classes is None:
            num_classes = classes
        if dropout_p is None:
            dropout_p = dropout

        # Attribute names are part of the checkpoint format (state_dict keys)
        # and must not change.
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.lstm = torch.nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            # Inter-layer dropout has no effect with a single layer and makes
            # PyTorch emit a warning, so it is disabled in that case.
            dropout=dropout_p if lstm_layers > 1 else 0.0,
        )
        self.linear = torch.nn.Linear(lstm_hidden_size, head_hidden_size)
        self.linear1 = torch.nn.Linear(head_hidden_size, num_classes)

    def forward(self, data):
        """Return class scores for a batch of index sequences.

        Args:
            data: Integer tensor of token indices; the first two axes are
                swapped before the LSTM, which is built without
                ``batch_first`` (presumably data is (batch, seq_len), as the
                DataLoaders produce).

        Returns:
            Tensor of unnormalised class scores, one row per sequence.
        """
        embedded = self.embed(data)
        outputs, _ = self.lstm(embedded.transpose(0, 1))

        # Classify from the LSTM output at the final time step.
        return self.linear1(self.linear(outputs[-1]))

In [11]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``; return mean loss and accuracy.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        iterator: Iterable of ``(inputs, targets)`` batches, e.g. a DataLoader.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(scores, targets)``. It is
            assumed to return the mean over the batch (the default for
            ``torch.nn.CrossEntropyLoss``).

    Returns:
        ``(mean_loss, accuracy)`` as Python floats, averaged over every
        sample seen. With equally sized batches this equals the mean of the
        per-batch values.

    Raises:
        ValueError: If ``iterator`` yields no samples.
    """
    # Run on whatever device the model lives on, rather than relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    total_correct = 0
    total_seen = 0

    model.train()

    for batch in iterator:
        text = batch[0].to(run_device)
        # The loss expects integer class indices as int64.
        target = batch[1].to(run_device).long()

        optimizer.zero_grad()
        preds = model(text)
        loss = criterion(preds, target)
        loss.backward()
        optimizer.step()

        # Accuracy is measured on the scores computed before this step.
        _, pred = torch.max(preds, 1)
        batch_len = target.size(0)
        total_loss += loss.item() * batch_len
        total_correct += (pred == target).sum().item()
        total_seen += batch_len

    if total_seen == 0:
        raise ValueError('train() received no samples')

    return total_loss / total_seen, total_correct / total_seen


def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        iterator: Iterable of ``(inputs, targets)`` batches, e.g. a DataLoader.
        criterion: Loss called as ``criterion(scores, targets)``. It is
            assumed to return the mean over the batch (the default for
            ``torch.nn.CrossEntropyLoss``).

    Returns:
        ``(mean_loss, accuracy)`` as Python floats, averaged over every
        sample seen, so a smaller final batch is weighted correctly.

    Raises:
        ValueError: If ``iterator`` yields no samples.
    """
    # Run on whatever device the model lives on, rather than relying on a
    # notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    total_correct = 0
    total_seen = 0

    model.eval()

    # No gradients are needed for scoring; skipping autograd saves memory
    # and time.
    with torch.no_grad():
        for batch in iterator:
            text = batch[0].to(run_device)
            # The loss expects integer class indices as int64.
            target = batch[1].to(run_device).long()

            preds = model(text)
            loss = criterion(preds, target)
            _, pred = torch.max(preds, 1)

            batch_len = target.size(0)
            total_loss += loss.item() * batch_len
            total_correct += (pred == target).sum().item()
            total_seen += batch_len

    if total_seen == 0:
        raise ValueError('evaluate() received no samples')

    return total_loss / total_seen, total_correct / total_seen

In [12]:
# Fresh network, loss and optimizer for this training run.
model = LSTM().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Best validation accuracy seen so far and the checkpoint file holding it.
best_acc = 0
best_path = ''

for epoch in range(10):
    train_loss, train_acc = train(model, train_dl, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, val_dl, criterion)

    # One call, three lines: same text as three consecutive prints.
    print(
        f'Epoch: {epoch+1:02}, ',
        f'Train Loss: {train_loss:.3f},Train Acc: {train_acc * 100:.2f}%,',
        f'Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc * 100:.2f}%\n',
        sep='\n',
    )

    # Nothing to save unless validation accuracy strictly improved.
    if not valid_acc > best_acc:
        continue

    best_acc = valid_acc
    best_path = f"epoch{epoch+1}_val.accuracy{valid_acc*100:.1f}%.pt"
    torch.save(
        {
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': valid_loss,
        },
        best_path,
    )

Epoch: 01, 
Train Loss: 1.447,Train Acc: 31.61%,
Val. Loss: 1.276, Val. Acc: 42.89%

Epoch: 02, 
Train Loss: 1.193,Train Acc: 47.44%,
Val. Loss: 1.161, Val. Acc: 47.31%

Epoch: 03, 
Train Loss: 1.091,Train Acc: 51.29%,
Val. Loss: 1.163, Val. Acc: 49.54%

Epoch: 04, 
Train Loss: 1.038,Train Acc: 53.91%,
Val. Loss: 1.118, Val. Acc: 49.83%

Epoch: 05, 
Train Loss: 0.988,Train Acc: 56.43%,
Val. Loss: 1.111, Val. Acc: 49.31%

Epoch: 06, 
Train Loss: 0.946,Train Acc: 58.71%,
Val. Loss: 1.124, Val. Acc: 50.75%

Epoch: 07, 
Train Loss: 0.908,Train Acc: 60.76%,
Val. Loss: 1.155, Val. Acc: 50.34%

Epoch: 08, 
Train Loss: 0.870,Train Acc: 63.00%,
Val. Loss: 1.172, Val. Acc: 50.80%

Epoch: 09, 
Train Loss: 0.833,Train Acc: 64.99%,
Val. Loss: 1.221, Val. Acc: 50.29%

Epoch: 10, 
Train Loss: 0.799,Train Acc: 67.04%,
Val. Loss: 1.252, Val. Acc: 50.77%



In [13]:
def encode_test(text, word2index, N, tokenizer=None):
    """Encode unseen text as a fixed-length list of vocabulary indices.

    Words missing from ``word2index`` are mapped to the index stored under
    ``'unk'`` (or 0 when the vocabulary has no such entry).

    Args:
        text: Sentence to encode.
        word2index: Dict mapping words to their integer indices.
        N: Length of the returned list. Longer inputs are truncated and
            shorter ones are right-padded with 0.
        tokenizer: Optional callable that splits ``text`` into tokens.
            Defaults to ``word_tokenize``.

    Returns:
        A list of exactly ``N`` integers.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    # A single lookup with a default replaces the earlier rewrite-then-lookup
    # double pass over the token list.
    unk_index = word2index.get('unk', 0)
    encoded = [word2index.get(token, unk_index)
               for token in list(tokenizer(text))[:N]]

    # Right-pad with zeros up to the requested length.
    encoded.extend([0] * (N - len(encoded)))

    return encoded

In [17]:
# Encode the test texts with the same sequence length (12) that the training
# and validation sets are encoded with, so the model sees inputs of the shape
# it was trained on.
test_encoded = [encode_test(text, word2index, 12) for text in test_list]

test_x = np.array(test_encoded)
test_ds = TensorDataset(torch.from_numpy(test_x))
# batch_size is left at its default of 1, so each batch holds a single review;
# shuffle=False keeps predictions in file order.
test_dl = DataLoader(test_ds, shuffle=False)

In [20]:
# Rebuild the network and restore the best checkpoint written during training.
model = LSTM().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
checkpoint = torch.load(best_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# The optimizer state is restored too, so training could be resumed from here.
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
model.eval()

predict = []
# Inference only: disabling autograd avoids building a graph for every batch.
with torch.no_grad():
    for i, batch in enumerate(test_dl):
        text = batch[0].to(device)
        preds = model(text)
        _, pred = torch.max(preds, 1)
        # +1 undoes the "rating - 1" shift applied to the training labels;
        # extend() keeps this correct for any batch size.
        predict.extend((pred + 1).tolist())
        if i % 100 == 0:
            print(len(predict))

# Submission file: one "index_<n>" row per test review, in file order.
indices = ['index_' + str(i) for i in range(len(predict))]
data = {"index": indices ,"rating": predict}
out_df = pd.DataFrame(data, columns=["index", "rating"])
out_df.to_csv('result.csv', index=False, header=True)

1
101
201
301
401
501
601
701
801
901
1001
1101
1201
1301
1401
1501
1601
1701
1801
1901
2001
2101
2201
2301
2401
2501
2601
2701
2801
2901
3001
3101
3201
3301
3401
3501
3601
3701
3801
3901
4001
4101
4201
4301
4401
4501
4601
4701
4801
4901
5001
5101
5201
5301
5401
5501
5601
5701
5801
5901
6001
6101
6201
6301
6401
6501
6601
6701
6801
6901
7001
7101
7201
7301
7401
7501
7601
7701
7801
7901
8001
8101
8201
8301
8401
8501
8601
8701
8801
8901
9001
9101
9201
9301
9401
9501
9601
9701
9801
9901
10001
10101
10201
10301
10401
10501
10601
10701
10801
10901
11001
11101
11201
11301
11401
11501
11601
11701
11801
11901
12001
12101
12201
12301
12401
12501
12601
12701
12801
12901
13001
13101
13201
13301
13401
13501
13601
13701
13801
13901
14001
14101
14201
14301
14401
14501
14601
14701
14801
14901
15001
15101
15201
15301
15401
15501
15601
15701
15801
15901
16001
16101
16201
16301
16401
16501
16601
16701
16801
16901
17001
17101
17201
17301
17401
17501
17601
17701
17801
17901
18001
18101
18201
18301
18401
18