In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

In [2]:
chats_data = pd.read_csv('../preprocessing/preprocessing.csv')

In [3]:
chats_data.head()

Unnamed: 0,messages,sentiment
0,"['selamat', 'pagi', 'mohon', 'maaf', 'ganggu',...",1
1,"['whatsapp', 'alaykumsalam', 'via', 'whatsapp'...",1
2,"['perihal', 'bimbing', 'tentu', 'jam']",1
3,"['kamis', 'jumat']",1
4,"['laksana', 'p0', 'tentu', 'bahas', 'bimbing',...",1


#### 1. convert messages into list

In [6]:
messages_list = list()

for message in chats_data['messages']:
    message = message.replace('[', '').replace('\'', '').replace(']', '')
    message_list = message.split(', ')
    messages_list.append(message_list)

In [7]:
print(len(messages_list))
print(messages_list[:5])

1004
[['selamat', 'pagi', 'mohon', 'maaf', 'ganggu', 'mahasiswa', 'teknik', 'informatika', 'angkat', 'informasi', 'bimbing', 'skripsi', 'dosen', 'bimbing', 'perihal', 'surat', 'tugas', 'terbit', 'tanggal', 'oktober', 'komunikasi', 'kait', 'kerja', 'skripsi', 'media', 'iya', 'mohon', 'maaf', 'terima', 'kasih', 'wassalamualaikum'], ['whatsapp', 'alaykumsalam', 'via', 'whatsapp', 'iya'], ['perihal', 'bimbing', 'tentu', 'jam'], ['kamis', 'jumat'], ['laksana', 'p0', 'tentu', 'bahas', 'bimbing', 'iya']]


#### 2. make token set list from dataset

In [8]:
token_set_list = list()

if messages_list:
    for message in messages_list:
        for word in message:
            if word not in token_set_list:
                token_set_list.append(word)

In [9]:
print(len(token_set_list))
print(token_set_list[:4])

1299
['selamat', 'pagi', 'mohon', 'maaf']


#### 3. convert message word to int

In [10]:
messages_list_int = list()

if token_set_list:
    for message_list in messages_list:
        new_message_list = list()
        for message in message_list:
            new_message_list.append(token_set_list.index(message)+1)
        messages_list_int.append(new_message_list)

In [11]:
print(len(messages_list_int))
print(messages_list_int[:5])

1004
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 11, 14, 15, 16, 17, 18, 19, 20, 21, 22, 12, 23, 24, 3, 4, 25, 26, 27], [28, 29, 30, 28, 24], [14, 11, 31, 32], [33, 34], [35, 36, 31, 37, 11, 24]]


#### 4. padding messages list int (so each list have same size)

In [12]:
messages_list_int_pad = list()
list_size = len(token_set_list)

if messages_list_int:
    for message_list in messages_list_int:
        new_message_list = list()
        size_diff = list_size - len(message_list)
        for each in range(size_diff):
            new_message_list.append(0)
        new_message_list += message_list
        messages_list_int_pad.append(new_message_list)

In [13]:
print(len(messages_list_int_pad))
print(messages_list_int_pad[-1][1200:-1])

1004
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 21, 12, 95, 119, 58, 12, 79, 58, 12, 25, 26, 164]


#### change negative sentiment labels from -1 to 0

In [14]:
for i, row in chats_data.iterrows():
    sentiment = row['sentiment']
    
    if sentiment == -1:
        #replace current message value in df
        chats_data.at[i, 'sentiment'] = 0

#### 5. train, valid, and test dataset split

In [15]:
data_size = chats_data.shape[0]
split_perc = 0.8
split_lim = int(split_perc * data_size)

train_data = messages_list_int_pad[0:split_lim]
train_labels = chats_data['sentiment'][0:split_lim]

remain_data = messages_list_int_pad[split_lim:]
remain_labels = chats_data['sentiment'][split_lim:]

remain_data_size = len(remain_data)
split_lim = int(0.5 * remain_data_size)

valid_data = remain_data[0:split_lim]
valid_labels = remain_labels[0:split_lim]

test_data = remain_data[split_lim:]
test_labels = remain_labels[split_lim:]

In [16]:
print(
    f'train data size: {len(train_data)}, labels size: {len(train_labels)}',
    f'valid data size: {len(valid_data)}, labels size: {len(valid_labels)}',
    f'test data size: {len(test_data)}, labels size: {len(test_labels)}',
    f'total: {len(train_data) + len(valid_data) + len(test_data)}',
    sep='\n'
)

train data size: 803, labels size: 803
valid data size: 100, labels size: 100
test data size: 101, labels size: 101
total: 1004


#### 6. create torch dataloader

In [17]:
train_tensor_data = TensorDataset(
    torch.ShortTensor(train_data),
    torch.from_numpy(train_labels.values)
)
valid_tensor_data = TensorDataset(
    torch.ShortTensor(valid_data),
    torch.from_numpy(valid_labels.values)
)
test_tensor_data = TensorDataset(
    torch.ShortTensor(test_data),
    torch.from_numpy(test_labels.values)
)

train_loader = DataLoader(train_tensor_data, shuffle=True)
valid_loader = DataLoader(valid_tensor_data, shuffle=True)
test_loader = DataLoader(test_tensor_data, shuffle=True)

In [18]:
data_iter = iter(train_loader)
sample_x, sample_y = data_iter.next()

print(
    f'sample input size: {sample_x.size()}',
    'sample input:',
    f'{sample_x}\n',
    f'sample label size: {sample_y.size()}',
    'sample label:',
    sample_y,
    sep='\n'
)

sample input size: torch.Size([1, 1299])
sample input:
tensor([[  0,   0,   0,  ..., 444,  13, 138]], dtype=torch.int16)

sample label size: torch.Size([1])
sample label:
tensor([0])


In [39]:
class SentimentLSTM(nn.Module):
    def __init__(
        self, 
        embed_dim,
        hidden_dim,
        vocab_size,
        target_size
    ):
        super().__init__()
        
        self.lstm = nn.LSTM(embed_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, target_size)
        self.sig = nn.Sigmoid()
    
    def forward(self, session_array):
        #session array ex: [[1, 2, 4, ..], [90, 12, 91, ..]]
        #make session array as 3 dim (use unsqueeze)
        session_array = session_array.unsqueeze(1)
        
        lstm_out, _ = self.lstm(session_array)
        
        #squeeze lstm_out
        lstm_out = lstm_out.squeeze()
        
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = self.sig(tag_space)
        print(tag_scores)

In [42]:
test = messages_list_int_pad[0:5]
test = torch.ShortTensor(test)
print(test)

tensor([[ 0,  0,  0,  ..., 25, 26, 27],
        [ 0,  0,  0,  ..., 30, 28, 24],
        [ 0,  0,  0,  ..., 11, 31, 32],
        [ 0,  0,  0,  ...,  0, 33, 34],
        [ 0,  0,  0,  ..., 37, 11, 24]], dtype=torch.int16)


In [None]:
model = SentimentLSTM(
    embed_dim=6,
    hidden_dim=
)