In [2]:
import pandas as pd
import numpy as np

from eunjeon import Mecab

import torch
import torch.nn.functional as F
import torchtext.transforms as T

from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.optim import AdamW

from torch.utils.data import Dataset
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torchtext.nn import MultiheadAttentionContainer, InProjContainer, ScaledDotProduct
from torch.nn.utils.rnn import pad_sequence
from torchtext.data import get_tokenizer
from torchtext.vocab import vocab, Vocab
from torchtext.transforms import VocabTransform, ToTensor

from sklearn.model_selection import train_test_split

import os

import sentencepiece as spm


# Dataset

In [3]:
# Load the MBTI posts CSV; the path is relative to the current working directory.
dataset = pd.read_csv(
    os.path.join('..', 'mbti_y', 'mbti_clean.csv')
)

In [4]:
# Show the loaded frame as the cell output.
dataset

Unnamed: 0,type,posts
0,INFJ,moments sportscenter top ten plays pranks lif...
1,ENTP,finding lack posts alarming sex boring positio...
2,INTP,good one course say know blessing curse absolu...
3,INTJ,dear enjoyed conversation day esoteric gabbin...
4,ENTJ,fired another silly misconception approaching ...
...,...,...
8670,ISFP,always think cats fi doms reason websites beco...
8671,ENFP,thread already exists someplace else heck dele...
8672,INTP,many questions things would take purple pill p...
8673,INFP,conflicted right comes wanting children honest...


In [5]:
# Row-shuffled copy of the full frame. The fixed random_state makes the
# shuffled order reproducible across kernel restarts; without it every
# run produced a different order.
dataset_shuffled = dataset.sample(frac=1, random_state=12)

In [6]:
# Show the shuffled frame as the cell output.
dataset_shuffled

Unnamed: 0,type,posts
5679,INTJ,odds wrong accept wrong wrong likely wrong rem...
7843,ENTJ,anyone taking nootropics take affect cognitive...
5640,INFJ,op use psychotropic medication like antidepres...
1912,INFJ,hmmm happy family life children love change cr...
3935,ENFP,well typing celebrities complicated really see...
...,...,...
8501,INTJ,missed good one many babies take paint wall de...
1858,INFJ,inregardstomyself oh thinking thing actually m...
2376,INFP,say asl die stranger going thank stranger curi...
803,INFJ,hardly ever come onto site anymore say rationa...


In [7]:
# Inputs are the post texts; targets are the MBTI type labels.
x, y = dataset['posts'], dataset['type']

# First cut: keep 60% for training and hold out 40%, stratified on the label.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y,
    test_size=0.4,
    shuffle=True,
    stratify=y,
    random_state=12,
)

# Second cut: split the held-out part in half into validation and test,
# again stratified on the label.
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp,
    test_size=0.5,
    shuffle=True,
    stratify=y_temp,
    random_state=12,
)

In [8]:
# Print the input and label shapes of each split, followed by a blank line.
for split_x, split_y in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    print(split_x.shape)
    print(split_y.shape)
    print()


(5205,)
(5205,)

(1735,)
(1735,)

(1735,)
(1735,)



# Text (Input) Preprocessing

### tokenization

### vocab

In [9]:

# Dump the training posts, one per line, as the SentencePiece training corpus.
with open('mbti_text.txt', 'w', encoding='utf8') as corpus_file:
    corpus_file.write('\n'.join(x_train))

# Train a BPE model on that corpus. With model_prefix=mbti_tokens the trainer
# produces the mbti_tokens.model / mbti_tokens.vocab files read below.
spm.SentencePieceTrainer.Train(
    '--input=mbti_text.txt '
    '--model_prefix=mbti_tokens '
    '--vocab_size=5000 '
    '--model_type=bpe '
    '--max_sentence_length=9999'
)

# Load the trained model into the processor that `encode` uses.
vocab_file = "mbti_tokens.model"
sp = spm.SentencePieceProcessor()
sp.load(vocab_file)

# Peek at the first ten entries of the generated vocabulary file.
vocab_list = pd.read_csv('mbti_tokens.vocab', sep='\t', header=None)
vocab_list[:10]

Unnamed: 0,0,1
0,<unk>,0
1,<s>,0
2,</s>,0
3,in,0
4,▁s,-1
5,er,-2
6,▁t,-3
7,on,-4
8,re,-5
9,ing,-6


In [11]:
def encode(x):
    """Return the ids that the module-level `sp` processor produces for `x`."""
    token_ids = sp.encode_as_ids(x)
    return token_ids

def tokenize(df_x):
    """Run `encode` over `df_x` through its `.apply` method and return the result."""
    encoded = df_x.apply(encode)
    return encoded

In [12]:
def record2tensor(x):
    """Convert one record (e.g. a list of token ids) into a float32 tensor.

    Uses the `torch.tensor` factory instead of the legacy `torch.Tensor`
    constructor. For list input the result is the same, but the legacy
    constructor treats a bare int as a *size* and returns uninitialized
    memory, whereas the factory always copies the given data.

    Args:
        x: numeric data for a single record, typically a list of ints.

    Returns:
        A float32 tensor holding the values of `x`.
    """
    return torch.tensor(x, dtype=torch.float32)

def df2tensor(df_x):
    """Run `record2tensor` over `df_x` through its `.apply` method and return the result."""
    tensors = df_x.apply(record2tensor)
    return tensors

In [11]:
# Preview the training posts as a padded id matrix: one row per post,
# with shorter posts filled up with 0.
pd.DataFrame(
    pad_sequence(
        df2tensor(tokenize(x_train)),
        batch_first=True,
        padding_value=0,
    )
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3569,3570,3571,3572,3573,3574,3575,3576,3577,3578
0,377.0,4958.0,4943.0,202.0,390.0,2549.0,88.0,81.0,959.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,377.0,3895.0,234.0,20.0,1771.0,4945.0,1139.0,45.0,4946.0,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,377.0,3464.0,4.0,948.0,2287.0,246.0,263.0,3116.0,326.0,4620.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,377.0,4422.0,4969.0,4945.0,499.0,88.0,127.0,633.0,4946.0,4921.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1113.0,4945.0,327.0,8.0,1086.0,4946.0,4921.0,4930.0,8.0,137.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5200,377.0,2806.0,1564.0,88.0,21.0,1324.0,4930.0,2466.0,295.0,835.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5201,377.0,1464.0,2610.0,511.0,4945.0,353.0,108.0,685.0,4930.0,175.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5202,377.0,1620.0,4943.0,328.0,470.0,4962.0,417.0,144.0,46.0,263.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5203,715.0,63.0,4.0,2240.0,4926.0,277.0,433.0,4962.0,8.0,4946.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### label encoding

In [30]:
def to_int(x):
    """Map an MBTI type string to its integer class index (0-15).

    The index is the position of the label in `types`, so the mapping is the
    same as before: 'INTJ' -> 0, 'INTP' -> 1, ..., 'ESFP' -> 15.

    Args:
        x: one of the sixteen upper-case MBTI type strings.

    Returns:
        The integer class index of `x`.

    Raises:
        ValueError: if `x` is not one of the sixteen known types. Previously
            an unknown label surfaced as an UnboundLocalError on `result`.
    """
    types = ('INTJ', 'INTP', 'ENTJ', 'ENTP', 'INFJ', 'INFP', 'ENFJ', 'ENFP',
             'ISTJ', 'ISFJ', 'ESTJ', 'ESFJ', 'ISTP', 'ISFP', 'ESTP', 'ESFP')
    try:
        return types.index(x)
    except ValueError:
        raise ValueError(f"unknown MBTI type: {x!r}") from None

def label_encoding(df_y):
    """Run `to_int` over `df_y` through its `.apply` method and return the result."""
    encoded_labels = df_y.apply(to_int)
    return encoded_labels

### preprocessing function

In [42]:
def preprocessing(df_x, df_y):
    """Turn one split's texts and labels into tensors.

    Args:
        df_x: texts of the split; expected to be a pandas Series of strings,
            as produced by the train/val/test split.
        df_y: MBTI type labels aligned with `df_x`.

    Returns:
        (tensor_x, tensor_y): an int32 matrix of token ids with one row per
        text, padded with 0 up to the longest text in this call, and an int64
        vector of class indices.

    Note:
        The padded width is derived from the data passed in, so separate
        calls can return matrices of different widths (the recorded run shows
        3579 columns for train and 3128 for validation).
    """
    # x: pad_sequence is documented to take a list of tensors, so materialize
    # the per-row tensors into a list instead of handing it a pandas Series.
    sequences = list(df2tensor(tokenize(df_x)))
    encoded = pad_sequence(sequences, batch_first=True, padding_value=0)
    tensor_x = encoded.type(torch.int32)
    print(tensor_x.size())
    print(tensor_x.type())

    # y: build the class indices directly as int64 rather than going through
    # a float tensor and casting afterwards.
    y = label_encoding(df_y)
    tensor_y = torch.tensor(y.tolist(), dtype=torch.long)
    print(tensor_y.size())
    print(tensor_y.type())

    return tensor_x, tensor_y
    
    

### Apply preprocessing (train / validation)

In [43]:
# Convert the train and validation splits to tensors.
# NOTE(review): this rebinds x_train / y_train / x_val / y_val from the split
# outputs to tensors, so re-running this cell presumably requires re-running
# the split cell first — confirm.
x_train, y_train = preprocessing(df_x=x_train, df_y=y_train)
x_val, y_val = preprocessing(df_x=x_val, df_y=y_val)

torch.Size([5205, 3579])
torch.IntTensor
torch.Size([5205])
torch.LongTensor
torch.Size([1735, 3128])
torch.IntTensor
torch.Size([1735])
torch.LongTensor


# Custom Dataset

In [13]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each input row with its label.

    Args:
        x: indexable collection of inputs (e.g. a 2-D tensor of padded ids).
        y: indexable collection of labels aligned with `x`.
    """

    def __init__(self, x, y):
        self.x_data = x
        self.y_data = y

    def __len__(self):
        """Return the number of samples (length of `x` along its first axis)."""
        # Was `self.x_data.shpae(0)`: a typo for `shape` that raised
        # AttributeError whenever len() was taken of the dataset.
        return len(self.x_data)

    def __getitem__(self, idx):
        """Return the `(input, label)` pair stored at position `idx`."""
        return self.x_data[idx], self.y_data[idx]

In [None]:
# Wrap each split's (inputs, labels) pair in a dataset object.
train_dataset = CustomDataset(x=x_train, y=y_train)
val_dataset = CustomDataset(x=x_val, y=y_val)

# DataLoader

In [15]:
# Number of samples per mini-batch; used by both the train and validation loaders.
batch_size = 100

In [None]:
# Training batches are reshuffled every epoch. With shuffle=False and
# drop_last=True the batches were identical in every epoch and the same
# trailing samples were never trained on.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Validation is iterated in a fixed order.
# NOTE(review): drop_last=True also discards the final partial validation
# batch, so those samples are never evaluated — confirm this is intended.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)