In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw MBTI CSV files. Neither file has a header row, so pandas
# assigns integer column labels starting at 0.
train_data = pd.read_csv(
    './data/MBTI_train.csv',
    encoding = 'ISO 8859-1',
    header = None,
)
test_data = pd.read_csv(
    './data/MBTI_test.csv',
    encoding = 'ISO 8859-1',
    header = None,
)

In [3]:
from keras.preprocessing.text import Tokenizer

# Restrict the vocabulary to the most frequent words (num_words = 1000).
tokenizer = Tokenizer(num_words = 1000)

# Build the word index from the training texts only (column 1 of the training frame).
tokenizer.fit_on_texts(texts = train_data.iloc[:, 1])

# Convert each text into a list of integer word indices.
# The training frame keeps its texts in column 1, the test frame in column 0.
train_sequences = tokenizer.texts_to_sequences(texts = train_data.iloc[:, 1])
test_sequences = tokenizer.texts_to_sequences(texts = test_data.iloc[:, 0])

In [4]:
from keras.layers import Embedding
from keras import preprocessing

In [5]:
# Common sequence length handed to pad_sequences and to the Embedding layer.
maxlen = 3000

# Bring the train and test index sequences to the common length `maxlen`.
X_train = preprocessing.sequence.pad_sequences(
    sequences = train_sequences,
    maxlen = maxlen,
)
X_test = preprocessing.sequence.pad_sequences(
    sequences = test_sequences,
    maxlen = maxlen,
)

In [6]:
# Column 0 of the training frame holds the MBTI type; replace each of the
# 16 type strings with its integer class id (0-15).
y_train = train_data.iloc[:, 0].replace({
    'INTP': 0,  'INFJ': 1,  'INTJ': 2,  'ENTJ': 3,
    'ENTP': 4,  'INFP': 5,  'ISTP': 6,  'ISFJ': 7,
    'ENFP': 8,  'ISFP': 9,  'ISTJ': 10, 'ENFJ': 11,
    'ESTP': 12, 'ESFP': 13, 'ESTJ': 14, 'ESFJ': 15,
})

In [7]:
def to_one_hot(labels, dimension = 16):
    """One-hot encode a sequence of class indices.

    Parameters
    ----------
    labels : sized iterable
        Class indices, one per sample. Each value is used directly as a
        column index into the output row.
    dimension : int, default 16
        Number of columns (classes) in the encoded matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(labels), dimension)`` that is zero
        everywhere except for a 1.0 at ``[i, labels[i]]`` in each row.
    """
    encoded = np.zeros(shape = (len(labels), dimension))
    for row, class_index in enumerate(labels):
        # Switch on the column that corresponds to this sample's class.
        encoded[row, class_index] = 1.0
    return encoded

# One-hot encode the integer class ids; with the default dimension this yields
# a float matrix with one row per training sample and 16 columns.
y_train_label = to_one_hot(y_train)

In [8]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

# Embedding -> Flatten -> Dense classifier over the 16 MBTI classes.
final_model = Sequential()

# 1000 token ids, each mapped to an 8-dimensional vector, for inputs of length maxlen.
final_model.add(Embedding(1000, 8, input_length = maxlen))

# Flatten to a 2D tensor of shape (samples, maxlen * 8).
final_model.add(Flatten())

# Each sample belongs to exactly one of 16 classes (one-hot targets), so the
# output layer uses softmax: the 16 outputs form a probability distribution
# that sums to 1, which is what categorical_crossentropy expects. The previous
# sigmoid activation scored every class independently.
final_model.add(Dense(16, activation = 'softmax'))
final_model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [9]:
# Train for 4 epochs in mini-batches of 32; validation_split = 0.2 sets aside
# a fraction of the training data for validation.
history = final_model.fit(
    X_train,
    y_train_label,
    batch_size = 32,
    epochs = 4,
    validation_split = 0.2,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [10]:
from keras import models

# Collect the output tensor of every layer in the trained model, in layer order.
layer_outputs = [each_layer.output for each_layer in final_model.layers]

# A model that shares final_model's input but returns all layer outputs at once,
# so intermediate activations can be read from a single predict call.
activation_model = models.Model(
    inputs = final_model.input,
    outputs = layer_outputs,
)

In [11]:
# Run the padded training sequences through the activation model, which was
# built with a list of layer outputs; the result is indexed per layer later on.
train_activations = activation_model.predict(X_train)

In [12]:
# Same as for the training data: per-layer activations for the padded test sequences.
test_activations = activation_model.predict(X_test)

In [15]:
# Index 1 picks the output of the second layer of final_model (the Flatten
# layer), i.e. the flattened embedding features of each sample.
X_train_embedding_origin = pd.DataFrame(data = train_activations[1])
test_data_embedding_origin = pd.DataFrame(data = test_activations[1])

In [16]:
# Put the label column (column 0 of the training frame) in front of the
# flattened embedding features, side by side.
# NOTE(review): both inputs appear to carry integer column labels starting at 0,
# so the result likely has two columns labelled 0 - confirm downstream readers expect that.
train_data_embedding_origin = pd.concat(
    objs = [train_data.iloc[:, 0], X_train_embedding_origin],
    axis = 'columns',
)

In [None]:
import os

# Create the output directory up front so the writes below cannot fail with an
# OSError (after the expensive training/prediction cells) just because
# ./preproc does not exist yet.
os.makedirs('./preproc', exist_ok = True)

# index = False keeps the row index out of the CSV files.
train_data_embedding_origin.to_csv('./preproc/train_data_embedding_origin.csv', index = False)
test_data_embedding_origin.to_csv('./preproc/test_data_embedding_origin.csv', index = False)

**주성분 분석**

In [None]:
# Imported in the cell of first use, like the other imports in this notebook,
# so the cell also runs on a fresh kernel.
from sklearn.decomposition import PCA

# Fit a 150-component PCA on the flattened embedding features of the training set.
pca = PCA(n_components=150)
pca.fit(X_train_embedding_origin)

# Total fraction of variance explained by the 150 retained components.
print(pca.explained_variance_ratio_.sum())