In [1]:
from data_process import tokenizer_process, build_vocab, sentence_to_onehot, cal_idf, sentence_to_tfidf, sentence_to_index
from data_process import pos_process, build_vocab_pos, sentence_to_onehot_pos, cal_idf_pos, sentence_to_tfidf_pos, sentence_to_index_pos
from data_process import morphs_process, build_vocab_morphs, sentence_to_onehot_morphs, cal_idf_morphs, sentence_to_tfidf_morphs, sentence_to_index_morphs
from pprint import pprint

# Data processing with a simple tokenizer

In [2]:
# Toy corpus: five short Korean sentences used as input by the processing cells.
data = [
    '나는 생각한다 고로 나는 존재한다.',
    '모든 국가는 그에 걸맞는 국가를 가진다.',
    '이것 또한 지나가리라',
    '죄는 미워하되 사람은 미워하지 마라.',
    '일찍 일어나는 새가 벌레를 잡는다',
]

In [3]:
# Show the per-sentence token lists from the simple tokenizer. In the output
# below, words keep their surface form and a trailing '.' is its own token.
pprint(tokenizer_process(data))

[['나는', '생각한다', '고로', '나는', '존재한다', '.'],
 ['모든', '국가는', '그에', '걸맞는', '국가를', '가진다', '.'],
 ['이것', '또한', '지나가리라'],
 ['죄는', '미워하되', '사람은', '미워하지', '마라', '.'],
 ['일찍', '일어나는', '새가', '벌레를', '잡는다']]


In [4]:
# Build the vocabulary for the simple-tokenizer section. The call result is
# unpacked into three names; the middle value is discarded.
# NOTE(review): in IPython `_` normally holds the last cell output; binding it
# here shadows that variable — consider a named throwaway instead.
# vocab_size is presumably the number of vocabulary entries — TODO confirm.
vocab, _, vocab_size = build_vocab(data)

In [5]:
# List the vocabulary keys. The output starts with the special '<PAD>' and
# '<UNK>' entries, followed by the tokens seen in the corpus.
pprint(vocab.keys())

dict_keys(['<PAD>', '<UNK>', '.', '나는', '생각한다', '고로', '존재한다', '모든', '국가는', '그에', '걸맞는', '국가를', '가진다', '이것', '또한', '지나가리라', '죄는', '미워하되', '사람은', '미워하지', '마라', '일찍', '일어나는', '새가', '벌레를', '잡는다'])


In [6]:
# Print a 0/1 vector per sentence over the vocabulary (one row per sentence,
# one column per vocabulary key). In the output a repeated token such as
# '나는' in the first sentence still yields 1, i.e. presence rather than count.
pprint(sentence_to_onehot(data, vocab))

array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
        0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1]])


In [7]:
# Compute the IDF values for this section; the result is passed to
# sentence_to_tfidf as its third argument.
# Presumably one inverse-document-frequency weight per vocabulary entry —
# TODO confirm against data_process.
IDF = cal_idf(data, vocab)

In [8]:
# Print the TF-IDF encoding of the corpus using the IDF values computed above;
# the output is a float array with one row per sentence.
pprint(sentence_to_tfidf(data, vocab, IDF))

array([[0.        , 0.        , 0.16735766, 0.91629073, 0.68721805,
        0.68721805, 0.68721805, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.22314355, 0.        , 0.        ,
        0.        , 0.        , 0.91629073, 0.91629073, 0.91629073,
        0.91629073, 0.91629073, 0.91629073, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.91629073, 0.91629073,
        0.91629073, 0.        , 0.        , 0.        , 0.        ,
      

In [9]:
# Convert every sentence to a list of vocabulary indices. In the output all
# rows have 7 entries (the longest sentence here); shorter sentences are
# right-padded with 0.
pprint(sentence_to_index(data, vocab))

[[3, 4, 5, 3, 6, 2, 0],
 [7, 8, 9, 10, 11, 12, 2],
 [13, 14, 15, 0, 0, 0, 0],
 [16, 17, 18, 19, 20, 2, 0],
 [21, 22, 23, 24, 25, 0, 0]]


In [10]:
# Same conversion with an explicit third argument of 10: every row in the
# output is right-padded with 0 to 10 entries.
pprint(sentence_to_index(data, vocab, 10))

[[3, 4, 5, 3, 6, 2, 0, 0, 0, 0],
 [7, 8, 9, 10, 11, 12, 2, 0, 0, 0],
 [13, 14, 15, 0, 0, 0, 0, 0, 0, 0],
 [16, 17, 18, 19, 20, 2, 0, 0, 0, 0],
 [21, 22, 23, 24, 25, 0, 0, 0, 0, 0]]


# Data processing with the Twitter POS (part-of-speech) extractor

In [11]:
# Show the per-sentence token lists from the POS-based processing. In the
# output below, particles and the final '.' are absent and verbs appear in a
# dictionary-style form (e.g. '하다', '가지다').
pprint(pos_process(data))

[['나', '생각', '하다', '고로', '나', '존재', '하다'],
 ['모든', '국가', '그', '걸', '맞다', '국가', '가지다'],
 ['것', '또한', '지나가다'],
 ['죄', '미워하다', '사람', '미워하다', '마르다'],
 ['일찍', '일어나다', '새', '벌레', '잡다']]


In [12]:
# Build the vocabulary for the POS section. This rebinds `vocab` and
# `vocab_size`, replacing the values from the simple-tokenizer section, so the
# earlier cells must be re-run before their vocabulary can be used again.
# NOTE(review): binding `_` shadows IPython's last-output variable — consider
# a named throwaway instead.
vocab, _, vocab_size = build_vocab_pos(data)

In [13]:
# List the POS vocabulary keys. After '<PAD>' and '<UNK>', the output lists
# the tokens that occur more than once ('나', '하다', '국가', '미워하다') first —
# apparently a frequency ordering; TODO confirm in data_process.
pprint(vocab.keys())

dict_keys(['<PAD>', '<UNK>', '나', '하다', '국가', '미워하다', '생각', '고로', '존재', '모든', '그', '걸', '맞다', '가지다', '것', '또한', '지나가다', '죄', '사람', '마르다', '일찍', '일어나다', '새', '벌레', '잡다'])


In [14]:
# Print a 0/1 vector per sentence over the POS vocabulary (one row per
# sentence, one column per vocabulary key).
pprint(sentence_to_onehot_pos(data, vocab))

array([[0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 1]])


In [15]:
# Compute the IDF values for the POS vocabulary. This rebinds `IDF`,
# replacing the value computed in the simple-tokenizer section.
IDF = cal_idf_pos(data, vocab)

In [16]:
# Print the TF-IDF encoding of the corpus over the POS vocabulary, using the
# IDF values computed above; the output is a float array, one row per sentence.
pprint(sentence_to_tfidf_pos(data, vocab, IDF))

array([[0.        , 0.        , 0.91629073, 0.91629073, 0.        ,
        0.        , 0.68721805, 0.68721805, 0.68721805, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.91629073,
        0.        , 0.        , 0.        , 0.        , 0.68721805,
        0.68721805, 0.68721805, 0.68721805, 0.68721805, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.91629073,
        0.91629073, 0.91629073, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.

In [17]:
# Convert every sentence to a list of POS-vocabulary indices. In the output
# all rows have 7 entries (the longest sentence here), right-padded with 0.
pprint(sentence_to_index_pos(data, vocab))

[[2, 6, 3, 7, 2, 8, 3],
 [9, 4, 10, 11, 12, 4, 13],
 [14, 15, 16, 0, 0, 0, 0],
 [17, 5, 18, 5, 19, 0, 0],
 [20, 21, 22, 23, 24, 0, 0]]


In [18]:
# Same conversion with an explicit third argument of 10: every row in the
# output is right-padded with 0 to 10 entries.
pprint(sentence_to_index_pos(data, vocab, 10))

[[2, 6, 3, 7, 2, 8, 3, 0, 0, 0],
 [9, 4, 10, 11, 12, 4, 13, 0, 0, 0],
 [14, 15, 16, 0, 0, 0, 0, 0, 0, 0],
 [17, 5, 18, 5, 19, 0, 0, 0, 0, 0],
 [20, 21, 22, 23, 24, 0, 0, 0, 0, 0]]


# Data processing with the Twitter morphs extractor

In [19]:
# Show the per-sentence token lists from the morphs-based processing. In the
# output below, particles and endings ('는', '를', '다') and the final '.'
# appear as separate tokens.
pprint(morphs_process(data))

[['나', '는', '생각한', '다', '고로', '나', '는', '존재한', '다', '.'],
 ['모든', '국가', '는', '그', '에', '걸', '맞는', '국가', '를', '가진', '다', '.'],
 ['이', '것', '또한', '지나가', '리라'],
 ['죄', '는', '미워하', '되', '사람', '은', '미워하지', '마라', '.'],
 ['일찍', '일어나는', '새', '가', '벌레', '를', '잡는', '다']]


In [20]:
# Build the vocabulary for the morphs section. This rebinds `vocab` and
# `vocab_size`, replacing the values from the POS section, so the earlier
# cells must be re-run before their vocabulary can be used again.
# NOTE(review): binding `_` shadows IPython's last-output variable — consider
# a named throwaway instead.
vocab, _, vocab_size = build_vocab_morphs(data)

In [21]:
# List the morphs vocabulary keys. The output starts with '<PAD>' and '<UNK>',
# followed by the morphemes seen in the corpus.
pprint(vocab.keys())

dict_keys(['<PAD>', '<UNK>', '는', '다', '.', '나', '국가', '를', '생각한', '고로', '존재한', '모든', '그', '에', '걸', '맞는', '가진', '이', '것', '또한', '지나가', '리라', '죄', '미워하', '되', '사람', '은', '미워하지', '마라', '일찍', '일어나는', '새', '가', '벌레', '잡는'])


In [22]:
# Print a 0/1 vector per sentence over the morphs vocabulary (one row per
# sentence, one column per vocabulary key).
pprint(sentence_to_onehot_morphs(data, vocab))

array([[0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])


In [23]:
# Compute the IDF values for the morphs vocabulary. This rebinds `IDF`,
# replacing the value computed in the POS section.
IDF = cal_idf_morphs(data, vocab)

In [24]:
# Print the TF-IDF encoding of the corpus over the morphs vocabulary, using
# the IDF values computed above; the output is a float array, one row per
# sentence.
pprint(sentence_to_tfidf_morphs(data, vocab, IDF))

array([[0.        , 0.        , 0.22314355, 0.22314355, 0.16735766,
        0.91629073, 0.        , 0.        , 0.68721805, 0.68721805,
        0.68721805, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.16735766, 0.16735766, 0.16735766,
        0.        , 0.91629073, 0.38311922, 0.        , 0.        ,
        0.        , 0.68721805, 0.68721805, 0.68721805, 0.68721805,
        0.68721805, 0.68721805, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.

In [25]:
# Convert every sentence to a list of morphs-vocabulary indices. In the output
# all rows have 12 entries (the longest sentence here), right-padded with 0.
pprint(sentence_to_index_morphs(data, vocab))

[[5, 2, 8, 3, 9, 5, 2, 10, 3, 4, 0, 0],
 [11, 6, 2, 12, 13, 14, 15, 6, 7, 16, 3, 4],
 [17, 18, 19, 20, 21, 0, 0, 0, 0, 0, 0, 0],
 [22, 2, 23, 24, 25, 26, 27, 28, 4, 0, 0, 0],
 [29, 30, 31, 32, 33, 7, 34, 3, 0, 0, 0, 0]]


In [26]:
# Same conversion with an explicit third argument of 10. In the output every
# row has 10 entries: the 12-token second sentence loses its last two ids,
# while shorter sentences are right-padded with 0.
pprint(sentence_to_index_morphs(data, vocab, 10))

[[5, 2, 8, 3, 9, 5, 2, 10, 3, 4],
 [11, 6, 2, 12, 13, 14, 15, 6, 7, 16],
 [17, 18, 19, 20, 21, 0, 0, 0, 0, 0],
 [22, 2, 23, 24, 25, 26, 27, 28, 4, 0],
 [29, 30, 31, 32, 33, 7, 34, 3, 0, 0]]
