In [53]:
import os
import re
import io
import requests
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from zipfile import ZipFile

data_dir='./data'
data_file='text_data.txt'
seed=0

if not os.path.exists(data_dir):
    os.makedirs(data_dir)

if not os.path.isfile(os.path.join(data_dir, data_file)):
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    z = ZipFile(io.BytesIO(r.content))
    file = z.read('SMSSpamCollection')
    text_data = file.decode()
    text_data = text_data.encode('ascii', errors='ignore')
    text_data = text_data.decode().split('\n')

    with open(os.path.join(data_dir, data_file), 'w') as file_conn:
        for text in text_data:
            file_conn.write("{}\n".format(text))
else:
    text_data = []
    with open(os.path.join(data_dir, data_file), 'r') as file_conn:
        for row in file_conn:
            text_data.append(row)
    text_data = text_data[:-1]

text_data = [x.split('\t') for x in text_data if len(x) >= 1]
[text_data_target, text_data_train] = [list(x) for x in zip(*text_data)]

In [54]:
text_string = text_data_train[:10]

In [55]:
text_string

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n',
 'Ok lar... Joking wif u oni...\n',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n",
 'U dun say so early hor... U c already then say...\n',
 "Nah I don't think he goes to usf, he lives around here though\n",
 "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, 1.50 to rcv\n",
 'Even my brother is not like to speak with me. They treat me like aids patent.\n',
 "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\n",
 'WINNER!! As a valued network customer you have been selected to receivea 900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours onl

In [71]:
'''
\s : 스페이스, 탭, 폼피드, 줄 바꿈 문자 등을 포함한 하나의 공백 문자 
\w : 밑줄 문자를 포함한 영숫자 문자 
[0-9] : 숫자
'''
for i in range(10):
    text_string[i] = re.sub(r'([^\s \w]|_|[0-9])+', '', text_string[i])

In [72]:
text_string

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat\n',
 'ok lar joking wif u oni\n',
 'free entry in  a wkly comp to win fa cup final tkts st may  text fa to  to receive entry questionstd txt ratetcs apply overs\n',
 'u dun say so early hor u c already then say\n',
 'nah i dont think he goes to usf he lives around here though\n',
 'freemsg hey there darling its been  weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send  to rcv\n',
 'even my brother is not like to speak with me they treat me like aids patent\n',
 'as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press  to copy your friends callertune\n',
 'winner as a valued network customer you have been selected to receivea  prize reward to claim call  claim code kl valid  hours only\n',
 'had your mobile  months or more u r entitled to update to the latest colour mobiles with ca

In [73]:
for i in range(10):
    text_string[i] = text_string[i].lower()

In [74]:
text_string

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat\n',
 'ok lar joking wif u oni\n',
 'free entry in  a wkly comp to win fa cup final tkts st may  text fa to  to receive entry questionstd txt ratetcs apply overs\n',
 'u dun say so early hor u c already then say\n',
 'nah i dont think he goes to usf he lives around here though\n',
 'freemsg hey there darling its been  weeks now and no word back id like some fun you up for it still tb ok xxx std chgs to send  to rcv\n',
 'even my brother is not like to speak with me they treat me like aids patent\n',
 'as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press  to copy your friends callertune\n',
 'winner as a valued network customer you have been selected to receivea  prize reward to claim call  claim code kl valid  hours only\n',
 'had your mobile  months or more u r entitled to update to the latest colour mobiles with ca

In [75]:
max_sequence_length = 10
min_word_frequency = 10
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
    max_sequence_length, min_frequency=min_word_frequency)

In [76]:
text_processed = np.array(list(vocab_processor.fit_transform(text_data_train)))
text_processed = np.array(text_processed)

In [77]:
np.shape(text_processed)

(5574, 10)

In [78]:
text_processed

array([[824, 518,   0, ...,   7,   0,  81],
       [ 80, 365,   0, ...,   0,   0,   0],
       [227, 527,   7, ...,   1, 351,   0],
       ...,
       [  0,  57,   7, ..., 107, 278,   0],
       [129, 571, 196, ...,   0,  52,   0],
       [  0, 168, 635, ...,   0,   0,   0]], dtype=int64)