import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
# Stop-word filtering is currently disabled (empty set). To enable it, use
# set(stopwords.words('english')) instead.
STOPWORDS = set()


def clean_text(text):
    """Normalize a raw message into space-separated lowercase words.

    The text is lowercased, every character other than ``a-z`` and
    whitespace is removed, words found in ``STOPWORDS`` are dropped, and the
    remaining words are joined with single spaces.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    # Digits and punctuation are deleted outright, not replaced by a space,
    # so "don't" becomes "dont".
    text = re.sub(r'[^a-z\s]', '', text)
    # split() with no argument also collapses runs of whitespace.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)
def tokenize(text, word_to_idx):
    """Convert a cleaned message into a list of vocabulary indices.

    Args:
        text: A whitespace-separated string of words.
        word_to_idx: Mapping from word to its integer index.

    Returns:
        A list with the index of each word of ``text``, in order. Words that
        are not present in ``word_to_idx`` are skipped.
    """
    # NOTE(review): skipping out-of-vocabulary words is a choice made here;
    # confirm it matches what downstream code expects.
    return [word_to_idx[word] for word in text.split() if word in word_to_idx]
def pad_and_truncate(messages, max_length=30):
    """Pack variable-length token sequences into a fixed-width integer matrix.

    Each sequence is cut to its first ``max_length`` items and right-aligned
    in its row; the unused leading positions stay 0.

    Args:
        messages: A sized iterable of token sequences (each supporting
            ``len`` and slicing).
        max_length: Number of columns of the result.

    Returns:
        An integer ``numpy`` array of shape ``(len(messages), max_length)``.
    """
    features = np.zeros((len(messages), max_length), dtype=int)
    for row, sms in enumerate(messages):
        kept = sms[:max_length]
        # Guard against empty sequences: a slice of ``-0:`` would select the
        # whole row instead of nothing.
        if len(kept):
            features[row, -len(kept):] = kept
    return features
if __name__ == '__main__':
    # Load the tab-separated file: first column is the label, second the
    # message text.
    data = pd.read_csv('./data/SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'sms'])
    data.sms = data.sms.apply(clean_text)

    # Build the vocabulary from every word seen after cleaning. Indices
    # start at 1 so that 0 remains free for the padding written by
    # pad_and_truncate. The words are sorted because set iteration order for
    # strings can differ between interpreter runs, which would make the
    # saved indices irreproducible.
    words = set((' '.join(data.sms)).split())
    word_to_idx = {word: i for i, word in enumerate(sorted(words), 1)}

    tokens = data.sms.apply(lambda x: tokenize(x, word_to_idx))
    inputs = pad_and_truncate(tokens)
    # 1 for rows labelled 'spam', 0 for everything else.
    labels = np.array((data.label == 'spam').astype(int))

    np.save('./data/labels.npy', labels)
    np.save('./data/inputs.npy', inputs)
