Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
42 lines (31 sloc) 1.21 KB
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
# Words dropped by clean_text. Currently empty, so no word is filtered out.
# The original note shows the NLTK alternative:
#     set(stopwords.words('english'))
STOPWORDS = set()


def clean_text(text):
    """Normalize a raw message into lowercase, space-separated words.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is removed (digits, punctuation and non-ASCII letters are
    deleted, not replaced by a space), words found in ``STOPWORDS`` are
    dropped, and the remaining words are joined with single spaces.

    Args:
        text: The message to clean, as a string.

    Returns:
        The cleaned string; empty if no word survives.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    # split() with no argument collapses runs of whitespace, so the result
    # never has leading, trailing or doubled spaces.
    kept = filter(lambda candidate: candidate not in STOPWORDS,
                  letters_only.split())
    return ' '.join(kept)
def tokenize(text, word_to_idx):
    """Translate the words of ``text`` into their integer ids.

    Args:
        text: String to encode; it is split on whitespace.
        word_to_idx: Mapping from each word to its id, indexed with ``[]``.

    Returns:
        A list with one id per word, in the order the words appear. Empty
        or whitespace-only text gives an empty list.

    Raises:
        KeyError: If ``word_to_idx`` is a dict and a word is missing from it.
    """
    return [word_to_idx[token] for token in text.split()]
def pad_and_truncate(messages, max_length=30):
    """Pack variable-length id sequences into a fixed-width integer matrix.

    Each sequence fills one row. Sequences shorter than ``max_length`` are
    right-aligned, with zeros filling the left side. Longer sequences keep
    only their first ``max_length`` items. Empty sequences give an all-zero
    row.

    Args:
        messages: Sized iterable of sequences (each supporting ``len`` and
            slicing), one per message.
        max_length: Number of columns in the result.

    Returns:
        A NumPy integer array of shape ``(len(messages), max_length)``.
    """
    features = np.zeros((len(messages), max_length), dtype=int)
    for row, sms in enumerate(messages):
        if not len(sms):
            # Nothing to copy; the row stays all zeros.
            continue
        clipped = sms[:max_length]
        # Start column that right-aligns the clipped sequence in the row.
        start = max_length - len(clipped)
        features[row, start:] = clipped
    return features
if __name__ == '__main__':
    # Script entry point: turn the raw SMS spam collection into two NumPy
    # files, ./data/inputs.npy (padded word-id matrix) and ./data/labels.npy
    # (1 for 'spam', 0 for any other label).

    # The file is tab-separated with no header row: label, then message text.
    data = pd.read_csv('./data/SMSSpamCollection.txt', sep='\t', header=None, names=['label', 'sms'])
    data.sms = data.sms.apply(clean_text)

    # Build the vocabulary from every cleaned message. It is sorted before
    # ids are assigned: iterating a set of strings directly gives an order
    # that changes between interpreter runs (string hash randomization), so
    # the ids written to inputs.npy would differ on every run.
    words = sorted(set(' '.join(data.sms).split()))
    # Ids start at 1, so 0 is used only by the zero padding that
    # pad_and_truncate writes.
    word_to_idx = {word: i for i, word in enumerate(words, 1)}

    tokens = data.sms.apply(lambda x: tokenize(x, word_to_idx))
    inputs = pad_and_truncate(tokens)
    labels = np.array((data.label == 'spam').astype(int))

    np.save('./data/labels.npy', labels)
    np.save('./data/inputs.npy', inputs)
You can’t perform that action at this time.