In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
import os
import glob
import re
import email

In [9]:
def cleaned(content):
    # First remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", content)
    # Then remove html comments. 
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"^$", "", cleaned)
    cleaned = re.sub("''|,", "", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", cleaned)
    cleaned = re.sub(r"\'s", " \'s", cleaned)
    cleaned = re.sub(r"\'ve", " \'ve", cleaned)
    cleaned = re.sub(r"n\'t", " n\'t", cleaned)
    cleaned = re.sub(r"\'re", " \'re", cleaned)
    cleaned = re.sub(r"\'d", " \'d", cleaned)
    cleaned = re.sub(r"\'ll", " \'ll", cleaned)
    cleaned = re.sub(r",", " , ", cleaned)
    cleaned = re.sub(r"!", " ! ", cleaned)
    cleaned = re.sub(r"\(", " \( ", cleaned)
    cleaned = re.sub(r"\)", " \) ", cleaned)
    cleaned = re.sub(r"\?", " \? ", cleaned)
    cleaned = re.sub(r"\s{2,}", " ", cleaned)
    cleaned = re.sub(r"\d+", "", cleaned)
    cleaned = re.sub(r"[\r\n]+", " ", cleaned)
    return cleaned.strip().lower()

    
def proccess_message(text):
    msg = email.message_from_string(text)
    '''To get the content from email objects'''
    parts = []
    for part in msg.walk():
        if part.get_content_type() == 'text/plain':
            parts.append( part.get_payload() )
    content = ''.join(parts)
    return cleaned(content)

def load_from_path(path):
    data = []
    def proccess_dir(glob_re_path, label):
        res = []
        for file_path in glob.glob(glob_re_path):
            with open(file_path, 'r') as f:
                res.append([label, proccess_message(f.read())])
        return res
    data += proccess_dir(path+'/ham/*.txt', 0)
    data += proccess_dir(path+'/spam/*.txt', 1)
    return data

class BagOfWords(object):
    def __init__(self, tokenize=None, normalize=True):
        self._vocab = dict(UNK=0)
        self.normalize = normalize
        if tokenize == None:
            tokenize = lambda w: w.split(' ')
        self._tokenize = tokenize
    
    def fit(self, X, y=None):
        self._vocab.update(dict([(v, k + 1) for k, v in enumerate(set([word for d in X for word in self._tokenize(d)]))]))
        print("fitted")
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)
    
    def transform(self, X):
        data = []
        for d in X:
            res = np.zeros(len(self._vocab), dtype=np.float32)
            for word in self._tokenize(d):
                res[self.__get_word_index(word)] += 1
            if self.normalize:
                res = np.log(res + 1)
            
            data.append(res)
        return np.array(data)
    
    def __get_word_index(self, word):
        return self._vocab.get(word, 0)
    

In [3]:
#df = pd.DataFrame(load_from_path('./raw'), columns=['label', 'message'])
data = load_from_path('./raw')

In [10]:
# words = ["""some word that would be splitted and it happened for me every time""".split(" "),
#          """some other thing that should be proccessed""".split(" ")]
# dict([(v, k+1) for k, v in enumerate(set([w for d in words for w in d]))])
df = pd.DataFrame(data, columns=['label', 'content'])
bow = BagOfWords()
x = bow.fit_transform(df['content'])


fitted


In [12]:
print 'samples shape', x.nbytes
print 'vocab len', len(bow._vocab)

samples shape 18860717664
vocab len 139908
