In [1]:
import re
import numpy as np
import pandas as pd
import sklearn
import joblib

In [2]:
# Load the raw email dataset. Later cells read its `text` column (the corpus)
# and its `spam` column (used to stratify the splits).
df = pd.read_csv(filepath_or_buffer='../data/emails.csv')

In [3]:
def preprocess(text):
    """Normalise raw text before it is tokenised.

    Three steps are applied in order:

    1. The text is lower-cased.
    2. Every numeric literal (optionally signed, with an optional decimal
       part, e.g. ``42``, ``-3.5``, ``.75``) is replaced by the word
       ``number``.
    3. Every run of one or more ``$`` characters is replaced by the word
       ``dollar``.

    The replacement words are inserted without surrounding whitespace, so
    ``"$100"`` becomes ``"dollarnumber"``.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    text = text.lower()
    # Patterns are raw strings: in a plain literal, `\d` and `\.` are invalid
    # escape sequences, which Python reports as a DeprecationWarning /
    # SyntaxWarning depending on the version.
    text = re.sub(r'[+-]?((\d+\.?\d*)|(\.\d+))', 'number', text)
    text = re.sub(r'[$]+', 'dollar', text)
    return text


In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

In [5]:
def tokenize_and_stem(text):
    """Split ``text`` into word tokens and return their English Snowball stems.

    Tokens shorter than two characters are discarded before stemming.

    Args:
        text: The string to tokenise.

    Returns:
        A list with the stem of every kept token, in order of appearance.
    """
    word_splitter = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer(language='english')

    return [
        stemmer.stem(token)
        for token in word_splitter.tokenize(text)
        if len(token) >= 2  # skip single-character tokens
    ]
    

In [6]:
def count_vec_analyzer(data):
    """Turn one document into its list of stemmed tokens.

    The document is first normalised with ``preprocess`` and the result is
    passed to ``tokenize_and_stem``.

    Args:
        data: The raw document text.

    Returns:
        The list of stemmed tokens produced by ``tokenize_and_stem``.
    """
    return tokenize_and_stem(preprocess(data))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
# The documents to vectorise: the `text` column of the loaded dataset.
corpus = df['text']

# Token counts (using the custom analyzer defined above) followed by TF-IDF
# re-weighting of those counts.
pipeline = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(analyzer=count_vec_analyzer)),
        ('tfidf', TfidfTransformer()),
    ]
)

# Fit both steps on the corpus; left as the last expression so the fitted
# pipeline is displayed as the cell output.
pipeline.fit(corpus)


In [8]:
# TF-IDF matrix for the corpus, as returned by the fitted pipeline.
corpus_tfidf = pipeline.transform(corpus)

# Replace the raw `text` column with the TF-IDF features: the remaining
# columns of `df` come first, followed by one integer-labelled column per
# feature. The matrix is converted to a dense array for this step.
transformed_df = pd.concat(
    [df.drop(columns='text'), pd.DataFrame(corpus_tfidf.toarray())],
    axis=1,
)

In [9]:
from sklearn.model_selection import train_test_split
# Keep 80% of the rows for training and set the rest aside, stratifying on the
# `spam` column so both parts keep its class proportions.
# A fixed random_state makes the split reproducible: without it every run
# produces different partitions.
train, remainder = train_test_split(
    transformed_df,
    train_size=0.8,
    stratify=transformed_df.spam,
    random_state=42,
)

In [10]:
# Split the held-out rows evenly into validation and test sets, again
# stratified on the `spam` column.
# A fixed random_state makes the split reproducible: without it every run
# produces different partitions.
validate, test = train_test_split(
    remainder,
    train_size=0.5,
    stratify=remainder.spam,
    random_state=42,
)

In [11]:
# Write each split to its own CSV file, without the DataFrame index.
# NOTE(review): the dataset is read from '../data/' but the splits are written
# under 'data/' (relative to the working directory) — confirm this is intended.
for split_frame, out_path in (
    (train, 'data/train.csv'),
    (validate, 'data/validate.csv'),
    (test, 'data/test.csv'),
):
    split_frame.to_csv(out_path, index=False)