# Library

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import gensim
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/brunolw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Datasets

In [2]:
data = pd.read_csv('./data/SPAM text message 20170820 - Data.csv')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Split data to train/test

In [3]:
x_train, x_test, y_train, y_test = train_test_split(data.Message, data.Category, test_size=0.2, random_state=123, stratify=data.Category)

In [4]:
x_train.shape

(4457,)

In [5]:
y_train.shape

(4457,)

# BoW

In [6]:
bow_2gram_clf = Pipeline([('vec', CountVectorizer(ngram_range=(2,2), token_pattern=r'[A-Za-z_]+')),
                         ('clf', MultinomialNB())])
cvt = CountVectorizer(ngram_range=(2,2), token_pattern=r'[A-Za-z_]+')
x_vec = cvt.fit_transform(x_train)
scores = cross_val_score(MultinomialNB(), x_vec, y_train, cv=5)

In [7]:
x_vec

<4457x34734 sparse matrix of type '<class 'numpy.int64'>'
	with 64863 stored elements in Compressed Sparse Row format>

In [8]:
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.862 (+/- 0.018)


# Word2Vec

In [10]:
class MySentences(object):
    """
    MySentences is a generator to produce a list of tkenized sentences
    
    Takes a list of numpy arrays countaining documents.
    
    Args:
        arrays: List of arrays, where earch element in the array contains a document.
    """
    
    def __init__(self, *arrays):
        self.arrays = arrays
        
    def __iter__(self):
        for array in self.arrays:
            for document in array:
                for sent in nltk.sent_tokenize(document):
                    yield nltk.word_tokenize(sent)
                    
def get_word2vec(sentences, location):
    """
    Returns trained word2vec
    
    Args:
        sentences: iterator for sentences
        location(str): Path to save/load word2vec
    """
    if os.path.exists(location):
        print('Found {}'.format(location))
        model = gensim.models.Word2Vec.load(location)
        return model
    
    print('{} not found. training model'.format(location))
    model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
    print('Model done training. Saving to disk')
    model.save(location)
    return model

In [11]:
w2vec = get_word2vec(
    MySentences(
        data.Message.values, 
        #df_test['Text'].values  Commented for Kaggle limits
    ),
    'w2vmodel'
)

Found w2vmodel


In [12]:
import numpy as np

class MyTokenizer:
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        transformed_X = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            transformed_X.append(np.array(tokenized_doc))
        return np.array(transformed_X)
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = MyTokenizer().fit_transform(X)
        
        return np.array([
            np.mean([self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

In [13]:
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2vec)
mean_embedded = mean_embedding_vectorizer.fit_transform(data.Message)

  self.dim = len(word2vec.wv.syn0[0])
  return np.array(transformed_X)


In [14]:
x_train, x_test, y_train, y_test = train_test_split(mean_embedded, data.Category, test_size=0.2, random_state=123, stratify=data.Category)

In [19]:
x_train = x_train - np.min(x_train)

In [20]:
scores = cross_val_score(MultinomialNB(), x_train, y_train, cv=5)

In [21]:
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.866 (+/- 0.001)


# BoW(+TFIDF)

In [33]:
x_train, x_test, y_train, y_test = train_test_split(data.Message, data.Category, test_size=0.2, random_state=123, stratify=data.Category)
bow_2gram_clf = Pipeline([('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
cvt = CountVectorizer(ngram_range=(2,2), token_pattern=r'[A-Za-z_]+')
x_vec = cvt.fit_transform(x_train)
scores = cross_val_score(bow_2gram_clf, x_vec, y_train, cv=5)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.939 (+/- 0.013)


# Word2Vec(+TFIDF)

In [32]:
x_train, x_test, y_train, y_test = train_test_split(mean_embedded, data.Category, test_size=0.2, random_state=123, stratify=data.Category)
x_train = x_train - np.min(x_train)
word2vec_clf = Pipeline([('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())])
scores = cross_val_score(word2vec_clf, x_train, y_train, cv=5)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.866 (+/- 0.001)
