### A basic random forest model on top of vectorized text

In [37]:
import re
import string
import nltk
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

#### Data

In [None]:
messages = pd.read_csv('../../../data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]

### 1. tf-idf

In [5]:
stopwords = nltk.corpus.stopwords.words('english')

#### Cleaning the text

In [7]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [8]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(messages['text'])

X_features = pd.DataFrame(X_tfidf.toarray())
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9385,9386,9387,9388,9389,9390,9391,9392,9393,9394
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### RandomForestClassifier (Baseline)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_features, messages['label'], test_size=0.2)

In [18]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

#### Evaluate

In [19]:
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
print(f'Precision: {round(precision, 3)} / Recall: {round(recall, 3)}')

Precision: 1.0 / Recall: 0.791


### 2 . word2vect

In [22]:
import gensim

In [23]:
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'], messages['label'], test_size=0.2)

w2v_model = gensim.models.Word2Vec(X_train, size=100, window=5, min_count=2)

In [24]:
messages

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[this, is, the, nd, time, we, have, tried, con..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[will, ì_, going, to, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity, was, in, mood, for, that, so, any, othe..."
5570,ham,The guy did some bitching but I acted like i'd...,"[the, guy, did, some, bitching, but, acted, li..."


In [34]:
w2v_model.wv.index2word

['you',
 'to',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'for',
 'your',
 'of',
 'call',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'we',
 'or',
 'not',
 'but',
 'do',
 'at',
 'will',
 'if',
 'get',
 'be',
 'with',
 'ur',
 'just',
 'no',
 'this',
 'gt',
 'lt',
 'up',
 'how',
 'when',
 'ok',
 'go',
 'free',
 'what',
 'from',
 'out',
 'll',
 'all',
 'know',
 'am',
 'day',
 'then',
 'like',
 'got',
 'was',
 'there',
 'he',
 'only',
 'good',
 'come',
 'its',
 'time',
 'want',
 'text',
 'love',
 'send',
 'one',
 'txt',
 'going',
 'about',
 'need',
 'she',
 'as',
 'lor',
 'home',
 'by',
 'don',
 'today',
 'still',
 'see',
 'back',
 'sorry',
 'da',
 'stop',
 'our',
 'dont',
 'did',
 'reply',
 'mobile',
 'new',
 'hi',
 'please',
 'any',
 'phone',
 'some',
 'take',
 'her',
 'they',
 'later',
 'pls',
 'been',
 'think',
 'tell',
 'oh',
 'well',
 'week',
 'here',
 're',
 'more',
 'dear',
 'an',
 'who',
 'ì_',
 'him',
 'much',
 'has',
 'great',
 'where',
 'night',
 'give',
 

In [49]:
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index2word])
                     for ls in X_test])

In [71]:
w2v_vect_avg = [vect.mean(axis=0) if len(vect) != 0 else np.zeros(100) for vect in w2v_vect]
w2v_vect_avg

[array([ 0.31688407, -0.45545423, -0.5703304 , -0.10673078, -0.3479894 ,
        -0.01268769,  0.07814769,  0.05860218,  0.11251561, -0.00581547,
         0.39875624,  0.4491599 ,  0.6211467 , -0.19315946, -0.3691419 ,
        -0.09113137, -0.21653764,  0.36918452,  0.09965802,  0.2767311 ,
         0.1436181 , -0.50226283,  0.53279626, -0.37200117, -0.18080692,
         0.21476133, -0.62412703, -0.24201845,  0.30174604,  0.2111233 ,
        -0.01011244, -0.05136384,  0.11488385, -0.13058919, -0.3133261 ,
        -0.18110053, -0.5786415 , -0.39272323,  0.7719334 , -0.4302709 ,
        -0.04812403, -0.4560932 , -0.48087582,  0.08476645, -0.14566155,
         0.07941215, -0.22717701, -0.05782105,  0.13107358,  0.30665493,
         0.34137106,  0.37447134,  0.12547351, -0.37262392, -0.07221345,
        -0.23866282, -0.60282433, -0.07476885,  0.4475731 ,  0.5326087 ,
        -0.17902593, -0.11300224,  0.07676619, -0.17888601,  0.52975935,
         0.44082677,  0.3766135 ,  0.19098681, -0.2