In [1]:
from utils import load_unlabeled_spam_dataset

## Data setup

In [2]:
df_train = load_unlabeled_spam_dataset()

In [3]:
df_train

Unnamed: 0,author,date,text
0,Alessandro leite,2014-11-05T22:21:36,pls http://www10.vakinha.com.br/VaquinhaE.aspx...
1,Salim Tayara,2014-11-02T14:33:30,"if your like drones, plz subscribe to Kamal Ta..."
2,Phuc Ly,2014-01-20T15:27:47,go here to check the views :3﻿
3,DropShotSk8r,2014-01-19T04:27:18,"Came here to check the views, goodbye.﻿"
4,css403,2014-11-07T14:25:48,"i am 2,126,492,636 viewer :D﻿"
...,...,...,...
365,Benjy Growls,2015-05-27T09:27:26.667000,I love this song so much &lt;3<br />Keep em&#3...
366,Pyles Baxter,2013-10-03T02:25:19.324000,Perhaps you have seen the newest Miley Cyrus...
367,bilal bilo,2015-05-22T20:36:36.926000,I liked<br />﻿
368,TheEpicMixx':)x,2013-08-08T14:54:45.831000,Please.. Check my channel out:) I subscribe ba...


## Labelling Functions

In [4]:
# Define the label mappings for convenience
ABSTAIN = -1
NOT_SPAM = 0
SPAM = 1

In [5]:
#KEYWORD MATCHES
from snorkel.labeling import labeling_function


@labeling_function()
def lf_keyword_my(x):
    """Many spam comments talk about 'my channel', 'my video', etc."""
    return SPAM if "my" in x.text.lower() else ABSTAIN

In [6]:
#REGULAR EXPRESSION BASED
import re

@labeling_function()
def lf_regex_check_out(x):
    """Spam comments say 'check out my video', 'check it out', etc."""
    return SPAM if re.search(r"check.*out", x.text, flags=re.I) else ABSTAIN

In [7]:
# ARBITRARY HEURITIC

@labeling_function()
def lf_short_comment(x):
    """Non-spam comments are often short, such as 'cool video!'."""
    return NOT_SPAM if len(x.text.split()) < 5 else ABSTAIN

In [8]:
# THIRD PARTY MODEL

from textblob import TextBlob


@labeling_function()
def lf_textblob_polarity(x):
    """
    We use a third-party sentiment classification model, TextBlob.

    We combine this with the heuristic that non-spam comments are often positive.
    """
    return NOT_SPAM if TextBlob(x.text).sentiment.polarity > 0.3 else ABSTAIN

In [9]:
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

# Define the set of labeling functions (LFs)
lfs = [lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity]

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

  from pandas import Panel
100%|██████████| 1956/1956 [00:00<00:00, 3464.55it/s]


In [10]:
df_train = df_train[df_train.label != ABSTAIN]

In [11]:
df_train

Unnamed: 0,author,date,text,label
5,Giang Nguyen,2014-11-06T04:55:41,https://www.facebook.com/teeLaLaLa﻿,0
12,Malin Linford,2014-11-05T01:13:43,Hey guys please check out my new Google+ page ...,1
14,Олег Пась,2014-11-03T23:29:00,Plizz withing my channel ﻿,1
17,Rancy Gaming,2014-11-06T09:41:07,What free gift cards? Go here http://www.swag...,0
18,Bishwaroop Bhattacharjee,2014-11-08T12:34:11,https://www.facebook.com/SchoolGeniusNITS/phot...,0
...,...,...,...,...
365,Benjy Growls,2015-05-27T09:27:26.667000,I love this song so much &lt;3<br />Keep em&#3...,0
366,Pyles Baxter,2013-10-03T02:25:19.324000,Perhaps you have seen the newest Miley Cyrus...,1
367,bilal bilo,2015-05-22T20:36:36.926000,I liked<br />﻿,0
368,TheEpicMixx':)x,2013-08-08T14:54:45.831000,Please.. Check my channel out:) I subscribe ba...,1


## Data Augmentation

In [12]:
import random

import nltk
from nltk.corpus import wordnet as wn

from snorkel.augmentation import transformation_function

In [13]:
nltk.download("wordnet", quiet=True)

True

In [14]:
def get_synonyms(word):
    """Get the synonyms of word from Wordnet."""
    lemmas = set().union(*[s.lemmas() for s in wn.synsets(word)])
    return list(set(l.name().lower().replace("_", " ") for l in lemmas) - {word})

In [15]:
@transformation_function()
def tf_replace_word_with_synonym(x):
    """Try to replace a random word with a synonym."""
    words = x.text.lower().split()
    idx = random.choice(range(len(words)))
    synonyms = get_synonyms(words[idx])
    if len(synonyms) > 0:
        x.text = " ".join(words[:idx] + [synonyms[0]] + words[idx + 1 :])
        return x

In [16]:
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier

tf_policy = ApplyOnePolicy(n_per_original=2, keep_original=True)
tf_applier = PandasTFApplier([tf_replace_word_with_synonym], tf_policy)
df_train_augmented = tf_applier.apply(df_train)

100%|██████████| 1387/1387 [00:03<00:00, 439.58it/s]


In [17]:
df_train_augmented

Unnamed: 0,author,date,text,label
5,Giang Nguyen,2014-11-06T04:55:41,https://www.facebook.com/teeLaLaLa﻿,0
12,Malin Linford,2014-11-05T01:13:43,Hey guys please check out my new Google+ page ...,1
12,Malin Linford,2014-11-05T01:13:43,hey guys delight check out my new google+ page...,1
14,Олег Пась,2014-11-03T23:29:00,Plizz withing my channel ﻿,1
17,Rancy Gaming,2014-11-06T09:41:07,What free gift cards? Go here http://www.swag...,0
...,...,...,...,...
367,bilal bilo,2015-05-22T20:36:36.926000,I liked<br />﻿,0
368,TheEpicMixx':)x,2013-08-08T14:54:45.831000,Please.. Check my channel out:) I subscribe ba...,1
368,TheEpicMixx':)x,2013-08-08T14:54:45.831000,please.. check my channel out:) ane subscribe ...,1
369,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0


In [18]:
df_train_augmented.shape

(2701, 4)

In [19]:
df_train_augmented

Unnamed: 0,author,date,text,label
5,Giang Nguyen,2014-11-06T04:55:41,https://www.facebook.com/teeLaLaLa﻿,0
12,Malin Linford,2014-11-05T01:13:43,Hey guys please check out my new Google+ page ...,1
12,Malin Linford,2014-11-05T01:13:43,hey guys delight check out my new google+ page...,1
14,Олег Пась,2014-11-03T23:29:00,Plizz withing my channel ﻿,1
17,Rancy Gaming,2014-11-06T09:41:07,What free gift cards? Go here http://www.swag...,0
...,...,...,...,...
367,bilal bilo,2015-05-22T20:36:36.926000,I liked<br />﻿,0
368,TheEpicMixx':)x,2013-08-08T14:54:45.831000,Please.. Check my channel out:) I subscribe ba...,1
368,TheEpicMixx':)x,2013-08-08T14:54:45.831000,please.. check my channel out:) ane subscribe ...,1
369,Katie Mettam,2013-07-13T13:27:39.441000,I love this song because we sing it at Camp al...,0


## Data Slicing

In [20]:
from snorkel.slicing import slicing_function

In [21]:
def short_comment(x):
    """Ham comments are often short, such as 'cool video!'"""
    return len(x.text.split()) < 5


sfs = [short_comment]

## Training a classifier

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [23]:
train_text = df_train_augmented.text.tolist()
X_train = CountVectorizer(ngram_range=(1, 2)).fit_transform(train_text)

clf = LogisticRegression(solver="lbfgs")
clf.fit(X=X_train, y=df_train_augmented.label.values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)