# FlyingSquid

In [1]:
#pip install sentence_transformers

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# load data
train_df = pd.read_csv('../tutorials/data/yt_comments_train.csv')
tmp_df = pd.read_csv('../tutorials/data/yt_comments_test.csv')
dev_df, test_df = train_test_split(tmp_df, test_size=0.5)

# preview data
train_df.head()

Unnamed: 0.1,Unnamed: 0,author,date,text,label,video
0,0,Alessandro leite,2014-11-05T22:21:36,pls http://www10.vakinha.com.br/VaquinhaE.aspx...,-1.0,1
1,1,Salim Tayara,2014-11-02T14:33:30,"if your like drones, plz subscribe to Kamal Ta...",-1.0,1
2,2,Phuc Ly,2014-01-20T15:27:47,go here to check the views :3﻿,-1.0,1
3,3,DropShotSk8r,2014-01-19T04:27:18,"Came here to check the views, goodbye.﻿",-1.0,1
4,4,css403,2014-11-07T14:25:48,"i am 2,126,492,636 viewer :D﻿",-1.0,1


In [44]:
import rubrix as rb

# build records from the train dataset
records = [
    rb.TextClassificationRecord(
        inputs=row.text,
        metadata={"video":row.video, "author": row.author}
    )
    for i,row in train_df.iterrows()
]

# build records from the test dataset with annotation
labels = ["HAM", "SPAM"]
records += [
    rb.TextClassificationRecord(
        inputs=row.text,
        annotation=labels[row.label],
        metadata={"video":row.video, "author": row.author}
    )
    for i,row in dev_df.iterrows()
]

# log records to Rubrix
rb.log(records, name="weak_supervision_yt")

  0%|          | 0/1711 [00:00<?, ?it/s]

1711 records logged to http://localhost:6900/ws/rubrix/weak_supervision_yt


BulkResponse(dataset='weak_supervision_yt', processed=1711, failed=0)

In [3]:
from rubrix.labeling.text_classification import Rule, WeakLabels
import rubrix as rb

#  rules defined as Elasticsearch queries
check_out = Rule(query="check out", label="SPAM")
plz = Rule(query="plz OR please", label="SPAM")
subscribe = Rule(query="subscribe", label="SPAM")
my = Rule(query="my", label="SPAM")
song = Rule(query="song", label="HAM")
love = Rule(query="love", label="HAM")

import re

# rules defined as Python labeling functions
def contains_http(record: rb.TextClassificationRecord):
    if "http" in record.inputs["text"]:
        return "SPAM"

def short_comment(record: rb.TextClassificationRecord):
    return "HAM" if len(record.inputs["text"].split()) < 5 else None

def regex_check_out(record: rb.TextClassificationRecord):
    return "SPAM" if re.search(r"check.*out", record.inputs["text"], flags=re.I) else None

from rubrix.labeling.text_classification import load_rules

# bundle our rules in a list
rules = [check_out, plz, subscribe, my, song, love, contains_http, short_comment, regex_check_out]

# optionally add the rules defined in the web app UI
rules += load_rules(dataset="weak_supervision_yt")

# apply the rules to a dataset to obtain the weak labels
weak_labels = WeakLabels(
    rules=rules,
    dataset="weak_supervision_yt"
)

from rubrix.labeling.text_classification import FlyingSquid

# we pass our WeakLabels instance to our FlyingSquid label model
flyingsquid_model = FlyingSquid(weak_labels)

# we fit the model
flyingsquid_model.fit()

flyingsquid_model.score(tie_break_policy="random")
# {'accuracy': 0.832, ...}

  from cryptography.utils import int_from_bytes, int_to_bytes


Preparing rules:   0%|          | 0/9 [00:00<?, ?it/s]

Applying rules:   0%|          | 0/1711 [00:00<?, ?it/s]



{'accuracy': 0.864,
 'micro_precision': 0.864,
 'micro_recall': 0.864,
 'micro_f1': 0.864,
 'macro_precision': 0.8635432667690732,
 'macro_recall': 0.8688311688311688,
 'macro_f1': 0.8634406529143371,
 'precision_SPAM': 0.9206349206349206,
 'recall_SPAM': 0.8285714285714286,
 'f1_SPAM': 0.8721804511278196,
 'support_SPAM': 70,
 'precision_HAM': 0.8064516129032258,
 'recall_HAM': 0.9090909090909091,
 'f1_HAM': 0.8547008547008547,
 'support_HAM': 55}

In [46]:
# get your training records with the predictions of the label model
records_for_training = flyingsquid_model.predict()

# log the records to a new dataset in Rubrix
rb.log(records_for_training, name="flyingsquid_results")

# extract training data
training_data = pd.DataFrame(
    [
        {"text": rec.inputs["text"], "label": flyingsquid_model.weak_labels.label2int[rec.prediction[0][0]]}
        for rec in records_for_training
    ]
)

  0%|          | 0/1177 [00:00<?, ?it/s]

1177 records logged to http://localhost:6900/ws/rubrix/flyingsquid_results


In [47]:
# preview training data
training_data

Unnamed: 0,text,label
0,EVERYONE PLEASE SUBSCRIBE TO MY CHANNEL OR CAN...,0
1,"<a rel=""nofollow"" class=""ot-hashtag"" href=""htt...",1
2,PARTY ROCK (8) ~﻿,1
3,This song is true because it is insane because...,1
4,https://binbox.io/DNCkM#qT4Q1JB1﻿,0
...,...,...
1172,Maybe no one will probably read this. But just...,0
1173,Check out this video on YouTube<br /><br /><br...,0
1174,Lol check out my chanell and subscribe please ...,0
1175,https://www.facebook.com/eeccon/posts/73394924...,0


In [32]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# define our final classifier
classifier = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB())
])

# fit the classifier
classifier.fit(
    X=training_data.text.tolist(),
    y=training_data.label.values
)

Pipeline(steps=[('vect', CountVectorizer()), ('clf', MultinomialNB())])

In [48]:
# compute the test accuracy
accuracy = classifier.score(
    X=test_df.text.tolist(),
    y=test_df.label.tolist()
)

print(f"Test accuracy: {accuracy}")

Test accuracy: 0.16


# Epoxy

In [22]:
from sentence_transformers import SentenceTransformer
from rubrix.labeling.text_classification import WeakLabelsEmbeddings

class SentenceTransformerModel(object):
    
    def __init__(self, embedding_model_name):
        self.embedding_model = SentenceTransformer(embedding_model_name)
    
    def __call__(self, records):
        texts = [ x.inputs["text"] for x in records ]
        embeddings = self.embedding_model.encode(texts)
        return embeddings

sentence_transformer_model = SentenceTransformerModel('average_word_embeddings_glove.840B.300d')

weak_labels = WeakLabelsEmbeddings(
    rules=rules,
    dataset="weak_supervision_yt",
    embedding_func=sentence_transformer_model
)

ImportError: cannot import name 'WeakLabelsEmbeddings' from 'rubrix.labeling.text_classification' (/mnt/d/Recognai/rubrix/src/rubrix/labeling/text_classification/__init__.py)

In [9]:
from rubrix.labeling.text_classification import Epoxy
thresholds = [1.0] * weak_labels.matrix().shape[0]
epoxy_instance = Epoxy(weak_labels, thresholds=thresholds)

In [None]:
import pdb
pdb.run("epoxy_instance.fit()")

In [None]:
epoxy_instance.score()