# Init libs and functions

In [8]:
import os
import time

import numpy as np
import pandas as pd
import plotly.express as px
from scipy.spatial import distance

from sklearn.metrics import classification_report

import transformers
from transformers import pipeline

import sentence_transformers 
from sentence_transformers import SentenceTransformer

In [2]:
# Sentence-embedding model that classifier_semantic uses by default.
SEMANTIC_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
semantic_classifier_model = SentenceTransformer(SEMANTIC_MODEL_NAME)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=968.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=3786.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=645.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=122.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=229.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=470693617.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=53.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=5069051.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=239.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=9081518.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=480.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=14763234.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=190.0), HTML(value='')))




In [25]:
def check_candidates_multilabeled(df, candidate_labels, classifier, threshold, debug, avd_labels, cand_adv_index, debug_all = False):
    """Score a zero-shot classifier as a binary advertisement detector.

    For every row two flags are produced:

    * ground truth - 1 when at least one entry of ``avd_labels`` occurs in
      ``row['label']``, otherwise 0 (all non-advertisement classes are merged
      into a single class);
    * prediction - 1 when the classifier's top label is one of the
      advertisement candidates, or when its top score does not exceed
      ``threshold`` (low confidence is treated as advertisement); otherwise 0.

    Parameters
    ----------
    df : DataFrame with labeled data (columns: ``data``, ``label``).
    candidate_labels : list with the zero-shot classes.
    classifier : callable taking ``(text, candidate_labels)`` and returning a
        dict with keys ``'sequence'``, ``'scores'`` and ``'labels'``; element 0
        of ``'scores'`` / ``'labels'`` is read as the top prediction.
    threshold : acceptable confidence of classification; if the top score is
        not above it, the row is marked as advertisement.
    debug : the row's index is printed every ``debug`` rows (counted by row
        position, so the cadence stays regular on filtered frames). A falsy
        value (0 / None) disables the progress output.
    avd_labels : advertisement labels in ``df``. They are matched with ``in``
        against ``row['label']`` as-is.
        NOTE(review): if labels are plain strings this is a substring match
        (e.g. 'other' also matches 'adv_other') - confirm that is intended.
    cand_adv_index : indexes of the advertisement classes in
        ``candidate_labels``.
    debug_all : print information about every misclassified row.

    Returns
    -------
    tuple ``(new_true, pred_true, not_catched)``: the ground-truth flags, the
    predicted flags, and the ``df`` index values of advertisement rows that
    were predicted as non-advertisement.
    """
    # Resolve the advertisement candidates once instead of once per row.
    adv_candidates = [candidate_labels[ind] for ind in cand_adv_index]

    new_true = []
    pred_true = []
    not_catched = []
    for position, (index, row) in enumerate(df.iterrows()):
        # Keyed on the row position: the index label may be non-contiguous
        # (filtered frame) or not an integer at all.
        if debug and position % debug == 0:
            print('index = ', index)

        # One advertisement tag is enough to count the row as advertisement.
        is_adv = int(any(true_avd_label in row['label'] for true_avd_label in avd_labels))
        new_true.append(is_adv)

        res = classifier(row['data'], candidate_labels)
        if res['scores'][0] > threshold:
            # Advertisement only if the top label is an advertisement candidate (food, ...).
            predicted = int(res['labels'][0] in adv_candidates)
        else:
            # Not confident enough in the classification: mark as advertisement.
            predicted = 1
        pred_true.append(predicted)

        if is_adv == 1 and predicted == 0:
            not_catched.append(index)

        if debug_all and is_adv != predicted:
            content = res['sequence'].replace('\n','')
            print(f"{row['label']} orig: {is_adv} ({row['label']}) pred: {predicted} ({res['labels'][0]}) {content}")
    return new_true, pred_true, not_catched

In [5]:
def classifier_semantic(text, targets, model = None):
    """Rank ``targets`` by embedding similarity to ``text``.

    Parameters
    ----------
    text : the string to classify.
    targets : candidate class names.
    model : object with an ``encode`` method that accepts a string or a list
        of strings and returns the embedding(s). When omitted, the notebook's
        ``semantic_classifier_model`` is looked up at call time, so a model
        reloaded in another cell is picked up without re-running this cell.

    Returns
    -------
    dict with keys
        ``'sequence'`` - the input text,
        ``'scores'``   - normalized similarities, best first,
        ``'labels'``   - the targets in the same order as ``'scores'``.

    Notes
    -----
    Scores are cosine similarities divided by their sum. Negative similarities
    are clipped to 0 first: with the plain sum a zero total produced inf/nan
    and a negative total reversed the ranking. When every similarity is
    non-negative the scores are the same as plain sum-normalization. If no
    target has a positive similarity all scores are 0. Duplicate targets are
    kept as separate entries.
    """
    if model is None:
        model = semantic_classifier_model
    targets = list(targets)

    target_emb = model.encode(targets)
    text_emb = model.encode(text)

    similarities = np.array(
        [1 - distance.cosine(text_emb, emb) for emb in target_emb], dtype=float
    )
    positive = np.clip(similarities, 0.0, None)
    total = positive.sum()
    scores = positive / total if total > 0 else np.zeros_like(positive)

    # Rank by the raw similarity so that targets clipped to 0 still keep their
    # relative order; sorted() is stable, so ties keep the input order.
    order = sorted(range(len(targets)), key=lambda i: similarities[i], reverse=True)

    return {
        'sequence': text,
        'scores': [float(scores[i]) for i in order],
        'labels': [targets[i] for i in order],
    }

In [6]:
# Quick sanity check of the classifier on a single short phrase.
classifier_semantic(
    text='запись в директ',
    targets=['услуги', 'запись', 'описание события'],
)

{'sequence': 'запись в директ',
 'scores': [0.477551522198239, 0.2766563765912658, 0.24579210121049522],
 'labels': ['запись', 'описание события', 'услуги']}

# Load dataset

In [11]:
# Labeled dataset, read from the working directory.
df_lp = pd.read_csv(filepath_or_buffer='data_toloka.csv')

In [12]:
# check_candidates_multilabeled reads the text from a 'data' column.
df_lp = df_lp.assign(data=df_lp['caption'])

In [13]:
# delete incorrect "other"
def delete_tag(x, tag='other'):
    """Remove the stand-alone ``tag`` from a label string.

    Every occurrence of ``'#' + tag`` and of ``tag`` itself is removed, except
    where ``tag`` is the suffix of a compound tag written as ``'_' + tag``
    (e.g. ``'adv_other'`` is left untouched when ``tag='other'``). A single
    leading ``'#'`` left over in the result is stripped.

    Parameters
    ----------
    x : label string.
    tag : the tag to remove; it is used for every step of the clean-up.

    Returns
    -------
    The cleaned label string; it may be empty when ``x`` consisted only of
    ``tag``.
    """
    protected = '_' + tag
    without_separator_form = x.replace('#' + tag, '')
    # Split on the protected compound suffix so that only the pieces between
    # its occurrences are cleaned - no placeholder string is needed, so labels
    # cannot collide with it.
    pieces = without_separator_form.split(protected)
    cleaned = protected.join(piece.replace(tag, '') for piece in pieces)
    # startswith() is safe on an empty result, unlike indexing with [0].
    return cleaned[1:] if cleaned.startswith('#') else cleaned

In [14]:
# Rows whose label holds 'other' next to another tag ('x#other' or 'other#x').
stray_other_mask = df_lp['label'].str.contains('#other') | df_lp['label'].str.contains('other#')
# One .loc assignment instead of chained indexing (df_lp.label[mask] = ...):
# the chained form triggers SettingWithCopyWarning and does not update df_lp
# at all when pandas copy-on-write is active.
df_lp.loc[stray_other_mask, 'label'] = df_lp.loc[stray_other_mask, 'label'].apply(delete_tag)

In [15]:
# Keep only the rows whose label does not mention 'trash'.
df_cleaned = df_lp.loc[~df_lp['label'].str.contains('trash')]

In [17]:
# (rows, columns) left after the rows labelled 'trash' were dropped.
df_cleaned.shape

(2987, 3)

# Test

In [20]:
# Dataset labels that are counted as the advertisement class during evaluation.
adver_labs = [
    'adv_event',
    'adv_other',
    'food',
    'other',
    'retrospective_event',
    'future_events',
]

In [21]:
# Zero-shot candidate classes, variant 1.
# The evaluation call treats positions 0-2 as the advertisement classes,
# so the order of the first three entries matters.
candidate_labs = [
    'other',
    'food',
    'advertisement spam promotion',
    'music concert',
    'exhibition',
    'festival',
    'conference',
    'calendar holiday',
    'sport event',
    'flashmob',
    'accident',
    'stroll walking',
    'wedding birthday',
    'event',
]

In [22]:
# Evaluate variant 1: candidates 0-2 count as advertisement, no confidence cut-off.
y_tr, y_pred, missed_adv = check_candidates_multilabeled(
    df=df_cleaned,
    candidate_labels=candidate_labs,
    classifier=classifier_semantic,
    threshold=0,
    debug=1000,
    avd_labels=adver_labs,
    cand_adv_index=[0, 1, 2],
    debug_all=False,
)
report = classification_report(y_tr, y_pred, target_names=['не реклама', 'реклама'])
print(report)

index =  0
index =  2000
index =  4000
index =  5000
              precision    recall  f1-score   support

  не реклама       0.32      0.94      0.48       673
     реклама       0.96      0.41      0.58      2314

    accuracy                           0.53      2987
   macro avg       0.64      0.68      0.53      2987
weighted avg       0.82      0.53      0.55      2987



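Best recall for class 1 («не реклама», i.e. non-advertisement): 0.94, the highest of the three candidate-label sets tried in this notebook.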

In [23]:
# Zero-shot candidate classes, variant 2.
# The evaluation call treats positions 0-2 as the advertisement classes,
# so the order of the first three entries matters.
candidate_labs = [
    'other',
    'food',
    'advertisement spam',
    'music concert',
    'exhibition',
    'festival',
    'conference',
    'calendar holiday',
    'sport event',
    'flashmob',
    'accident',
    'stroll walking',
    'wedding birthday',
    'private event',
    'public event',
]

In [24]:
# Evaluate variant 2: candidates 0-2 count as advertisement, no confidence cut-off.
y_tr, y_pred, missed_adv = check_candidates_multilabeled(
    df=df_cleaned,
    candidate_labels=candidate_labs,
    classifier=classifier_semantic,
    threshold=0,
    debug=1000,
    avd_labels=adver_labs,
    cand_adv_index=[0, 1, 2],
    debug_all=False,
)
report = classification_report(y_tr, y_pred, target_names=['не реклама', 'реклама'])
print(report)

index =  0
index =  2000
index =  4000
index =  5000
              precision    recall  f1-score   support

  не реклама       0.33      0.93      0.49       673
     реклама       0.96      0.46      0.62      2314

    accuracy                           0.56      2987
   macro avg       0.65      0.69      0.56      2987
weighted avg       0.82      0.56      0.59      2987



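Best precision with higher recall for class 2 («реклама», i.e. advertisement): precision stays at 0.96 while recall rises from 0.41 to 0.46 compared with the first label set.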

In [26]:
# Zero-shot candidate classes, variant 3.
# The evaluation call treats positions 0-4 as the advertisement classes,
# so the order of the first five entries matters.
candidate_labs = [
    'other',
    'food',
    'advertisement',
    'spam',
    'promotion',
    'music concert',
    'exhibition',
    'festival',
    'conference',
    'calendar holiday',
    'sport event',
    'flashmob',
    'accident',
    'stroll walking',
    'wedding birthday',
    'private event',
    'public event',
]

In [27]:
# Evaluate variant 3: candidates 0-4 count as advertisement, no confidence cut-off.
y_tr, y_pred, missed_adv = check_candidates_multilabeled(
    df=df_cleaned,
    candidate_labels=candidate_labs,
    classifier=classifier_semantic,
    threshold=0,
    debug=1000,
    avd_labels=adver_labs,
    cand_adv_index=[0, 1, 2, 3, 4],
    debug_all=False,
)
report = classification_report(y_tr, y_pred, target_names=['не реклама', 'реклама'])
print(report)

index =  0
index =  2000
index =  4000
index =  5000
              precision    recall  f1-score   support

  не реклама       0.35      0.84      0.50       673
     реклама       0.92      0.55      0.69      2314

    accuracy                           0.62      2987
   macro avg       0.64      0.70      0.60      2987
weighted avg       0.79      0.62      0.65      2987



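Best f-score for class 1 («не реклама»): 0.50. This label set also gives the best f-score for class 2 («реклама», 0.69) and the best overall accuracy (0.62) of the three sets tried.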