Now let's try multilabel classification for:
 - Yemen crisis - الأزمة_اليمنية
 - Syria crisis - الأزمة_السورية
 - Ukraine crisis - الأزمة_الأوكرانية

Older snorkel documentation/tutorials indicate this might work - see for example https://github.com/HazyResearch/snorkel/blob/master/tutorials/advanced/Categorical_Classes.ipynb

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import snorkel

%matplotlib inline
from IPython.core.pylabtools import figsize

In [66]:
media_df = pd.read_csv('/Users/awhite/Documents/snorkel/arabic_news_cleaned.csv')

media_df['labels'] = media_df.category

media_df.loc[media_df.labels.str.contains("اليمنية") == True, 'labels'] = 1
media_df.loc[media_df.labels.str.contains("السورية") == True, 'labels'] = 2
media_df.loc[media_df.labels.str.contains("وكران") == True, 'labels'] = 3
media_df.loc[media_df.labels.str.contains("ا|م") == True, 'labels'] = 0


In [67]:
media_df.labels.value_counts()

0    22347
2     4162
3      595
1      566
Name: labels, dtype: int64

In [68]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(media_df, test_size = 0.2, random_state = 0)

train, valid = train_test_split(train, test_size = 0.2, random_state = 0)
train, dev = train_test_split(train, test_size = 0.2, random_state = 0)

Y_train = train["labels"].values
Y_dev = dev["labels"].values
Y_valid = valid["labels"].values
Y_test = test["labels"].values

len(train)

14166

In [70]:
from snorkel.labeling.apply import PandasLFApplier
from snorkel.labeling.lf import labeling_function

In [71]:
dev.text[dev.labels == 3].head()

17492    بيسكوف ندخر سيله لاسترداد اموالنا كييف متحدث ص...
1287     مجموعه اتصال خاصه باوكرانيا تجتمع مينسك تعقد ع...
1348     لافروف لنظيره اوكراني وقوف جانبي فيديو اثار تخ...
1215     قائد قوات ناتو اوروبا يدعو استئناف اتصالات منت...
1388     بوتين هولاند يبحثان اوكرانيا اعلن كرملين رئيس ...
Name: text, dtype: object

In [72]:
@labeling_function()
def country(x):
    if re.search(r"يمن",x.text):
        return 1
    elif re.search(r"سوري",x.text):
        return 2
    elif re.search(r"وكران",x.text):
        return 3
    else:
        return 0

@labeling_function()
def entities(x):
    if re.search(r"هادي|حوثي|صنعاء",x.text):
        return 1
    elif re.search(r"اسد|دمشق|ادلب|حلب",x.text):
        return 2
    elif re.search(r"كييف|زيلينسكي|دونباس",x.text):
        return 3
    else:
        return -1

In [73]:
lfs = [country, entities]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train)
L_dev = applier.apply(df=dev)
L_valid = applier.apply(df=valid)
L_test = applier.apply(df=test)

100%|██████████| 14166/14166 [00:02<00:00, 5470.58it/s]
100%|██████████| 3542/3542 [00:00<00:00, 5723.61it/s]
100%|██████████| 4428/4428 [00:00<00:00, 5669.73it/s]
100%|██████████| 5534/5534 [00:00<00:00, 5708.08it/s]


In [74]:
from snorkel.labeling.analysis import LFAnalysis

LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

ValueError: Classification metrics can't handle a mix of unknown and multiclass targets

Okay -- Let's do some manual testing to be sure this is working.

In [91]:
L_dev_df = pd.DataFrame(L_dev)
Y_dev_df = pd.DataFrame(Y_dev)
Y_dev_df = Y_dev_df.rename(columns = {0:'gold'})
dev_df = L_dev_df.join(Y_dev_df)

In [95]:
dev_df.groupby([0, 'gold']).size()

0  gold
0  0       2256
   1          7
   2          9
   3          3
1  0        156
   1         65
   2         16
   3          2
2  0        357
   1          6
   2        535
   3         14
3  0         54
   2          1
   3         61
dtype: int64

In [96]:
dev_df.groupby([1, 'gold']).size()

1   gold
-1  0       2661
    1         26
    2        355
    3         32
 1  0         34
    1         50
    2         18
 2  0         99
    1          2
    2        183
    3          2
 3  0         29
    2          5
    3         46
dtype: int64

Will the models work?

In [99]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
Y_pred_train = majority_model.predict(L=L_train)

majority_acc = majority_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

IndexError: index 2 is out of bounds for axis 0 with size 2

In [101]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=4, verbose=True)
label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123)

label_model_acc = label_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Label Model Accuracy:     80.5%


In [None]:
df_train_filtered, Y_probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=Y_probs_train, L=L_train
)