Now let's try multilabel classification for:
 - Yemen crisis - الأزمة_اليمنية
 - Syria crisis - الأزمة_السورية
 - Ukraine crisis - الأزمة_الأوكرانية

Incomplete - older snorkel documentation/tutorials indicate this might work - see for example https://github.com/HazyResearch/snorkel/blob/master/tutorials/advanced/Categorical_Classes.ipynb

and some parts do seem to be working fine, though no current snorkel tutorials offer guidance.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import snorkel

%matplotlib inline
from IPython.core.pylabtools import figsize

In [2]:
media_df = pd.read_csv('/Users/awhite/Documents/snorkel/arabic_news_cleaned.csv')

media_df['labels'] = media_df.category

media_df.loc[media_df.labels.str.contains("اليمنية") == True, 'labels'] = 1
media_df.loc[media_df.labels.str.contains("السورية") == True, 'labels'] = 2
media_df.loc[media_df.labels.str.contains("وكران") == True, 'labels'] = 3
media_df.loc[media_df.labels.str.contains("ا|م") == True, 'labels'] = 4

In [3]:
media_df.category[media_df.labels == 4].value_counts()

كرة_القدم                       4301
جماعات_مسلحة                    2142
أسواق_النفط                    1364
لاجئون                         1256
رياضات_اخرى                     1100
صواريخ                           741
التقنية_والمعلومات               639
فضاء                             630
أسلحة_ومعدات_عسكرية             625
الهجرة_إلى_أوروبا              616
اكتشافات                         577
تفجيرات                          559
مشاهير                           539
البحوث_الطبية                    530
معلومات_عامة                     484
طائرات_حربية                    479
الانتخابات_الأمريكية            447
جرائم                           433
مظاهرات                          422
مؤشرات_اقتصادية                 400
عقوبات_اقتصادية                  397
الاعتراف_بدولة_فلسطين            369
امراض                            323
أولمبياد_ريو_دي_جانيرو_2016     314
انقلاب_تركيا                     305
عملية_تحرير_الموصل               294
هجمات_باريس                      287
مناورات_عسك

In [4]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(media_df, test_size = 0.2, random_state = 0)

train, valid = train_test_split(train, test_size = 0.2, random_state = 0)
train, dev = train_test_split(train, test_size = 0.2, random_state = 0)

Y_train = train["labels"].values
Y_dev = dev["labels"].values
Y_valid = valid["labels"].values
Y_test = test["labels"].values

len(train)

14166

In [5]:
from snorkel.labeling.apply import PandasLFApplier
from snorkel.labeling.lf import labeling_function

In [6]:
dev.text[dev.labels == 3].head()

17492    بيسكوف ندخر سيله لاسترداد اموالنا كييف متحدث ص...
1287     مجموعه اتصال خاصه باوكرانيا تجتمع مينسك تعقد ع...
1348     لافروف لنظيره اوكراني وقوف جانبي فيديو اثار تخ...
1215     قائد قوات ناتو اوروبا يدعو استئناف اتصالات منت...
1388     بوتين هولاند يبحثان اوكرانيا اعلن كرملين رئيس ...
Name: text, dtype: object

In [7]:
@labeling_function()
def country(x):
    if re.search(r"يمن",x.text):
        return 1
    elif re.search(r"سوري",x.text):
        return 2
    elif re.search(r"وكران",x.text):
        return 3
    else:
        return 4

@labeling_function()
def entities(x):
    if re.search(r"هادي|حوثي|صنعاء",x.text):
        return 1
    elif re.search(r"اسد|دمشق|ادلب|حلب",x.text):
        return 2
    elif re.search(r"كييف|زيلينسكي|دونباس",x.text):
        return 3
    else:
        return -1

exclude = r"كرة القدم|فلم|افلام"

@labeling_function()
def misc_exclude(x):
    return 4 if re.search(exclude, x.text) else -1

In [8]:
lfs = [country, entities, misc_exclude]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train)
L_dev = applier.apply(df=dev)
L_valid = applier.apply(df=valid)
L_test = applier.apply(df=test)

100%|██████████| 14166/14166 [00:02<00:00, 4833.56it/s]
100%|██████████| 3542/3542 [00:00<00:00, 4890.26it/s]
100%|██████████| 4428/4428 [00:00<00:00, 4816.79it/s]
100%|██████████| 5534/5534 [00:01<00:00, 4894.35it/s]


In [9]:
# Commented out because doesn't work for multiclass

# from snorkel.labeling.analysis import LFAnalysis

# LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

Okay -- Let's do some manual testing to be sure this is working.

In [10]:
L_dev_df = pd.DataFrame(L_dev)
Y_dev_df = pd.DataFrame(Y_dev)
Y_dev_df = Y_dev_df.rename(columns = {0:'gold'})
dev_df = L_dev_df.join(Y_dev_df)

In [11]:
dev_df.groupby([0, 'gold']).size()

0  gold
1  1         65
   2         16
   3          2
   4        156
2  1          6
   2        535
   3         14
   4        357
3  2          1
   3         61
   4         54
4  1          7
   2          9
   3          3
   4       2256
dtype: int64

In [12]:
dev_df.groupby([1, 'gold']).size()

1   gold
-1  1         26
    2        355
    3         32
    4       2661
 1  1         50
    2         18
    4         34
 2  1          2
    2        183
    3          2
    4         99
 3  2          5
    3         46
    4         29
dtype: int64

In [13]:
dev_df.groupby([2, 'gold']).size()

2   gold
-1  1         78
    2        561
    3         79
    4       2798
 4  3          1
    4         25
dtype: int64

Will the models work?

In [14]:
# from snorkel.labeling.model import MajorityLabelVoter

# majority_model = MajorityLabelVoter()
# Y_pred_train = majority_model.predict(L=L_train)

# majority_acc = majority_model.score(L=L_valid, Y=Y_valid)["accuracy"]
# print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

In [15]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=5, verbose=True)
label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123)

label_model_acc = label_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Label Model Accuracy:     80.5%


In [16]:
from snorkel.labeling.utils import filter_unlabeled_dataframe
from snorkel.analysis.utils import probs_to_preds

Y_probs_train = label_model.predict_proba(L=L_train)

train_filtered, Y_probs_train_filtered = filter_unlabeled_dataframe(
    X=train, y=Y_probs_train, L=L_train)

Y_preds_train_filtered = probs_to_preds(probs=Y_probs_train_filtered)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

words_train = [row.text for i, row in train_filtered.iterrows()]
words_valid = [row.text for i, row in valid.iterrows()]
words_test = [row.text for i, row in test.iterrows()]

vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(2,2))
X_train = vectorizer.fit_transform(words_train).toarray()
X_valid = vectorizer.transform(words_valid).toarray()
X_test = vectorizer.transform(words_test).toarray()

In [18]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier().fit(X_train, Y_preds_train_filtered)



In [20]:
Y_pred = classifier.predict(X_test)
pd.crosstab(Y_test, Y_pred, rownames=['Actual Label'], colnames=['Predicted Label'])

Predicted Label,1,2,3,4
Actual Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,81,19,0,12
2,8,793,2,42
3,1,17,78,17
4,100,644,75,3645
