In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import snorkel

%matplotlib inline
from IPython.core.pylabtools import figsize

In [2]:
from snorkel.labeling.apply import PandasLFApplier
from snorkel.labeling.lf import labeling_function

POS = 1
NEG = -1
ABSTAIN = 0

In [3]:
media_df = pd.read_csv('/Users/awhite/Documents/snorkel/arabic_news_cleaned.csv')

media_df = media_df.assign(syria = media_df.category.str.contains("سورية") == True)
media_df.syria = media_df.syria.replace({True:1,False:-1})

media_df[media_df.syria == 1].head()

Unnamed: 0,text,category,syria
1479,روحاني سوريا اخير جددت طهران تاكيد ستواصل لدمش...,الأزمة_السورية,1
1480,اشنطن ترفض تعاون مكافحه ارهاب اعلنت متحدثه باس...,الأزمة_السورية,1
1481,دمشق تطلب موسكو تنظيم جوله مشاورات ثالثه معارض...,الأزمة_السورية,1
1482,صحيفه جمهوريت توكد تورط تركيا ادخال مسلحين سور...,الأزمة_السورية,1
1483,امكانيه روسيا يجتمع مقاطعه بافاريا المانيه قاد...,الأزمة_السورية,1


In [4]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(media_df, test_size = 0.2, random_state = 0)

train, valid = train_test_split(train, test_size = 0.2, random_state = 0)
train, dev = train_test_split(train, test_size = 0.2, random_state = 0)

Y_train = train["syria"].values
Y_dev = dev["syria"].values
Y_valid = valid["syria"].values
Y_test = test["syria"].values

len(train)

14166

In [9]:
#Let's first try with no time-dependent info about the conflict
#so groups that might change their name or disban aren't allowed

provinces = r"ريف دمشق|السويداء|دمشق|طرطوس|درعا|دير الزور|حلب|حماة|الحسكة|حمص|ادلب|القنيطرة|اللاذقية|الرقة"
syria_terms = r"معارض|محرر|نظام|اسد"
regional_players = r"تركي|لبنان|اسرئيل|اردن"
politics = r"سياسي|اتفاق|مفاوضات|وفد|بعثة"
war = r"حرب|اهلي|اطلاق النار|اشتباك|صراع|معارك|اسلاح|سلح"


#Exclusion terms idea didn't work well for oil - but if Syria isn't mentioned at all,
#probably isn't about Syria
@labeling_function()
def syria(x):
    return POS if re.search(r"سوريا|سوري", x.text) and re.search(syria_terms, x.text) else NEG

@labeling_function()
def provinces_mention(x):
    return POS if re.search(provinces, x.text) else ABSTAIN 

@labeling_function()
def regional_politics(x):
    return POS if re.search(r"سوريا|سوري", x.text) and re.search(regional_players, x.text) else ABSTAIN

@labeling_function()
def syria_politics(x):
    return POS if re.search(r"سوريا|سوري", x.text) and re.search(politics, x.text) else ABSTAIN

@labeling_function()
def syria_war(x):
    return POS if re.search(r"سوريا|سوري", x.text) and re.search(war, x.text) else ABSTAIN

In [13]:
lfs = [syria,provinces_mention,regional_politics,
       syria_politics,syria_war]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=train)
L_dev = applier.apply(df=dev)
L_valid = applier.apply(df=valid)
L_test = applier.apply(df=test)




  0%|          | 0/14166 [00:00<?, ?it/s][A[A[A


  3%|▎         | 404/14166 [00:00<00:03, 4034.12it/s][A[A[A


  6%|▌         | 854/14166 [00:00<00:03, 4163.13it/s][A[A[A


  9%|▉         | 1344/14166 [00:00<00:02, 4357.31it/s][A[A[A


 12%|█▏        | 1761/14166 [00:00<00:02, 4296.32it/s][A[A[A


 16%|█▌        | 2201/14166 [00:00<00:02, 4325.60it/s][A[A[A


 18%|█▊        | 2598/14166 [00:00<00:02, 4211.55it/s][A[A[A


 21%|██        | 3003/14166 [00:00<00:02, 4161.20it/s][A[A[A


 24%|██▍       | 3401/14166 [00:00<00:02, 4102.14it/s][A[A[A


 27%|██▋       | 3816/14166 [00:00<00:02, 4116.16it/s][A[A[A


 30%|███       | 4279/14166 [00:01<00:02, 4255.20it/s][A[A[A


 33%|███▎      | 4736/14166 [00:01<00:02, 4341.69it/s][A[A[A


 37%|███▋      | 5189/14166 [00:01<00:02, 4394.38it/s][A[A[A


 40%|███▉      | 5625/14166 [00:01<00:01, 4284.15it/s][A[A[A


 43%|████▎     | 6060/14166 [00:01<00:01, 4303.41it/s][A[A[A


 46%|████▌     | 6500/

In [11]:
from snorkel.labeling.analysis import LFAnalysis

LFAnalysis(L=L_dev, lfs=lfs).lf_summary(Y=Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
syria,0,[1],0.098814,0.098814,0.091756,266,0,0.76
provinces_mention,1,"[0, 1]",1.0,1.0,0.227273,193,368,0.054489
regional_politics,2,"[0, 1]",1.0,1.0,0.227273,180,381,0.050819
syria_politics,3,"[0, 1]",1.0,1.0,0.227273,257,304,0.072558
syria_war,4,"[0, 1]",1.0,1.0,0.227273,320,241,0.090344


In [14]:
#let's see how well we can do with one decent LF and four bad LFs
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.labeling.model import LabelModel

majority_model = MajorityLabelVoter()
Y_pred_train = majority_model.predict(L=L_train)

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=1000, lr=0.001, log_freq=100, seed=123)

majority_acc = majority_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")
label_model_acc = label_model.score(L=L_valid, Y=Y_valid)["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Majority Vote Accuracy:   48.6%
Label Model Accuracy:     90.6%


This is neat and makes sense - the power of snorkel's label model is that weak LFs boost rather than hurt the performance of one decent LF. So let's try MNB and SVM.

In [23]:
#Did some quick testing of different approaches here - bigram counts perform best
#(though not by much)

from sklearn.feature_extraction.text import CountVectorizer

words_train = [row.text for i, row in train.iterrows()]
words_valid = [row.text for i, row in valid.iterrows()]
words_test = [row.text for i, row in test.iterrows()]

vectorizer = CountVectorizer(ngram_range=(2,2))
X_train = vectorizer.fit_transform(words_train)
X_valid = vectorizer.transform(words_valid)
X_test = vectorizer.transform(words_test)

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(X_train, Y_train)

predicted = classifier.predict(X_test)
np.mean(predicted == Y_test)            

0.9199494036863028

In [26]:
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(loss='log').fit(X_train, Y_train)

predicted = classifier.predict(X_test)
np.mean(predicted == Y_test)

0.8930249367546079