# Neural-based propaganda detection
Propaganda is the new weapon that influences people's opinions or beliefs about a certain ideology, whether that ideology is right or wrong.

## 1. loading Data

In [27]:
import pandas as pd
from sklearn.utils import shuffle
df = pd.read_table('train.tsv')
df = shuffle(df) # randomly shuffle data entries
df

Unnamed: 0,article_id,article_title,label,sentence_text
9214,765953146,'Textbook Definition of Bias',non-propaganda,On Tuesday Horowitz performed solo before a jo...
5963,764609985,"Clinton Email IG Report Rips FBI, Comey, & Lyn...",non-propaganda,"“[Trump’s] not ever going to become president,..."
9925,705409419,﻿Vatican Theologian Sacked for Questioning “Me...,non-propaganda,Weinandy’s very point about fearfulness and la...
1825,756114837,WHO Prepares For “Worst Case” As Congo Ebola O...,non-propaganda,-Daily Mail
9078,755814432,Trump’s Plan for Iran: Put Terrorists in Charge?,non-propaganda,Whom do you consider to be the most corrupt De...
...,...,...,...,...
3895,706600938,Did Saint Francis Predict Pope Francis?,propaganda,But he had no evidence that the quotation as s...
5336,728972961,FOR THE FIRST TIME ONLINE: Archbishop Lefebvre...,non-propaganda,But still they come.
11,775448623,3-D-printed guns put carnage a click away,propaganda,"So it was stunning — but not surprising, given..."
3969,696264594,The American Jewish Historical Society Hosts D...,non-propaganda,The American Jewish Historical Society was fou...


## 2. Pre - Analysis on  data

In [28]:
raw_labels = df.label.values.tolist()
docs = df.sentence_text.values.tolist()
titles = df.article_title.values.tolist()

label_dic = {'non-propaganda':0, 'propaganda':1}

assert len(docs) == len(raw_labels) == len(titles)
labels = [label_dic[rl] for rl in raw_labels] # transfer raw labels (strings) to integer numbers
print('total data size: {}, label type num: {}'.format(len(docs), len(label_dic)))

total data size: 11464, label type num: 2


In [29]:
# take a look at some sentences in the dataset
print(docs[19])
print(titles[19])
print(labels[19])

For some, it seems, no price is too great—not even the good of the Church—to avoid the personal ignominy of being seen on the wrong side of history.
Archbishop Viganò Speaks, the Neo-Catholics Panic
0


## 3. Splitting Data

In [30]:
# split the data into train, dev and test

train_ratio, dev_ratio, test_ratio = 0.6, 0.2, 0.2
train_docs = docs[:int(len(docs)*train_ratio)]
train_labels = labels[:int(len(docs)*train_ratio)]

dev_docs = docs[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]
dev_labels = labels[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]

test_docs = docs[-int(len(docs)*(test_ratio)):]
test_labels = labels[-int(len(docs)*(test_ratio)):]

print('train size {}, dev size {}, test size {}'.format(len(train_labels), len(dev_labels), len(test_labels)))

train size 6878, dev size 2293, test size 2292


## 4. Loading the Glove Embedding
#### The Glove embedding considers every words as a single entity and creates a vector for each word. So i prefer to stick with Glove.

In [31]:
# load the glove pre-trained embedding
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

path_of_downloaded_files = "/MLOps/2.Propaganda_detection/glove.6B.300d.txt"
glove_file = datapath(path_of_downloaded_files)
word2vec_glove_file = get_tmpfile("glove.6B.300d.txt")
glove2word2vec(glove_file, word2vec_glove_file)
word_vectors = KeyedVectors.load_word2vec_format(word2vec_glove_file)

  glove2word2vec(glove_file, word2vec_glove_file)


## 5. Vectorize (OOV) Words

In [32]:
from nltk.tokenize import word_tokenize
import numpy as np

word_vec_dim = 300                    # make sure this number matches the embedding you use
oov_vec = np.random.rand(word_vec_dim) 
def vectorize_sent(word_vectors, sent):
    word_vecs = []
    for token in word_tokenize(sent): 
        if token not in word_vectors: 
            word_vecs.append(oov_vec)
        else:
            word_vecs.append(word_vectors[token].astype('float64'))
    return np.mean(word_vecs,axis=0)

vv = vectorize_sent(word_vectors, 'hello world ! this is a test sentence !')


In [33]:
# create vector representations; 
# TODO: consider to apply necessary text cleaning/normalization techniques
# TODO: consider whether to use titles information (the example below does not use titles but only sentences)


train_vecs = np.array([vectorize_sent(word_vectors, ss) for ss in train_docs])
dev_vecs = np.array([vectorize_sent(word_vectors, ss) for ss in dev_docs])


print(train_vecs.shape)

(6878, 300)


In [34]:
# define a simple MLP (multi-layer perceptron) as the classifation model
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_dim, out_dim, dp_rate):
        super(MLP, self).__init__()
        self.hidden_layer = nn.Linear(input_dim, input_dim*2)
        self.output_layer = nn.Linear(input_dim*2, out_dim)
        self.dropout = nn.Dropout(dp_rate)
        self.relu = torch.nn.ReLU()
       
    def forward(self, x_in):
        z1 = self.dropout(x_in) # output of the input layer, after dropout
        z2 = self.relu(self.hidden_layer(z1)) # output of the hidden layer
        logits = self.output_layer(z2)
        return logits

In [35]:
# build model
dropout_rate = 0.5 
model = MLP(word_vec_dim,len(label_dic),dropout_rate) 
loss_fnc = torch.nn.CrossEntropyLoss()

# hyper parameters
n_epochs = 50 # number of epoch (i.e. number of iterations)
batch_size = 32 # mini batch size
lr = 0.001 # initial learning rate

# initialize optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.9) # decays the learning rate of each parameter group by gamma every step_size epochs.

In [36]:
best_f1 = -1.
best_model = None
import copy
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

for epoch_i in range(n_epochs):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    for idx in range(0,len(train_vecs),batch_size):
        # Step 0: Get the data
        x_data = torch.tensor(train_vecs[idx:idx+batch_size], dtype=torch.float)
        if x_data.shape[0] == 0: continue
        y_target = torch.tensor(train_labels[idx:idx+batch_size], dtype=torch.int64)

        # Step 1: Clear the gradients
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = model(x_data)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()

    # after each epoch, we can test the model's performance on the dev set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        dev_data = torch.tensor(dev_vecs, dtype=torch.float)
        dev_target = torch.tensor(dev_labels, dtype=torch.int64)
        dev_prediction = model(dev_data)
        pred_labels = [np.argmax(dp.numpy()) for dp in dev_prediction]
        pre, rec, f1, _ = precision_recall_fscore_support(dev_target, pred_labels, average='macro')
        print('\n---> after epoch {} the macro-f1 on dev set is {}'.format(epoch_i, f1))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])

        # save the best model
        if f1 > best_f1:
            best_f1 = f1
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best f1',f1)

    # (optional) adjust learning rate according to the scheduler
    scheduler.step()



---> after epoch 0 the macro-f1 on dev set is 0.45486502427294584
learning rate 0.001
best model updated; new best f1 0.45486502427294584

---> after epoch 1 the macro-f1 on dev set is 0.4644324053344271
learning rate 0.001
best model updated; new best f1 0.4644324053344271

---> after epoch 2 the macro-f1 on dev set is 0.46333100967008783
learning rate 0.001

---> after epoch 3 the macro-f1 on dev set is 0.521627206856991
learning rate 0.001
best model updated; new best f1 0.521627206856991

---> after epoch 4 the macro-f1 on dev set is 0.5512225881481505
learning rate 0.001
best model updated; new best f1 0.5512225881481505

---> after epoch 5 the macro-f1 on dev set is 0.5691037558567813
learning rate 0.001
best model updated; new best f1 0.5691037558567813

---> after epoch 6 the macro-f1 on dev set is 0.5690961164035612
learning rate 0.001

---> after epoch 7 the macro-f1 on dev set is 0.5941469071378437
learning rate 0.001
best model updated; new best f1 0.5941469071378437

--->

In [37]:
# test on the test set

# load the best model weights
model.load_state_dict(best_model)
test_vecs = np.array([vectorize_sent(word_vectors, ss) for ss in test_docs])

with torch.no_grad():
    model.eval()
    test_data = torch.tensor(test_vecs, dtype=torch.float)
    test_target = torch.tensor(test_labels, dtype=torch.int64)
    test_prediction = model(test_data)
    pred_labels = [np.argmax(dp.numpy()) for dp in test_prediction]
    pre, rec, f1, _ = precision_recall_fscore_support(test_target, pred_labels, average='macro')
    print('macro-f1 on test data', f1)

macro-f1 on test data 0.6744302398301264


## SAVE YOUR TRAINED MODEL

In [38]:
import pickle

# save model and other necessary components of your model
# DO NOT include the embedding files in your submission

all_info_want_to_save = {
    'input_dim': word_vec_dim,
    'dropout_rate': dropout_rate,
    'neural_weights': best_model,
    'oov_vector': oov_vec
}
save_path = open("propoganda.pickle","wb")
pickle.dump(all_info_want_to_save, save_path)
save_path.close()