In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import codecs
import glob
import numpy as np
import pandas as pd
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.model_selection import train_test_split

In [None]:
## Update the project directory here before running the notebook.

# Root folder of the project on the mounted Drive; every other path is derived from it.
project_dir = "/content/drive/MyDrive/project_5_data"

# Folder that holds the article directories and the label files.
data_dir = os.path.join(project_dir, "datasets")

# Location of the scorer directory and of its list of technique names.
scorer_path = os.path.join(project_dir, "propaganda-techniques-scorer")
propaganda_techniques_names = os.path.join(
    scorer_path,
    "data/propaganda-techniques-names-semeval2020task11.txt",
)

In [None]:





def read_articles(folder_name, file_extension="*.txt"):
    '''Load every matching file of a folder into a dict keyed by article id.

    The id is the file's base name up to its first dot with the first seven
    characters dropped (e.g. ``article123.txt`` -> ``"123"``). Files are
    decoded as UTF-8 and visited in sorted path order.
    '''
    pattern = os.path.join(folder_name, file_extension)
    contents_by_id = {}
    for path in sorted(glob.glob(pattern)):
        stem = os.path.basename(path).split(".")[0]
        with codecs.open(path, "r", encoding="utf8") as handle:
            contents_by_id[stem[7:]] = handle.read()
    return contents_by_id




def read_span_prediction(filename):
    '''Read tab-separated span annotations from a labels file.

    Every non-blank line must contain exactly four tab-separated fields, in
    this order: article id, label, span start, span end. Blank lines (for
    example a trailing empty line) are skipped instead of aborting the read.

    Returns:
        Four parallel lists ``(article_ids, starts, ends, labels)``. All
        values are returned as the strings found in the file.

    Raises:
        ValueError: if a non-blank line does not split into four fields.
    '''
    articles_id, start_span, end_span, golden_labels = [], [], [], []
    # Explicit UTF-8 so parsing does not depend on the platform default
    # encoding (read_articles decodes the article files the same way).
    with open(filename, "r", encoding="utf8") as f:
        for row in f:
            row = row.rstrip()
            if not row:
                continue
            art_id, golden_label, start, end = row.split("\t")
            articles_id.append(art_id)
            start_span.append(start)
            end_span.append(end)
            golden_labels.append(golden_label)

    return articles_id, start_span, end_span, golden_labels

def data_loader(data_folder, data_labels):
    '''Read the articles in ``data_folder`` and the span annotations in ``data_labels``.

    Returns the article dict followed by the four values produced by
    read_span_prediction: article ids, span starts, span ends and labels.
    '''
    articles = read_articles(data_folder)
    span_fields = read_span_prediction(data_labels)
    return (articles, *span_fields)



# def clear(text):
#     return text.strip().replace('\t', ' ').replace('\n', ' ')
def remove(text):
    '''Trim surrounding whitespace, then turn every tab and newline into a space.'''
    trimmed = text.strip()
    without_newlines = trimmed.replace('\n', ' ')
    return without_newlines.replace('\t', ' ')

def tok_bound(text):
    '''Return the start offset of every sentence in ``text`` plus a closing sentinel.

    Sentence spans come from a default-constructed PunktSentenceTokenizer.
    The final element is a sentinel that lies at or beyond the end of the
    text, so a caller looking for "the first boundary at or after position p"
    finds one for any p inside the text.

    Returns:
        numpy array of sentence start offsets followed by the sentinel.
    '''
    starts = [start for start, _ in PunktSentenceTokenizer().span_tokenize(text)]
    # The sentinel used to be a fixed 100000, which left spans past character
    # 100000 without a closing boundary. 100000 is kept as a floor so results
    # for shorter texts are unchanged.
    starts.append(max(len(text), 100000))
    return np.array(starts)



def context(article, start_span, end_span):
    '''Return the cleaned text between the sentence boundaries enclosing a span.

    The left edge is the last boundary at or before ``start_span``; the right
    edge is the first boundary at or after ``end_span``. The slice is passed
    through remove() before being returned.
    '''
    boundaries = tok_bound(article)
    at_or_before_start = np.where(boundaries <= start_span)[0]
    at_or_after_end = np.where(boundaries >= end_span)[0]
    left = boundaries[at_or_before_start[-1]]
    right = boundaries[at_or_after_end[0]]
    return remove(article[left:right])


def create_dataframe(articles, art_id, start_spans, end_spans, golden_labels):
    '''Build one row per annotated span with its text and sentence context.

    ``articles`` maps article id to full text; the other arguments are
    parallel sequences. Span offsets are converted to integers. The returned
    frame has the columns article_id, start_span, end_span, data_span,
    data_context and label.
    '''
    def _context_of(row):
        return context(row['article'], row['start_span'], row['end_span'])

    def _span_of(row):
        return remove(row['article'][row['start_span']:row['end_span']])

    columns = {
        'article_id': art_id,
        'article': [articles[key] for key in art_id],
        'start_span': np.array(start_spans).astype(int),
        'end_span': np.array(end_spans).astype(int),
        'label': golden_labels,
    }
    frame = pd.DataFrame.from_dict(columns)

    frame['data_context'] = frame.apply(_context_of, axis=1)
    frame['data_span'] = frame.apply(_span_of, axis=1)

    output_columns = ['article_id', 'start_span', 'end_span', 'data_span', 'data_context', 'label']
    return frame[output_columns]


def balance_dataset(dataset, random_state=None):
    '''Oversample so every label occurs as often as the most frequent one.

    All original rows are kept. For each label with fewer rows than the
    largest label group, the missing number of rows is drawn with replacement
    from that group and appended after the original rows.

    Args:
        dataset: DataFrame with a 'label' column.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            oversampling can be reproduced. The default (None) keeps the
            previous unseeded behaviour.

    Returns:
        A new DataFrame. Index values are carried over from ``dataset``, so
        oversampled rows repeat existing index labels.
    '''
    parts = [dataset]
    max_cnt = dataset['label'].value_counts().max()
    for _, group in dataset.groupby('label'):
        missing = max_cnt - len(group)
        # Groups that are already at the maximum contribute nothing extra.
        if missing > 0:
            parts.append(group.sample(missing, replace=True, random_state=random_state))
    return pd.concat(parts)








In [None]:
def create_train_dev_files(articles, art_id, start_spans, end_spans, golden_labels, train_file, dev_file,
                     split_by_ids=False, dev_size=0.3, random_state=40, balance=False, shuffle=True):
    '''Build the span dataframe, split it into train/dev and write both as TSV.

    Args:
        articles, art_id, start_spans, end_spans, golden_labels: passed
            unchanged to create_dataframe.
        train_file, dev_file: output paths; written tab-separated without
            the index column.
        split_by_ids: if True, split on unique article ids so all spans of an
            article land on the same side; otherwise split individual rows.
        dev_size: ``test_size`` given to train_test_split.
        random_state: seed for the split and for the shuffle of the train part.
        balance: if True, oversample the train part with balance_dataset.
        shuffle: if True, shuffle the train part and reset its index.
    '''
    dataframe = create_dataframe(articles, art_id, start_spans, end_spans, golden_labels)
    if split_by_ids:
        train_ids, dev_ids = train_test_split(dataframe.article_id.unique(), test_size=dev_size, random_state=random_state)
        train_data = dataframe[dataframe.article_id.isin(train_ids)]
        dev_data = dataframe[dataframe.article_id.isin(dev_ids)]
    else:
        train_data, dev_data = train_test_split(dataframe, test_size=dev_size, random_state=random_state)

    if balance:
        # Previously called balance_dataset(train); `train` is not defined in
        # this function, so balance=True raised NameError.
        train_data = balance_dataset(train_data)
    if shuffle:
        # Seeded with random_state so the written file is reproducible, like the split.
        train_data = train_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    train_data.to_csv(train_file, sep='\t', index=False)
    dev_data.to_csv(dev_file, sep='\t', index=False)

    

    

In [None]:
def create_test_file(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, test_file):
    '''Build the span dataframe for a test set and write it to ``test_file``.

    The file is tab-separated and written without the index column.
    '''
    create_dataframe(
        articles, ref_articles_id, ref_span_starts, ref_span_ends, labels
    ).to_csv(test_file, sep='\t', index=False)

In [None]:
# Locations of the training articles and of their span/technique label file.
train_folder=os.path.join(data_dir, "train-articles")
train_labels=os.path.join(data_dir, "train-task-flc-tc.labels")

# Load the article texts plus the parallel lists of article ids, span offsets and labels.
articles, ref_articles_id, ref_span_starts, ref_span_ends, labels=data_loader(train_folder, train_labels)

In [None]:
# Locations of the development articles and of their label file.
dev_folder=os.path.join(data_dir, "dev-articles")
dev_labels=os.path.join(data_dir, "dev-task-flc-tc.labels")

# NOTE(review): `nlp` is not referenced anywhere else in the visible notebook — confirm it is still needed.
import spacy
nlp = spacy.blank("en") # create a blank English spaCy pipeline

In [None]:
# Load the development articles and their span annotations (same layout as the training load).
articles_dev, ref_articles_id_dev, ref_span_starts_dev, ref_span_ends_dev, labels_dev=data_loader(dev_folder, dev_labels)

In [None]:
import pandas as pd

def dev_dataframe(articles_dev, ref_articles_id_dev, ref_span_starts_dev, ref_span_ends_dev):
  '''Build a frame with one row per development span.

  Args:
    articles_dev: unused; kept so existing calls keep working.
    ref_articles_id_dev: article id of every span.
    ref_span_starts_dev: start offset of every span (stored as given).
    ref_span_ends_dev: end offset of every span (stored as given).

  Returns:
    DataFrame with the columns art_id, starts, ends and text, where text is
    a NaN placeholder to be filled in later.

  Raises:
    ValueError: if the three sequences differ in length.
  '''
  art_ids = list(ref_articles_id_dev)
  starts = list(ref_span_starts_dev)
  ends = list(ref_span_ends_dev)
  if not (len(art_ids) == len(starts) == len(ends)):
    raise ValueError("article ids, span starts and span ends must have the same length")

  # Build the frame in one step instead of pre-creating columns on an empty
  # frame and copying the lists element by element.
  dev_df = pd.DataFrame(
      {'art_id': art_ids, 'starts': starts, 'ends': ends},
      columns=['art_id', 'starts', 'ends'],
  )
  dev_df['text'] = float('nan')
  return dev_df

# One row per development span; the 'text' column is filled in a later cell.
dev_df=dev_dataframe(articles_dev, ref_articles_id_dev, ref_span_starts_dev, ref_span_ends_dev)
print(dev_df)
# dev_df=pd.DataFrame()
# cols=['art_id','starts','ends','text']

# dev_df[cols]=cols
# art_ids=[]
# starts=[]
# ends=[]
# for i,id in enumerate(ref_articles_id_dev):
#   # text=articles_dev[id]
#   art_ids.append(id)
#   starts.append(ref_span_starts_dev[i])
#   ends.append(ref_span_ends_dev[i])
# dev_df['art_id']=art_ids
# dev_df['starts']=starts
# dev_df['ends']=ends
# print(dev_df)



  




         art_id starts  ends text
0     730093263    123   128  NaN
1     730093263    352   357  NaN
2     730093263   1370  1393  NaN
3     730093263   2434  2439  NaN
4     730093263   2699  2807  NaN
...         ...    ...   ...  ...
1058  999001419   4828  4851  NaN
1059  999001419    383   397  NaN
1060  999001419   1244  1261  NaN
1061  999001419   1319  1334  NaN
1062  999001419   3641  3657  NaN

[1063 rows x 4 columns]


In [None]:
def get_text_from_span(articles_dev,art_ids,starts,ends):
  '''Cut the annotated span out of each article.

  Args:
    articles_dev: mapping from article id to full article text.
    art_ids: article id of every span.
    starts: start offset of every span (anything ``int()`` accepts).
    ends: end offset of every span (anything ``int()`` accepts).

  Returns:
    List with ``article[start:end]`` for every span, in input order.

  Raises:
    ValueError: if the three sequences differ in length.
    KeyError: if an article id is missing from ``articles_dev``.
  '''
  # Materialise the inputs so pandas Series are walked by position; indexing a
  # Series with the loop counter is label-based and breaks on a non-default index.
  art_ids, starts, ends = list(art_ids), list(starts), list(ends)
  if not (len(art_ids) == len(starts) == len(ends)):
    raise ValueError("art_ids, starts and ends must have the same length")
  return [
      articles_dev[art_id][int(start):int(end)]
      for art_id, start, end in zip(art_ids, starts, ends)
  ]


    


In [None]:
# Span text for every row of dev_df, in row order.
sents=get_text_from_span(articles_dev,dev_df['art_id'],dev_df['starts'],dev_df['ends'])

In [None]:
# Replace the placeholder 'text' column with the extracted span text.
dev_df['text']=sents

In [None]:
dev_df.head()


Unnamed: 0,art_id,technique,starts,ends,text
0,730093263,Appeal_to_Authority,123,128,white
1,730093263,Loaded_Language,352,357,black
2,730093263,Doubt,1370,1393,“true American heroes.”
3,730093263,Loaded_Language,2434,2439,black
4,730093263,"Bandwagon,Reductio_ad_hitlerum",2699,2807,"If these two men had survived, and Quentin Lam..."


In [None]:

# Split the training annotations row-wise 70/30 and write them to train_tc.tsv and dev_tc.tsv
# in the current working directory.
create_train_dev_files(articles, ref_articles_id, ref_span_starts, ref_span_ends, labels, 'train_tc.tsv', 'dev_tc.tsv',
                     split_by_ids=False, dev_size=0.3, random_state=40, balance=False, shuffle=True)

In [None]:
# Reload the training split that was just written; `df` is the training frame used below.
df=pd.read_csv('train_tc.tsv',sep='\t')

In [None]:
df.head(10)

Unnamed: 0,article_id,start_span,end_span,data_span,data_context,label
0,752287274,7552,7607,if I ever call the police again [he] will do h...,[Imran] threatened that he is very powerful an...,Appeal_to_fear-prejudice
1,757843275,3263,3286,this year’s flu vaccine,"For their sake, let’s hope it works better tha...","Whataboutism,Straw_Men,Red_Herring"
2,789370909,6861,6881,the Kavanaugh haters,Ford’s sexual assault allegation could be read...,"Name_Calling,Labeling"
3,721890296,2456,2567,The reason why people are in your country is n...,“The reason why people are in your country is ...,Black-and-White_Fallacy
4,766632016,455,470,massacring Jews,"In 1013, Berbers from North Africa entered Spa...",Loaded_Language
5,111111133,1070,1077,insults,Trump often lobs insults at journalists and en...,Loaded_Language
6,765385479,6205,6225,audaciously revealed,Katrin Axelsson and Lisa Longstaff of Women Ag...,Loaded_Language
7,776049384,2165,2173,Molester,"#bromance And talk him up, McCarrick the Moles...",Repetition
8,769962328,3274,3311,"counterfeits, self-proclaimed leaders",I told her that there are too many counterfeit...,"Name_Calling,Labeling"
9,776345502,3911,3965,"one of the most successful in history, is a TO...","‘Russian Collusion with the Trump Campaign, on...","Exaggeration,Minimisation"


In [None]:
# Collect the span texts and measure the longest one in characters;
# `maxl` is reused later as the padded sequence length.
g = list(df['data_span'])
maxl = max(len(span) for span in g)
print ('Maximum sequence length in the list of sentences:', maxl)

Maximum sequence length in the list of sentences: 577


In [None]:

from keras.preprocessing.text import Tokenizer
# Word-level tokenizer fitted on the training spans. num_words matches the
# input dimension given to the Embedding layer further down.
# The filter list is a raw string: '\]' is not a valid escape sequence in a
# normal literal and triggers a warning on recent Python versions. The runtime
# value (backslash followed by ']') is unchanged.
tokenizer = Tokenizer(num_words=50000, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['data_span'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6922 unique tokens.


In [None]:
from keras.preprocessing.sequence import pad_sequences

# Integer-encode every span and pad all sequences to the common length maxl.
X = tokenizer.texts_to_sequences(df['data_span'].values)
X = pad_sequences(X, maxlen=maxl)

# One-hot encode the labels, one column per distinct label.
# The former `columns=` argument only applies to DataFrame input and has been
# dropped. dtype is pinned to uint8 (the dtype shown in the recorded output)
# because newer pandas versions default to bool.
Y = pd.get_dummies(df['label'], dtype=np.uint8).values


In [None]:
# Hold out 10% of the encoded training spans for the evaluation cell below.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.1, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)



(3860, 577) (3860, 14)
(429, 577) (429, 14)


In [None]:
Y[0]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

# Baseline classifier: embedding -> spatial dropout -> single LSTM -> softmax.
model=Sequential()
# 50000 matches the tokenizer's num_words; input length is the padded length maxl.
model.add(Embedding(50000,100,input_length=maxl))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per one-hot label column instead of a hard-coded 14, so the
# model still matches Y if the set of labels in the training file changes.
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 577, 100)          5000000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 577, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 14)                1414      
                                                                 
Total params: 5,081,814
Trainable params: 5,081,814
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for up to 15 epochs, validating on 10% of X_train; stop once val_loss
# has not improved by at least 0.0001 for 3 consecutive epochs.
history = model.fit(X_train, Y_train, epochs=15, batch_size=64,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15


In [None]:
# Despite its name, `accuracy` holds both values returned by evaluate():
# index 0 is printed as the loss and index 1 as the accuracy.
accuracy = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accuracy[0],accuracy[1]))

Test set
  Loss: 2.480
  Accuracy: 0.492


In [None]:
# Distinct technique labels found in the training file.
# Sorted so the listing is the same on every run; iteration order of a set of
# strings is arbitrary and can change between kernel sessions.
# NOTE(review): this rebinds `labels`, which earlier held the per-span training
# labels returned by data_loader — re-running earlier cells afterwards would
# pick up this list instead.
labels = sorted(set(df['label']))
print(labels)
print(len(labels))



['Thought-terminating_Cliches', 'Flag-Waving', 'Whataboutism,Straw_Men,Red_Herring', 'Causal_Oversimplification', 'Appeal_to_fear-prejudice', 'Exaggeration,Minimisation', 'Bandwagon,Reductio_ad_hitlerum', 'Doubt', 'Appeal_to_Authority', 'Loaded_Language', 'Slogans', 'Black-and-White_Fallacy', 'Repetition', 'Name_Calling,Labeling']
14


In [None]:
# new_complaint = ["no government operation ever goes perfectly	But no government operation ever goes perfectly."]

def get_pred(text):
  '''Predict the technique label for a single text span.

  The text is encoded with the fitted ``tokenizer``, padded to ``maxl`` and
  passed to ``model``; the label belonging to the highest-scoring output
  unit is returned.
  '''
  seq = tokenizer.texts_to_sequences([text])
  padded = pad_sequences(seq, maxlen=maxl)
  pred = model.predict(padded)
  # The training targets were built with pd.get_dummies(df['label']), so output
  # unit i corresponds to dummy column i. Reading the names from the same call
  # guarantees the mapping; list(set(...)) has an arbitrary order and attached
  # the wrong technique name to the prediction.
  labels = pd.get_dummies(df['label']).columns
  return labels[np.argmax(pred)]

In [None]:
def create_prediction_list(sents):
  '''Return the predicted label for every text in ``sents``, in input order.'''
  return [get_pred(sentence) for sentence in sents]

# Predicted technique for every development span, aligned with the rows of dev_df.
output=create_prediction_list(sents)

  



In [None]:
## Attach the predicted technique to every development span; the span text is no longer needed.

# errors='ignore' keeps this cell re-runnable: a second run used to fail with
# KeyError because 'text' had already been dropped.
dev_df=dev_df.drop(columns=['text'], errors='ignore')
dev_df['technique']=output

In [None]:
# Put the columns in the order article id, technique, start, end before writing the file.
dev_df = dev_df.reindex(['art_id','technique','starts','ends'], axis=1)


In [None]:
# Write the predictions as a tab-separated file without header or index column.
# The path is relative to the working directory; main() below reads /content/output_tc_baseline.tsv.
dev_df.to_csv('output_tc_baseline.tsv', sep='\t', index=False,header=False)


In [None]:
# sys.path.insert(0,"/content")

In [None]:




import sys
import argparse
import logging.handlers
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import src.annotation as an
import src.annotations as ans
import src.propaganda_techniques as pt

# Logger used by the scoring code below, with a stdout handler and a timestamped format.
# NOTE(review): `ch` is configured here but never attached to `logger` in the
# visible code (the addHandler call inside main() is commented out) — confirm
# whether the INFO messages are meant to be shown.
logger = logging.getLogger("propaganda_scorer")
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.setLevel(logging.INFO)


def main(user_submission_file="/content/output_tc_baseline.tsv",
         gold_file="/content/drive/MyDrive/project_5_data/datasets/dev-task-flc-tc.labels",
         propaganda_techniques_list_file=None):
    '''Score a technique-classification submission against gold annotations.

    Both files are loaded as annotation lists and the spans of every article
    are sorted. The submission is then compared with the gold file; if either
    comparison fails, an error is logged and the function exits through
    ``sys.exit()``. Otherwise the two strings returned by
    ``TC_score_to_string`` are printed.

    Args:
        user_submission_file: path of the prediction file to score.
        gold_file: path of the gold label file.
        propaganda_techniques_list_file: path of the technique-name list;
            when None, the notebook-level ``propaganda_techniques_names``
            is used.

    Calling ``main()`` without arguments behaves as before.
    '''
    if propaganda_techniques_list_file is None:
        propaganda_techniques_list_file = propaganda_techniques_names

    propaganda_techniques = pt.Propaganda_Techniques(propaganda_techniques_list_file)
    an.Annotation.set_propaganda_technique_list_obj(propaganda_techniques)

    user_annotations = ans.Annotations()
    user_annotations.load_annotation_list_from_file(user_submission_file)
    for article in user_annotations.get_article_id_list():
        user_annotations.get_article_annotations_obj(article).sort_spans()

    gold_annotations = ans.Annotations()
    gold_annotations.load_annotation_list_from_file(gold_file)
    for article in gold_annotations.get_article_id_list():
        gold_annotations.get_article_annotations_obj(article).sort_spans()

    logger.info("Checking format: User Predictions -- Gold Annotations")
    if not user_annotations.compare_annotations_identical_article_lists(gold_annotations) or not user_annotations.compare_annotations_identical(gold_annotations):
        logger.error("wrong format, no scoring will be performed")
        sys.exit()
    logger.info("OK: submission file format appears to be correct")
    # The second argument was the scorer's output_for_script flag in the
    # command-line version; it is fixed to True here — TODO confirm its effect.
    res_for_output, res_for_script = user_annotations.TC_score_to_string(gold_annotations, True)

    logger.info("Scoring submission" + res_for_output)
    print(res_for_script)
    print(res_for_output)


# Notebook entry point: run the scorer with the settings defined in main().
# The argparse command-line interface of the original scorer script is not used here.
if __name__ == "__main__":

    main()
