## SemEval 2019 Task 4 - Parse XML and load article metadata into dataframe

Jonathan Miller and Negar Adyaniyazdi, VCU, CMSC516, Fall 2018

In [1]:
import os
import xml.etree.cElementTree as et

import pandas as pd
import tldextract

Get file paths and column names for ground truth and article text XML files

In [2]:
# Directory layout: all data lives under DATA_PATH, raw XML dumps under raw/.
DATA_PATH = '../data/'
DATA_RAW_PATH = DATA_PATH + 'raw/'

# Ground-truth (label) files, one per split.
gt_train_path, gt_val_path = (
    DATA_RAW_PATH + 'ground-truth-training-20180831.xml',
    DATA_RAW_PATH + 'ground-truth-validation-20180831.xml',
)

# Article files, one per split.
text_train_path, text_val_path = (
    DATA_RAW_PATH + 'articles-training-20180831.xml',
    DATA_RAW_PATH + 'articles-validation-20180831.xml',
)

# Column names used for the dataframes built from each kind of file.
gt_cols = ['id', 'hyperpartisan', 'bias', 'url', 'labeled-by']
text_cols = ['id', 'published-at', 'title']

Parse ground truth XML files

In [3]:
# Parse the training ground-truth file and locate every <article> element.
tree = et.parse(gt_train_path)
root = tree.getroot()
articles = root.findall('.//article')

# One row per article, holding the five label attributes in gt_cols order.
xml_data = [
    [article.get(name) for name in ('id', 'hyperpartisan', 'bias', 'url', 'labeled-by')]
    for article in articles
]

# Build the frame and use the article id as its index.
gt_train = pd.DataFrame(xml_data, columns=gt_cols).set_index('id')

In [4]:
# Preview the first rows of the training ground truth to check the parsed attributes.
gt_train.head()

Unnamed: 0_level_0,hyperpartisan,bias,url,labeled-by
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,True,right,https://dailywire.com/news/20858/after-devos-a...,publisher
2,True,right,https://dailywire.com/news/16103/university-aw...,publisher
8,False,right-center,https://newsline.com/texas-state-university-su...,publisher
9,False,least,https://apnews.com/795c45f6db904f8eb85d72cee61...,publisher
10,True,left,https://dissentmagazine.org/article/eve-and-th...,publisher


In [5]:
# (rows, columns) of the training ground truth; 'id' is the index, so it is not counted as a column.
gt_train.shape

(800000, 4)

In [6]:
# Parse the validation ground-truth file and locate every <article> element.
tree = et.parse(gt_val_path)
root = tree.getroot()
articles = root.findall('.//article')

# One row per article, holding the five label attributes in gt_cols order.
xml_data = [
    [article.get(name) for name in ('id', 'hyperpartisan', 'bias', 'url', 'labeled-by')]
    for article in articles
]

# Build the frame and use the article id as its index.
gt_val = pd.DataFrame(xml_data, columns=gt_cols).set_index('id')

In [7]:
# Preview the first rows of the validation ground truth to check the parsed attributes.
gt_val.head()

Unnamed: 0_level_0,hyperpartisan,bias,url,labeled-by
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
17,False,left-center,http://sfgate.com/politics/article/SAN-FRANCIS...,publisher
19,True,right,http://madworldnews.com/nancy-pelosi-lies-cnn-...,publisher
23,True,right,https://thecollegefix.com/post/32204/,publisher
24,False,right-center,https://cfr.org/blog/what-wine-prices-tell-us-...,publisher
26,False,right-center,https://the-american-interest.com/2016/01/13/t...,publisher


In [8]:
# (rows, columns) of the validation ground truth; 'id' is the index, so it is not counted as a column.
gt_val.shape

(200000, 4)

Parse Article XML files

In [9]:
# Stream the training article file instead of building the whole tree: the
# file carries the articles themselves, but only three attributes of each
# <article> start tag are needed here.
context = iter(et.iterparse(text_train_path, events=('start', 'end')))
event, root = next(context)  # the first event is the start of the document root

xml_data = []
for event, elem in context:
    if elem.tag != 'article':
        continue
    if event == 'start':
        # Attributes are already populated when the start tag is reported,
        # and 'start' events arrive in document order.
        xml_data.append([elem.get('id'), elem.get('published-at'), elem.get('title')])
    else:
        # The article has been read completely: release everything parsed so
        # far so memory use does not grow with the size of the file.
        root.clear()

# Build the frame and use the article id as its index.
text_train = pd.DataFrame(xml_data, columns=text_cols).set_index('id')

In [10]:
# Preview the first rows of the training article metadata (publication date and title).
text_train.head()

Unnamed: 0_level_0,published-at,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2017-09-10,After DeVos Announced Plans To Reexamine Title...
2,2017-05-05,University To Award Trayvon Martin With Posthu...
8,2017-11-15,Texas State University suspends Greek life aft...
9,2015-06-10,Red Sox waste Rodriguez outing in 1-0 loss to ...
10,,Eve and the New Jerusalem


In [11]:
# (rows, columns) of the training article metadata; 'id' is the index, so it is not counted as a column.
text_train.shape

(800000, 2)

In [12]:
# Stream the validation article file instead of building the whole tree: the
# file carries the articles themselves, but only three attributes of each
# <article> start tag are needed here.
context = iter(et.iterparse(text_val_path, events=('start', 'end')))
event, root = next(context)  # the first event is the start of the document root

xml_data = []
for event, elem in context:
    if elem.tag != 'article':
        continue
    if event == 'start':
        # Attributes are already populated when the start tag is reported,
        # and 'start' events arrive in document order.
        xml_data.append([elem.get('id'), elem.get('published-at'), elem.get('title')])
    else:
        # The article has been read completely: release everything parsed so
        # far so memory use does not grow with the size of the file.
        root.clear()

# Build the frame and use the article id as its index.
text_val = pd.DataFrame(xml_data, columns=text_cols).set_index('id')

In [13]:
# Preview the first rows of the validation article metadata (publication date and title).
text_val.head()

Unnamed: 0_level_0,published-at,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
17,2004-07-29,SAN FRANCISCO / Head of Juvenile Probation Dep...
19,2017-10-05,"Nancy Pelosi Lies On CNN About Gun Control, An..."
23,2017-04-20,"University leaders ban pro-life flag display, ..."
24,2011-01-14,What Wine Prices Tell Us About OPEC Why Are Oi...
26,2016-01-13,The Long Road to Harvard


In [14]:
# (rows, columns) of the validation article metadata; 'id' is the index, so it is not counted as a column.
text_val.shape

(200000, 2)

Merge article and ground truth dataframes on index

In [15]:
# Join each article's metadata with its labels on the shared article-id index.
# validate='one_to_one' makes pandas raise a MergeError if an id is repeated on
# either side, instead of silently multiplying rows in the merged frame.
md_train = text_train.merge(gt_train, left_index=True, right_index=True, validate='one_to_one')
md_val = text_val.merge(gt_val, left_index=True, right_index=True, validate='one_to_one')

Examine training set dataframe and columns

In [16]:
# Summarise the merged training metadata: index range, column dtypes and non-null counts.
md_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 800000 entries, 0000001 to 1494892
Data columns (total 6 columns):
published-at     686082 non-null object
title            800000 non-null object
hyperpartisan    800000 non-null object
bias             800000 non-null object
url              800000 non-null object
labeled-by       800000 non-null object
dtypes: object(6)
memory usage: 62.7+ MB


In [17]:
# Count training articles per hyperpartisan label to check the class balance.
md_train['hyperpartisan'].value_counts()

true     400000
false    400000
Name: hyperpartisan, dtype: int64

In [18]:
# Count training articles per bias label.
md_train['bias'].value_counts()

least           221776
left            200000
right           200000
left-center     112995
right-center     65229
Name: bias, dtype: int64

In [19]:
# Count training articles per value of the labeled-by attribute.
md_train['labeled-by'].value_counts()

publisher    800000
Name: labeled-by, dtype: int64

Create a new domain column identifying the news source that published each article. The domain is extracted from the URL column with tldextract.

In [20]:
def _registered_domain(url):
    """Return the registered domain label of `url`, e.g. 'apnews' for 'https://apnews.com/...'."""
    # Use the named attribute rather than positional [1]: newer tldextract
    # releases (5.0+) return a result object that cannot be indexed.
    return tldextract.extract(url).domain


# Add the publishing domain of every article to both splits.
md_train['domain'] = md_train['url'].apply(_registered_domain)
md_val['domain'] = md_val['url'].apply(_registered_domain)

In [21]:
# Count training articles per extracted domain, most frequent first.
md_train['domain'].value_counts()

foxbusiness                  116106
abqjournal                    93853
apnews                        75725
pri                           51781
newsline                      47593
counterpunch                  42410
motherjones                   38487
truthdig                      33780
dailywire                     21538
thedailybeast                 19560
poynter                       17276
baptistnews                   13728
nbcnews                       12829
reuters                       10914
consortiumnews                 9593
washingtonblade                8265
calwatchdog                    7611
thegoldwater                   6947
natmonitor                     6556
dissentmagazine                6292
politicalillusionsexposed      5714
chicagoreporter                5316
ivn                            4967
eppc                           4652
mintpressnews                  4494
factcheck                      4485
billmoyers                     4328
circa                       

Write metadata dataframes as csv

In [22]:
DATA_PROCESSED_PATH = DATA_PATH + 'processed/'

# to_csv does not create missing directories, so make sure the target exists
# before writing (e.g. on a fresh checkout without a processed/ folder).
if not os.path.isdir(DATA_PROCESSED_PATH):
    os.makedirs(DATA_PROCESSED_PATH)

# Write both metadata frames; the article id index becomes the first CSV column.
md_train.to_csv(DATA_PROCESSED_PATH + 'md_train.csv')
md_val.to_csv(DATA_PROCESSED_PATH + 'md_val.csv')