## SemEval 2019 Task 4 - Parse XML and load article metadata into dataframe

Jonathan Miller and Negar Adyaniyazdi, VCU, CMSC516, Fall 2018

In [1]:
# xml.etree.cElementTree was removed in Python 3.9. Keep using it where it still
# exists and otherwise fall back to xml.etree.ElementTree, which exposes the same
# parse/iterparse API under the same alias.
try:
    import xml.etree.cElementTree as et
except ImportError:
    import xml.etree.ElementTree as et
import pandas as pd

Get file paths and column names for ground truth and article text XML files

In [2]:
# Data folder (relative to the notebook) and the sub-folder holding the raw XML files.
DATA_PATH = '../data/'
DATA_RAW_PATH = DATA_PATH + 'raw/'

# Ground-truth XML: attribute names to extract, then the training/validation files.
gt_cols = ['id', 'hyperpartisan', 'bias', 'url', 'labeled-by']
gt_train_path, gt_val_path = (
    DATA_RAW_PATH + 'ground-truth-training-20180831.xml',
    DATA_RAW_PATH + 'ground-truth-validation-20180831.xml',
)

# Article XML: attribute names to extract, then the training/validation files.
text_cols = ['id', 'published-at', 'title']
text_train_path, text_val_path = (
    DATA_RAW_PATH + 'articles-training-20180831.xml',
    DATA_RAW_PATH + 'articles-validation-20180831.xml',
)

Parse ground truth XML files

In [24]:
# Parse the training ground-truth file and collect every <article> element below the root.
tree = et.parse(gt_train_path)
root = tree.getroot()
articles = root.findall('.//article')

# One record per article; the attributes are listed in the same order as gt_cols
# so that values and column labels line up.
xml_data = [
    [
        article.get('id'),
        article.get('hyperpartisan'),
        article.get('bias'),
        article.get('url'),
        article.get('labeled-by'),
    ]
    for article in articles
]

# Build the frame and cast the id column to str in one step.
gt_train = pd.DataFrame(xml_data, columns=gt_cols).astype({'id': str})

In [25]:
# Preview the first rows of the training ground truth to eyeball the parsed attributes.
gt_train.head()

Unnamed: 0,id,hyperpartisan,bias,url,labeled-by
0,1,True,right,https://dailywire.com/news/20858/after-devos-a...,publisher
1,2,True,right,https://dailywire.com/news/16103/university-aw...,publisher
2,8,False,right-center,https://newsline.com/texas-state-university-su...,publisher
3,9,False,least,https://apnews.com/795c45f6db904f8eb85d72cee61...,publisher
4,10,True,left,https://dissentmagazine.org/article/eve-and-th...,publisher


In [26]:
# (rows, columns): one row per <article> element found, one column per name in gt_cols.
gt_train.shape

(800000, 5)

In [27]:
# Parse the validation ground-truth file and collect every <article> element below the root.
tree = et.parse(gt_val_path)
root = tree.getroot()
articles = root.findall('.//article')

# One record per article; the attributes are listed in the same order as gt_cols
# so that values and column labels line up.
xml_data = [
    [
        article.get('id'),
        article.get('hyperpartisan'),
        article.get('bias'),
        article.get('url'),
        article.get('labeled-by'),
    ]
    for article in articles
]

# Build the frame and cast the id column to str in one step.
gt_val = pd.DataFrame(xml_data, columns=gt_cols).astype({'id': str})

In [28]:
# Preview the first rows of the validation ground truth to eyeball the parsed attributes.
gt_val.head()

Unnamed: 0,id,hyperpartisan,bias,url,labeled-by
0,17,False,left-center,http://sfgate.com/politics/article/SAN-FRANCIS...,publisher
1,19,True,right,http://madworldnews.com/nancy-pelosi-lies-cnn-...,publisher
2,23,True,right,https://thecollegefix.com/post/32204/,publisher
3,24,False,right-center,https://cfr.org/blog/what-wine-prices-tell-us-...,publisher
4,26,False,right-center,https://the-american-interest.com/2016/01/13/t...,publisher


In [29]:
# (rows, columns): one row per <article> element found, one column per name in gt_cols.
gt_val.shape

(200000, 5)

Parse Article XML files

In [30]:
# Stream the training articles file instead of loading the whole tree with et.parse():
# only the attributes named in text_cols are needed from each <article>, and the file
# presumably also carries every article's body text (TODO confirm), so each element is
# emptied as soon as its closing tag has been processed to keep memory use low.
xml_data = []
for event, element in et.iterparse(text_train_path, events=('end',)):
    if element.tag == 'article':
        # Read the attributes before clear(), which wipes them along with the children.
        # An attribute that is absent on the element yields None.
        xml_data.append([element.get(col) for col in text_cols])
    # Elements nested inside an article are never read, so drop their content too.
    element.clear()

text_train = pd.DataFrame(xml_data, columns=text_cols)
text_train['id'] = text_train['id'].astype(str)

In [31]:
# Preview the first rows of the training article metadata (published-at can be missing).
text_train.head()

Unnamed: 0,id,published-at,title
0,1,2017-09-10,After DeVos Announced Plans To Reexamine Title...
1,2,2017-05-05,University To Award Trayvon Martin With Posthu...
2,8,2017-11-15,Texas State University suspends Greek life aft...
3,9,2015-06-10,Red Sox waste Rodriguez outing in 1-0 loss to ...
4,10,,Eve and the New Jerusalem


In [32]:
# (rows, columns): one row per <article> element found, one column per name in text_cols.
text_train.shape

(800000, 3)

In [33]:
# Stream the validation articles file instead of loading the whole tree with et.parse():
# only the attributes named in text_cols are needed from each <article>, and the file
# presumably also carries every article's body text (TODO confirm), so each element is
# emptied as soon as its closing tag has been processed to keep memory use low.
xml_data = []
for event, element in et.iterparse(text_val_path, events=('end',)):
    if element.tag == 'article':
        # Read the attributes before clear(), which wipes them along with the children.
        # An attribute that is absent on the element yields None.
        xml_data.append([element.get(col) for col in text_cols])
    # Elements nested inside an article are never read, so drop their content too.
    element.clear()

text_val = pd.DataFrame(xml_data, columns=text_cols)
text_val['id'] = text_val['id'].astype(str)

In [34]:
# Preview the first rows of the validation article metadata.
text_val.head()

Unnamed: 0,id,published-at,title
0,17,2004-07-29,SAN FRANCISCO / Head of Juvenile Probation Dep...
1,19,2017-10-05,"Nancy Pelosi Lies On CNN About Gun Control, An..."
2,23,2017-04-20,"University leaders ban pro-life flag display, ..."
3,24,2011-01-14,What Wine Prices Tell Us About OPEC Why Are Oi...
4,26,2016-01-13,The Long Road to Harvard


In [35]:
# (rows, columns): one row per <article> element found, one column per name in text_cols.
text_val.shape

(200000, 3)

Merge article and ground truth dataframes on the `id` column

In [44]:
# Join each article's metadata to its ground-truth labels on the shared 'id' column.
# validate='one_to_one' makes pandas raise MergeError when an id is repeated on either
# side, which would otherwise silently multiply rows in the result.
md_train = text_train.merge(gt_train, on='id', validate='one_to_one')
md_val = text_val.merge(gt_val, on='id', validate='one_to_one')

# merge() defaults to an inner join, so an id present on only one side is dropped
# without warning. Fail here rather than export a quietly shrunken dataset.
assert len(md_train) == len(text_train) == len(gt_train), \
    'training articles and ground truth do not match one-to-one on id'
assert len(md_val) == len(text_val) == len(gt_val), \
    'validation articles and ground truth do not match one-to-one on id'

Examine training set dataframe and columns

In [45]:
# Print each column's dtype and non-null count for the merged training metadata.
md_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 0 to 799999
Data columns (total 7 columns):
id               800000 non-null object
published-at     686082 non-null object
title            800000 non-null object
hyperpartisan    800000 non-null object
bias             800000 non-null object
url              800000 non-null object
labeled-by       800000 non-null object
dtypes: object(7)
memory usage: 48.8+ MB


In [46]:
# Count how many training articles carry each value of the hyperpartisan label.
md_train['hyperpartisan'].value_counts()

true     400000
false    400000
Name: hyperpartisan, dtype: int64

In [47]:
# Count how many training articles fall into each bias category.
md_train['bias'].value_counts()

least           221776
left            200000
right           200000
left-center     112995
right-center     65229
Name: bias, dtype: int64

In [48]:
# Count the distinct values of the labeled-by attribute across the training articles.
md_train['labeled-by'].value_counts()

publisher    800000
Name: labeled-by, dtype: int64

Write metadata dataframes as csv

In [50]:
# Folder that receives the merged metadata tables.
DATA_INTERIM_PATH = DATA_PATH + 'interim/'

# Resolve both destinations first, then export; index=False keeps the row index
# out of the CSV files.
md_train_csv_path = DATA_INTERIM_PATH + 'md_train.csv'
md_val_csv_path = DATA_INTERIM_PATH + 'md_val.csv'

md_train.to_csv(md_train_csv_path, index=False)
md_val.to_csv(md_val_csv_path, index=False)

In [49]:
# Preview the merged training metadata: article columns first, then the ground-truth columns.
md_train.head()

Unnamed: 0,id,published-at,title,hyperpartisan,bias,url,labeled-by
0,1,2017-09-10,After DeVos Announced Plans To Reexamine Title...,True,right,https://dailywire.com/news/20858/after-devos-a...,publisher
1,2,2017-05-05,University To Award Trayvon Martin With Posthu...,True,right,https://dailywire.com/news/16103/university-aw...,publisher
2,8,2017-11-15,Texas State University suspends Greek life aft...,False,right-center,https://newsline.com/texas-state-university-su...,publisher
3,9,2015-06-10,Red Sox waste Rodriguez outing in 1-0 loss to ...,False,least,https://apnews.com/795c45f6db904f8eb85d72cee61...,publisher
4,10,,Eve and the New Jerusalem,True,left,https://dissentmagazine.org/article/eve-and-th...,publisher
