# Create Articles Data Frame

Load NELA-GT-2018 data from Harvard Dataverse website. Data files should include labels.csv and articles.tar.gz. Pull .txt files from original data source & store in data frame. Save data frame to .csv file.

In [1]:
import pandas as pd
import numpy as np
import os
import re
import tarfile
import urllib.request
import random
from io import BytesIO
import shutil

In [2]:
# download labels.csv from Harvard Dataverse
labels_df = pd.read_csv(
    "https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/ULHLCB/UYCET1"
)

In [3]:
labels_df.head()

Unnamed: 0.1,Unnamed: 0,"NewsGuard, Does not repeatedly publish false content","NewsGuard, Gathers and presents information responsibly","NewsGuard, Regularly corrects or clarifies errors","NewsGuard, Handles the difference between news and opinion responsibly","NewsGuard, Avoids deceptive headlines","NewsGuard, Website discloses ownership and financing","NewsGuard, Clearly labels advertising","NewsGuard, Reveals who's in charge, including any possible conflicts of interest","NewsGuard, Provides information about content creators",...,"Allsides, community_agree","Allsides, community_disagree","Allsides, community_label","BuzzFeed, leaning","PolitiFact, Pants on Fire!","PolitiFact, False","PolitiFact, Mostly False","PolitiFact, Half-True","PolitiFact, Mostly True","PolitiFact, True"
0,21stCenturyWire,,,,,,,,,,...,,,,left,,,,,,
1,ABC News,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,8964.0,6949.0,somewhat agree,,,,,,,
2,AMERICAblog News,,,,,,,,,,...,,,,left,,,,,,
3,Activist Post,,,,,,,,,,...,,,,left,,,,,,
4,Addicting Info,,,,,,,,,,...,,,,left,,,,,,


In [4]:
labels_df.rename({'Unnamed: 0': 'news_source'}, axis = 1, inplace = True)
labels_df.head()

Unnamed: 0,news_source,"NewsGuard, Does not repeatedly publish false content","NewsGuard, Gathers and presents information responsibly","NewsGuard, Regularly corrects or clarifies errors","NewsGuard, Handles the difference between news and opinion responsibly","NewsGuard, Avoids deceptive headlines","NewsGuard, Website discloses ownership and financing","NewsGuard, Clearly labels advertising","NewsGuard, Reveals who's in charge, including any possible conflicts of interest","NewsGuard, Provides information about content creators",...,"Allsides, community_agree","Allsides, community_disagree","Allsides, community_label","BuzzFeed, leaning","PolitiFact, Pants on Fire!","PolitiFact, False","PolitiFact, Mostly False","PolitiFact, Half-True","PolitiFact, Mostly True","PolitiFact, True"
0,21stCenturyWire,,,,,,,,,,...,,,,left,,,,,,
1,ABC News,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,8964.0,6949.0,somewhat agree,,,,,,,
2,AMERICAblog News,,,,,,,,,,...,,,,left,,,,,,
3,Activist Post,,,,,,,,,,...,,,,left,,,,,,
4,Addicting Info,,,,,,,,,,...,,,,left,,,,,,


In [5]:
"""
Define a function to create a generator object containing text file info from articles.tar.gz which can be
passed to the TarFile.extractall() method.

Note that the files follow directory structure and naming convention:
articles/<date>/<source>/<source>--<date>--<title>

Below implementation was inspired by the second example included in the tarfile module examples section:
https://docs.python.org/3/library/tarfile.html#examples
"""

def get_files(members):
    for tarinfo in members:
        name_comps = tarinfo.name.split('/')
        if len(name_comps) == 4:
            #if (name_comps[1] == "2018-03-25") & (name_comps[2] == "Daily Beast"):
            yield tarinfo

In [6]:
"""
Define a function that can take a random sample from a generator object. This will allow us to take a random
sample of the article files in the NELA-GT-2018 dataset, which includes over 720,000 articles.

Implementation taken from the following StackOverflow post:
https://stackoverflow.com/questions/12581437/python-random-sample-with-a-generator-iterable-iterator
"""

def random_sample_iter(iterable, sample_size):
    iterator = iter(iterable)
    sample_results = []
    
    # add first sample_size items from iterator to list that will contain random sample
    for i in range(sample_size):
        sample_results.append(next(iterator))
        
    # randomly shuffle the list of sample_size items
    random.shuffle(sample_results)
    
    # enumerate the iterator starting at index sample_size
    # (the first index not added to the random sample list)
    for k, v in enumerate(iterator, sample_size):
        rn = random.randint(0, k) # random integer in [0, sampleSize + m], where m = 0, ..., len(iterator)
        # NOTE: This ensures that the probability of an element in the sample_results list being replaced
        # with a randomly selected element from the iterator decreases over time.
        
        if rn < sample_size: # if the random integer r is less than the sample size...
            # replace the item in the random sample list at index r with element v from the iterator
            sample_results[rn] = v
            
    #return sample_results
    for el in sample_results:
        yield el

In [7]:
"""
Implementation adjusted from these code snippets:
https://gist.github.com/devhero/8ae2229d9ea1a59003ced4587c9cb236
https://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed/18624269
"""

# set number of articles to download
k = 500

# download articles.tar.gz from Harvard Dataverse
file_url = 'https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/ULHLCB/MWLJVN'

# connect to URL
ftpstream = urllib.request.urlopen(file_url)

# create temporary file
tmpfile = BytesIO()

while True:
    # download a piece of the file from the connection
    s = ftpstream.read(16384)

    # once the entire file has been downloaded, tarfile returns b''
    # (the empty bytes) which is a false value
    if not s:  
        break

    # otherwise, write the piece of the file to the temporary file.
    tmpfile.write(s)

# close the FTP stream
ftpstream.close()

# Now that the FTP stream has been downloaded to the temporary file,
# we can ditch the FTP stream and have the tarfile module work with
# the temporary file. Begin by seeking back to the beginning of the
# temporary file.
tmpfile.seek(0)

# open file
file = tarfile.open(fileobj = tmpfile,
                    mode = "r:gz")

# randomly select k files from articles.tar.gz
members = random_sample_iter(get_files(file), k)

# download k selected files to working directory
file.extractall(members = members)

# close file
file.close()

# close temporary file
tmpfile.close()

In [8]:
news_source = []
pub_date = []
article_title = []
article_text = []

rootDir = 'articles'
for dirName, subdirList, fileList in os.walk(rootDir):
    for fName in fileList:
        art_info = fName.split('--')
        news_source.append(art_info[0])
        pub_date.append(art_info[1])
        article_title.append(art_info[2])
        
        currFile = dirName + '/' + fName
        
        with open(currFile, 'r') as f:
            currText = f.read()
        
        article_text.append(currText)

In [9]:
articles_df = pd.DataFrame(
    list(zip(news_source,
             pub_date,
             article_title,
             article_text)),
    columns = ['news_source', 'pub_date', 'title', 'text']
)

In [10]:
articles_df.head()

Unnamed: 0,news_source,pub_date,title,text
0,The Gateway Pundit,2018-07-14,REPORT House Conservatives Prepare to Impeach ...,House GOP lawmakers are preparing to push to i...
1,oann,2018-03-24,French policeman who took place of hostage die...,PARIS (Reuters) A gendarme who was shot three...
2,New York Daily News,2018-03-24,Attorney for Roy Moore accuser was offered 10G...,"An attorney for Leigh Corfman, a woman who acc..."
3,Sputnik,2018-03-23,Martin Vizcarra is New Peruvian President Afte...,Martin Vizcarra is sworn in as Peruvian presid...
4,oann,2018-04-02,Oil falls 2 percent on Russia output rise pote...,NEW YORK (Reuters) Oil fell by more than 2 pe...


In [None]:
articles_df.shape

In [11]:
articles_df[(articles_df['news_source'] != "") & (articles_df['title'] != "") & (articles_df['text'] != "")].shape

(500, 4)

In [21]:
articles_df = articles_df[(articles_df['news_source'] != "") & (articles_df['title'] != "") & (articles_df['text'] != "")]
articles_df.reset_index(drop = True, inplace = True)

(498, 4)

In [22]:
# save data frame of 498 articles to csv
articles_df.to_csv('articles_df.csv')

In [25]:
# remove article files from working directory
shutil.rmtree('./articles')