### Data Processing

<small>Written by: Ali Tobah - tobah@umich.edu</small>

In [1]:
import pandas as pd
import os
import re
import altair as alt

In [3]:
# Uncomment if needed to unzip the data
# Should have the zip file uploaded first
#!unzip 'RadaMihalcea-fakeNewsDatasets.zip'

In [4]:
dirPath = 'data/raw/fake_real'
dirPathSubList = ['/fake_news_dataset/fake',
               '/fake_news_dataset/legit',
               '/celebrity_dataset/fake',
               '/celebrity_dataset/legit']

columnLabels = ['Text', 'Domain', 'Label']
allArticlesList = []
for eaDir in dirPathSubList:
    eaDir = dirPath + eaDir
    allDirsContents = []
    for eaFile in os.listdir(eaDir):
        with open(os.path.join(eaDir, eaFile), 'r') as currFile:
            # File contents, including a title if there is one
            fileText = currFile.read()

            # Get the domain: 'celebrity' or first part
            # of the file name
            if eaDir.split('/')[3] == 'celebrity_dataset':
                fileDomain = 'celebrity'
            else:
                fileDomain = re.split(r'(\d+)', eaFile)[0]

            # Label, whether it is fake or legit (real)
            fileLabel = eaDir.split('/')[-1]

            # Compile list of directory contents: Text plus attributes
            allDirsContents.append([fileText, fileDomain, fileLabel])
    
    # Compile list of all articles with attributes
    allArticlesList.extend(allDirsContents)

# Create a dataframe and save in a file
articlesDF = pd.DataFrame(allArticlesList, columns=columnLabels)
outFile = dirPath + 'fake_real_allnewsdata.csv'
articlesDF.to_csv(outFile)
    

In [None]:
articlesDF.shape

(980, 3)

In [None]:
articlesDF.head()

Unnamed: 0,Text,Domain,Label
0,SEXIST RORY MCILROY CALLS VOTE FOR FEMALE MEMB...,sports,fake
1,President Trump Is Making It Rain for the Rain...,edu,fake
2,"Gender is a ""hot topic"" in the field of forens...",edu,fake
3,Super Mario Run to leave app store\n\nThe once...,tech,fake
4,Slaven Bilic still has no support of West Ham'...,sports,fake


In [None]:
chartDF = articlesDF.groupby(['Domain', 'Label']).count().rename(columns={'Text': 'Number of Articles'}).reset_index()
chartDF

Unnamed: 0,Domain,Label,Number of Articles
0,biz,fake,40
1,biz,legit,40
2,celebrity,fake,250
3,celebrity,legit,250
4,edu,fake,40
5,edu,legit,40
6,entmt,fake,40
7,entmt,legit,40
8,polit,fake,40
9,polit,legit,40


In [None]:
articlesChart = alt.Chart(chartDF).mark_bar().encode(
    x = alt.X('Number of Articles:Q'),
    y = alt.Y('Label:N', title=None, axis=alt.Axis(labels=False, tickSize=0)),
    color = alt.Color('Label:N'),
    row = alt.Row('Domain:N')
    )
articlesChart