# Urls from Mediacloud to news articles:

**Tasks:**
- Retrieve html data from urls
- Use NewsPlease to get articles from html data

In [5]:
from dotenv import load_dotenv
import os, mediacloud.api
import datetime
import json
import pandas as pd 
from IPython.display import JSON
import mediacloud.tags
import csv
import requests
import time
import newsplease
from tqdm import tqdm
from newsplease import NewsPlease
from urllib.parse import urlparse

In [3]:
# First, we load the url data retrieved from Mediacloud
mc_urls = pd.read_csv('urls.csv')
mc_urls.head() 

Unnamed: 0,stories_id,publish_date,title,url,language,ap_syndicated,themes,media_id,media_name,media_url,exclude_dupe,exclude_no_date,Date,mnth_yr
0,318516682,2015-02-13 15:13:13,Republicans are pledging to win more of the bl...,http://feeds.voices.washingtonpost.com/c/34656...,en,False,,2,Washington Post,http://washingtonpost.com,False,False,2015-02-13,"13 February, 2015"
1,318811193,2015-02-13 11:42:46,Elisabeth MacNamara: 6 Ways the League of Wom...,http://www.huffingtonpost.com/elisabeth-macnam...,en,False,,27502,HuffPost,http://www.huffingtonpost.com/#,False,False,2015-02-13,"13 February, 2015"
2,319094668,2015-02-10 04:55:04,Robert M. Brandon: Opportunities for Effectiv...,http://www.huffingtonpost.com/robert-m-brandon...,en,False,,27502,HuffPost,http://www.huffingtonpost.com/#,False,False,2015-02-10,"10 February, 2015"
3,320942194,2015-02-22 11:42:46,Mary Bottari: Scott Walker Pushes ALEC 'Right...,http://www.huffingtonpost.com/mary-bottari/sco...,en,False,,27502,HuffPost,http://www.huffingtonpost.com/#,False,False,2015-02-22,"22 February, 2015"
4,321743370,2015-02-25 09:03:01,Ted Strickland Announces He's Running For The ...,http://www.huffingtonpost.com/2015/02/25/ted-s...,en,False,,27502,HuffPost,http://www.huffingtonpost.com/#,False,False,2015-02-25,"25 February, 2015"


In [4]:
# We create a list of urls from pandas dataframe
urls = mc_urls['url'].to_list()
len(urls)

4730

In [None]:
# retrieve html pages from list of urls 
allpages = {}
for url in tqdm(urls):
    try:
        r = requests.get(url)
        if r.status_code==200:
            allpages[url] = r.text #only read the html text if page exists
        else:
            print(f'Downloading {url} returned status code {r.status_code}')
    except:
        print(f'some error occurred when downloading {url}')
with open('TMP.json',  mode='w') as f:
    json.dump(allpages, f)

In [7]:
len(allpages) # check how many pages were retrieved


4035

In [8]:
allpages_parsed = {}
for k, v in tqdm(allpages.items()):
    allpages_parsed[k] = NewsPlease.from_html(v) # use NewsPlease here to download the stories from the html info retrieved

100%|██████████| 4035/4035 [24:22<00:00,  2.76it/s]


In [10]:
with open("output_newsplease.json", mode = 'w') as f: #write json file with media data
    for k, v in allpages_parsed.items():
        mydict = v.get_serializable_dict()
        mydict['url'] = k
        f.write(json.dumps(mydict))
        f.write("\n")

In [12]:
data = pd.read_json('output_newsplease.json', lines=True) # convert to dataframe
data.head()

Unnamed: 0,authors,date_download,date_modify,date_publish,description,filename,image_url,language,localpath,maintext,source_domain,text,title,title_page,title_rss,url
0,"[President, League Of Women Voters Of The Unit...",,,2015-02-13 21:42:46,,.json,https://img.huffingtonpost.com/asset/default-e...,en,,"Advertisement\nIn honor of our 95 anniversary,...",,,6 Ways the League of Women Voters Has Impacted...,,,http://www.huffingtonpost.com/elisabeth-macnam...
1,"[President, Fair Elections Legal Network]",,,2015-02-10 14:55:04,As state legislatures consider opportunities t...,.json,https://img.huffingtonpost.com/asset/default-e...,en,,"As state legislatures shift into high gear, ma...",,,Opportunities for Effective Election Reforms C...,,,http://www.huffingtonpost.com/robert-m-brandon...
2,"[Center For Media, Democracy Alec Exposed]",,,2015-02-22 21:42:46,"Just weeks ago, the Washington Post described ...",.json,https://img.huffingtonpost.com/asset/default-e...,en,,"FILE - In a Tuesday, Nov. 4, 2014 file photo, ...",,,"Scott Walker Pushes ALEC 'Right to Work' Bill,...",,,http://www.huffingtonpost.com/mary-bottari/sco...
3,"[Staff Reporter, The Huffington Post]",,,2015-02-25 14:03:01,,.json,https://img.huffingtonpost.com/asset/default-e...,en,,Former Ohio Gov. Ted Strickland (D) announced ...,,,Ted Strickland Announces He's Running For The ...,,,http://www.huffingtonpost.com/2015/02/25/ted-s...
4,"[Staff Reporter, The Huffington Post]",,,2015-02-26 23:36:04,,.json,https://img.huffingtonpost.com/asset/default-e...,en,,Nevada Senate Minority Leader Michael Roberson...,,,Nevada GOP Pushes New Gun Law Reminiscent Of '...,,,http://www.huffingtonpost.com/2015/02/26/nevad...


In [13]:
# Find source from url link
def urltosource(link):
    source = urlparse(link).netloc
    return source

source = urltosource(data['url'][1]) #checking if it works
source

'www.huffingtonpost.com'

In [14]:
# Create a source column from urls
data['source'] = data['url'].apply(urltosource)
data.head()

Unnamed: 0,authors,date_download,date_modify,date_publish,description,filename,image_url,language,localpath,maintext,source_domain,text,title,title_page,title_rss,url,source
0,"[President, League Of Women Voters Of The Unit...",,,2015-02-13 21:42:46,,.json,https://img.huffingtonpost.com/asset/default-e...,en,,"Advertisement\nIn honor of our 95 anniversary,...",,,6 Ways the League of Women Voters Has Impacted...,,,http://www.huffingtonpost.com/elisabeth-macnam...,www.huffingtonpost.com
1,"[President, Fair Elections Legal Network]",,,2015-02-10 14:55:04,As state legislatures consider opportunities t...,.json,https://img.huffingtonpost.com/asset/default-e...,en,,"As state legislatures shift into high gear, ma...",,,Opportunities for Effective Election Reforms C...,,,http://www.huffingtonpost.com/robert-m-brandon...,www.huffingtonpost.com
2,"[Center For Media, Democracy Alec Exposed]",,,2015-02-22 21:42:46,"Just weeks ago, the Washington Post described ...",.json,https://img.huffingtonpost.com/asset/default-e...,en,,"FILE - In a Tuesday, Nov. 4, 2014 file photo, ...",,,"Scott Walker Pushes ALEC 'Right to Work' Bill,...",,,http://www.huffingtonpost.com/mary-bottari/sco...,www.huffingtonpost.com
3,"[Staff Reporter, The Huffington Post]",,,2015-02-25 14:03:01,,.json,https://img.huffingtonpost.com/asset/default-e...,en,,Former Ohio Gov. Ted Strickland (D) announced ...,,,Ted Strickland Announces He's Running For The ...,,,http://www.huffingtonpost.com/2015/02/25/ted-s...,www.huffingtonpost.com
4,"[Staff Reporter, The Huffington Post]",,,2015-02-26 23:36:04,,.json,https://img.huffingtonpost.com/asset/default-e...,en,,Nevada Senate Minority Leader Michael Roberson...,,,Nevada GOP Pushes New Gun Law Reminiscent Of '...,,,http://www.huffingtonpost.com/2015/02/26/nevad...,www.huffingtonpost.com
