In [None]:
import requests
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
import re
import random
import time as t
from google.cloud import storage
from bs4 import BeautifulSoup
import os

# Main pandas-to-GCP upload

In [None]:
# Uploads the article text already stored in each news CSV to GCP.
# Requires a service-account JSON key file (path below).
api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
counter = 0  # rows visited across all files
nos = 0      # rows skipped because their content was missing or too short
folder = 'news_dfs_3.1_3.31'

# `domains_used` remembers which CSV files were already processed so that a
# re-run of this cell skips them. Create it when it does not exist yet so the
# cell also works on a fresh kernel instead of raising NameError.
if 'domains_used' not in globals():
    domains_used = []


def upload_text(gcp_api_key_path, bucket_name, filename, text, domain=None):
    """Upload ``text`` to ``<domain>/<filename>.txt`` unless it already exists.

    Args:
        gcp_api_key_path: Path to the service-account JSON key file.
        bucket_name: Name of the destination bucket.
        filename: Object name without the domain prefix; ``.txt`` is appended.
        text: String content to upload.
        domain: Folder-like prefix for the object. When omitted, the
            notebook-level ``domain`` variable is used, which is what the
            original four-argument version of this function did.

    Returns:
        None. An existing object is left untouched and a message is printed.
    """
    if domain is None:
        domain = globals()['domain']
    storage_client = storage.Client.from_service_account_json(gcp_api_key_path)
    gcs_bucket = storage_client.get_bucket(bucket_name)
    pathname = '{domain}/{filename}.txt'.format(domain=domain, filename=filename)
    blob = gcs_bucket.blob(pathname)
    if blob.exists(storage_client):
        print('{} already exists in gcp; skipping'.format(filename))
        return
    blob.upload_from_string(text)


for item in os.listdir(folder):
    if item in domains_used:
        # handled by an earlier run of this cell
        continue
    news_df = pd.read_csv(os.path.join(folder, item), index_col=0, encoding='latin-1')
    if not news_df.empty:
        # .iloc[0] is positional, so it works even when the CSV index has no label 0.
        domain = news_df['source'].iloc[0].lower()
        # Escaped dots: the unescaped pattern also matched e.g. "ecom" in "ecommerce".
        domain = re.sub(r"\.com|\.net", "", domain)
        domain = re.sub(" ", "_", domain)

        # Remove video/radio/audio links. na=False keeps rows whose URL is
        # missing instead of failing when the mask is inverted.
        media_mask = news_df['url'].str.contains('/video|/radio|/audio', na=False)
        news_df_no_vids = news_df[~media_mask]

        #print("{} na rate: {}".format(domain, news_df_no_vids['content'].isna().mean()))
        # Iterate the filtered frame; the filter above was previously computed
        # but never used.
        for index, row in news_df_no_vids.iterrows():
            # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
            published = row['publishedAt']
            date_part = published.split("T", 1)[0].replace('-', '.')
            time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

            source = re.sub(r"\.com|\.net", "", row['source'])
            metadata = '{}_{}.{}'.format(source, date_part, time_part)
            full_text = row['content']
            if isinstance(full_text, str) and len(full_text) > 100:
                print('index: {} uploading document {} from df\n'.format(index, metadata))
                # NOTE(review): upload_text appends ".txt" again, so objects end in
                # ".txt.txt". Kept as-is so names match earlier uploads and the
                # "already exists" check keeps working.
                upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata),
                            full_text, domain=domain)
            else:
                nos += 1
                print('index: {} document {} was empty and was skipped\n'.format(index, metadata))
            counter += 1
            t.sleep(random.uniform(0,1))
    domains_used.append(item)

# Guard the rate against an empty run (no rows visited).
nan_rate = nos / counter if counter else float('nan')
print("total: {}\nNaNs: {}\nNaN Rate: {}".format(counter, nos, nan_rate))

# Various other scraping cells

In [None]:
# CNN Scraping
# Imports the Google Cloud client library
from google.cloud import storage
%env GOOGLE_APPLICATION_CREDENTIALS="/Desktop/Coding/NewsScrape/covid_news_scrape_gcp_key.json"
# Load the CNN article index and keep only the rows that carry a URL.
news_df = pd.read_csv('{}/news_cnn.com_11.01_11.15.csv'.format(folder), index_col=0).dropna(subset=['url'])
# Exclude links whose URL mentions 'video' or 'rss'.
has_video = news_df['url'].str.contains('video')
news_df_no_vids = news_df[~has_video]
has_rss = news_df_no_vids['url'].str.contains('rss')
news_df_no_vids = news_df_no_vids[~has_rss]

def has_paragraph(css_class):
    """Return True when ``css_class`` is not None and contains 'paragraph'.

    Used as the ``class_`` filter for ``find_all`` further down in this cell.
    """
    if css_class is None:
        return False
    return 'paragraph' in css_class

def upload_text(gcp_api_key_path, bucket_name, filename, text):
    """Write ``text`` to the object ``cnn/<filename>.txt`` in the given bucket.

    Args:
        gcp_api_key_path: Path to the service-account JSON key used to build the client.
        bucket_name: Name of the bucket fetched with ``get_bucket``.
        filename: Object name without the ``cnn/`` prefix; ``.txt`` is appended here.
        text: String handed to ``upload_from_string``.
    """
    client = storage.Client.from_service_account_json(gcp_api_key_path)
    destination = 'cnn/{}.txt'.format(filename)
    client.get_bucket(bucket_name).blob(destination).upload_from_string(text)

# Scrape every remaining CNN article and upload its text to the bucket.
api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
for index, row in news_df_no_vids.iterrows():
    # Fallback label: keeps the error message meaningful (never a NameError or a
    # stale name from the previous row) if the failure happens before the real
    # metadata string is built.
    metadata = 'row {}'.format(index)
    try:
        # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
        published = row['publishedAt']
        date_part = published.split("T", 1)[0].replace('-', '.')
        time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

        source = row['source']
        metadata = '{}_{}.{}'.format(source, date_part, time_part)

        # get the text; the timeout stops one dead server from hanging the cell,
        # and HTTP error pages are rejected instead of being parsed as articles
        resp = requests.get(row['url'], timeout=30)
        resp.raise_for_status()
        document = BeautifulSoup(resp.text, 'html.parser')

        # every tag whose CSS class contains 'paragraph'
        tags = document.find_all(class_=has_paragraph)
        # drop repeated paragraphs while keeping first-seen order
        text_list = list(dict.fromkeys(tag.get_text() for tag in tags))

        full_text = ' '.join(text_list)

        # keep only what follows the first '(CNN)' marker
        if '(CNN)' in full_text:
            full_text = full_text.split('(CNN)', 1)[1]

        if len(full_text) > 0:
            print('uploading document {} from df'.format(metadata))
            upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata), full_text)
        else:
            print('document {} was empty and was skipped'.format(metadata))
        t.sleep(random.uniform(1,3))
    except Exception as err:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
        print('error in document {} and skipping: {!r}'.format(metadata, err))

In [None]:
# Fox News Scraping

from google.cloud import storage
%env GOOGLE_APPLICATION_CREDENTIALS="/Desktop/Coding/NewsScrape/covid_news_scrape_gcp_key.json"
# Load the Fox News article index and keep only the rows that carry a URL.
news_df = pd.read_csv('{}/news_foxnews.com_11.01_11.15.csv'.format(folder), index_col=0).dropna(subset=['url'])
# Exclude links whose URL mentions 'video' or 'radio'.
has_video = news_df['url'].str.contains('video')
news_df_no_vids = news_df[~has_video]
has_radio = news_df_no_vids['url'].str.contains('radio')
news_df_no_vids = news_df_no_vids[~has_radio]

def upload_text(gcp_api_key_path, bucket_name, filename, text):
    """Write ``text`` to the object ``foxnews/<filename>.txt`` in the given bucket.

    Args:
        gcp_api_key_path: Path to the service-account JSON key used to build the client.
        bucket_name: Name of the bucket fetched with ``get_bucket``.
        filename: Object name without the ``foxnews/`` prefix; ``.txt`` is appended here.
        text: String handed to ``upload_from_string``.
    """
    client = storage.Client.from_service_account_json(gcp_api_key_path)
    destination = 'foxnews/{}.txt'.format(filename)
    client.get_bucket(bucket_name).blob(destination).upload_from_string(text)
    
# Scrape every remaining Fox News article and upload its text to the bucket.
api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
for index, row in news_df_no_vids.iterrows():
    # Fallback label so the except branch never hits an unbound or stale name.
    metadata = 'row {}'.format(index)
    try:
        # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
        published = row['publishedAt']
        date_part = published.split("T", 1)[0].replace('-', '.')
        time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

        source = row['source']
        metadata = '{}_{}.{}'.format(source, date_part, time_part)

        # get the text; bounded wait, and HTTP error pages are not parsed as articles
        resp = requests.get(row['url'], timeout=30)
        resp.raise_for_status()
        document = BeautifulSoup(resp.text, 'html.parser')

        div = document.find("div", {"class": "article-body"})
        # A page without the expected container yields no paragraphs and is
        # reported as empty below instead of failing with AttributeError.
        p_tags = div.find_all('p') if div is not None else []

        # skip paragraphs mentioning 'Foxnews.com'; drop repeats, keep order
        paragraphs = (tag.get_text() for tag in p_tags)
        text_list = list(dict.fromkeys(p for p in paragraphs if 'Foxnews.com' not in p))

        full_text = ' '.join(text_list)

        if len(full_text) > 0:
            print('uploading document {} from df'.format(metadata))
            upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata), full_text)
        else:
            print('document {} was empty and was skipped'.format(metadata))
        t.sleep(random.uniform(1,3))
    except Exception as err:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
        print('error in document {} and skipping: {!r}'.format(metadata, err))


In [None]:
# NBC News Scraping
from google.cloud import storage
%env GOOGLE_APPLICATION_CREDENTIALS="/Desktop/Coding/NewsScrape/covid_news_scrape_gcp_key.json"

# Load the NBC News article index and keep only the rows that carry a URL.
news_df = pd.read_csv('{}/news_nbcnews.com_11.01_11.15.csv'.format(folder), index_col=0).dropna(subset=['url'])
# Exclude links whose URL mentions 'video' or 'radio'.
has_video = news_df['url'].str.contains('video')
news_df_no_vids = news_df[~has_video]
has_radio = news_df_no_vids['url'].str.contains('radio')
news_df_no_vids = news_df_no_vids[~has_radio]

def upload_text(gcp_api_key_path, bucket_name, filename, text):
    """Write ``text`` to the object ``nbcnews/<filename>.txt`` in the given bucket.

    Args:
        gcp_api_key_path: Path to the service-account JSON key used to build the client.
        bucket_name: Name of the bucket fetched with ``get_bucket``.
        filename: Object name without the ``nbcnews/`` prefix; ``.txt`` is appended here.
        text: String handed to ``upload_from_string``.
    """
    client = storage.Client.from_service_account_json(gcp_api_key_path)
    destination = 'nbcnews/{}.txt'.format(filename)
    client.get_bucket(bucket_name).blob(destination).upload_from_string(text)
    
# Scrape every remaining NBC News article and upload its text to the bucket.
api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
for index, row in news_df_no_vids.iterrows():
    try:
        # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
        published = row['publishedAt']
        date_part = published.split("T", 1)[0].replace('-', '.')
        time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

        source = row['source']
        metadata = '{}_{}.{}'.format(source, date_part, time_part)

        # get the text; bounded wait, and HTTP error pages are not parsed as articles
        resp = requests.get(row['url'], timeout=30)
        resp.raise_for_status()
        document = BeautifulSoup(resp.text, 'html.parser')

        div = document.find("div", {"class": "article-body__content"})
        # A page without the expected container yields no paragraphs and is
        # reported as empty below instead of failing with AttributeError.
        p_tags = div.find_all('p') if div is not None else []

        # skip paragraphs mentioning 'Privacy Policy'; drop repeats, keep order
        paragraphs = (tag.get_text() for tag in p_tags)
        text_list = list(dict.fromkeys(p for p in paragraphs if 'Privacy Policy' not in p))

        full_text = ' '.join(text_list)

        if len(full_text) > 0:
            print('uploading document {} from df'.format(metadata))
            upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata), full_text)
        else:
            print('document {} was empty and was skipped'.format(metadata))
        t.sleep(random.uniform(1,3))
    except Exception as err:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
        print("error, skipping document {}: {!r}".format(index, err))

In [None]:
# AJC scraping


from google.cloud import storage
%env GOOGLE_APPLICATION_CREDENTIALS="/Desktop/Coding/NewsScrape/covid_news_scrape_gcp_key.json"

def upload_text(gcp_api_key_path, bucket_name, filename, text):
    """Write ``text`` to the object ``ajc/<filename>.txt`` in the given bucket.

    Args:
        gcp_api_key_path: Path to the service-account JSON key used to build the client.
        bucket_name: Name of the bucket fetched with ``get_bucket``.
        filename: Object name without the ``ajc/`` prefix; ``.txt`` is appended here.
        text: String handed to ``upload_from_string``.
    """
    client = storage.Client.from_service_account_json(gcp_api_key_path)
    destination = 'ajc/{}.txt'.format(filename)
    client.get_bucket(bucket_name).blob(destination).upload_from_string(text)

    
# Load the AJC article index; rows without a URL cannot be scraped.
news_df = pd.read_csv('{}/news_ajc.com_11.01_11.15.csv'.format(folder), index_col = 0)
news_df = news_df[news_df['url'].notnull()]

api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
for index, row in news_df.iterrows():
    # Fallback label so the except branch never hits an unbound or stale name.
    metadata = 'row {}'.format(index)
    try:
        # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
        published = row['publishedAt']
        date_part = published.split("T", 1)[0].replace('-', '.')
        time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

        source = row['source']
        metadata = '{}_{}.{}'.format(source, date_part, time_part)

        # get the text; bounded wait, and HTTP error pages are not parsed as articles
        resp = requests.get(row['url'], timeout=30)
        resp.raise_for_status()
        document = BeautifulSoup(resp.text, 'html.parser')

        divs = document.find_all('div', {'class': 'c-section'})

        # drop repeated sections while keeping first-seen order
        text_list = list(dict.fromkeys(div.get_text() for div in divs))

        full_text = ' '.join(text_list)

        if len(full_text) > 0:
            print('uploading document {} from df'.format(metadata))
            upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata), full_text)
        else:
            print('document {} was empty and was skipped'.format(metadata))
        t.sleep(random.uniform(1,3))
    except Exception as err:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
        print('error in document {} and skipping: {!r}'.format(metadata, err))


In [None]:
# azcentral scraping


from google.cloud import storage
%env GOOGLE_APPLICATION_CREDENTIALS="/Desktop/Coding/NewsScrape/covid_news_scrape_gcp_key.json"
# Load the azcentral article index and keep only the rows that carry a URL.
news_df = pd.read_csv('{}/news_azcentral.com_11.01_11.15.csv'.format(folder), index_col=0).dropna(subset=['url'])
# Exclude links whose URL mentions 'clip'.
has_clip = news_df['url'].str.contains('clip')
news_df_no_vids = news_df[~has_clip]


def upload_text(gcp_api_key_path, bucket_name, filename, text):
    """Write ``text`` to the object ``azcentral/<filename>.txt`` in the given bucket.

    Args:
        gcp_api_key_path: Path to the service-account JSON key used to build the client.
        bucket_name: Name of the bucket fetched with ``get_bucket``.
        filename: Object name without the ``azcentral/`` prefix; ``.txt`` is appended here.
        text: String handed to ``upload_from_string``.
    """
    client = storage.Client.from_service_account_json(gcp_api_key_path)
    destination = 'azcentral/{}.txt'.format(filename)
    client.get_bucket(bucket_name).blob(destination).upload_from_string(text)
    
# Scrape every remaining azcentral article and upload its text to the bucket.
api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
# Iterate the filtered frame: the 'clip' filter was computed in this cell but the
# loop previously walked the unfiltered `news_df`.
for index, row in news_df_no_vids.iterrows():
    # Fallback label so the except branch never hits an unbound or stale name.
    metadata = 'row {}'.format(index)
    try:
        # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
        published = row['publishedAt']
        date_part = published.split("T", 1)[0].replace('-', '.')
        time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

        source = row['source']
        metadata = '{}_{}.{}'.format(source, date_part, time_part)

        # get the text; bounded wait, and HTTP error pages are not parsed as articles
        resp = requests.get(row['url'], timeout=30)
        resp.raise_for_status()
        document = BeautifulSoup(resp.text, 'html.parser')

        paragraphs = document.find_all('p', {'class': 'gnt_ar_b_p'})

        text_list = []
        for para in paragraphs:
            # skip paragraphs that have a <strong> or <em> element as a direct child
            if para.select('p > strong') or para.select('p > em'):
                continue
            para_text = para.get_text()
            if para_text not in text_list:
                text_list.append(para_text)

        full_text = ' '.join(text_list)

        if len(full_text) > 0:
            print('uploading document {} from df'.format(metadata))
            upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata), full_text)
        else:
            print('document {} was empty and was skipped'.format(metadata))
        t.sleep(random.uniform(1,3))
    except Exception as err:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
        print('error in document {} and skipping: {!r}'.format(metadata, err))


In [None]:
# breitbart scraping
from google.cloud import storage
%env GOOGLE_APPLICATION_CREDENTIALS="/Desktop/Coding/NewsScrape/covid_news_scrape_gcp_key.json"

def upload_text(gcp_api_key_path, bucket_name, filename, text):
    """Write ``text`` to the object ``breitbart/<filename>.txt`` in the given bucket.

    Args:
        gcp_api_key_path: Path to the service-account JSON key used to build the client.
        bucket_name: Name of the bucket fetched with ``get_bucket``.
        filename: Object name without the ``breitbart/`` prefix; ``.txt`` is appended here.
        text: String handed to ``upload_from_string``.
    """
    client = storage.Client.from_service_account_json(gcp_api_key_path)
    destination = 'breitbart/{}.txt'.format(filename)
    client.get_bucket(bucket_name).blob(destination).upload_from_string(text)
    
# Load the Breitbart article index; rows without a URL cannot be scraped.
news_df = pd.read_csv('{}/news_breitbart.com_11.01_11.15.csv'.format(folder), index_col = 0)
news_df = news_df[news_df['url'].notnull()]
# Exclude links whose URL mentions 'clip'.
news_df_no_vids = news_df[~news_df['url'].str.contains('clip')]

api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
# Iterate the filtered frame: the 'clip' filter above was previously computed
# but the loop walked the unfiltered `news_df`.
for index, row in news_df_no_vids.iterrows():
    # Fallback label so the except branch never hits an unbound or stale name.
    metadata = 'row {}'.format(index)
    try:
        # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
        published = row['publishedAt']
        date_part = published.split("T", 1)[0].replace('-', '.')
        time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

        source = row['source']
        metadata = '{}_{}.{}'.format(source, date_part, time_part)

        # get the text; bounded wait, and HTTP error pages are not parsed as articles
        resp = requests.get(row['url'], timeout=30)
        resp.raise_for_status()
        document = BeautifulSoup(resp.text, 'html.parser')

        main_div = document.find('div', {'class': 'entry-content'})
        # A page without the expected container yields no paragraphs and is
        # reported as empty below instead of failing with AttributeError.
        paragraphs = main_div.find_all('p') if main_div is not None else []

        text_list = []
        for para in paragraphs:
            para_text = para.get_text()
            # skip paragraphs with a direct <strong>/<em> child or a "Follow @" line
            if para.select('p > strong') or para.select('p > em') or "Follow @" in para_text:
                continue
            if para_text not in text_list:
                text_list.append(para_text)

        full_text = ' '.join(text_list)

        if len(full_text) > 0:
            print('uploading document {} from df'.format(metadata))
            upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata), full_text)
        else:
            print('document {} was empty and was skipped'.format(metadata))
        t.sleep(random.uniform(1,3))
    except Exception as err:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
        print('error in document {} and skipping: {!r}'.format(metadata, err))


In [None]:
#USA TODAY Scraping
from google.cloud import storage
%env GOOGLE_APPLICATION_CREDENTIALS="/Desktop/Coding/NewsScrape/covid_news_scrape_gcp_key.json"
def upload_text(gcp_api_key_path, bucket_name, filename, text):
    """Write ``text`` to the object ``usatoday/<filename>.txt`` in the given bucket.

    Args:
        gcp_api_key_path: Path to the service-account JSON key used to build the client.
        bucket_name: Name of the bucket fetched with ``get_bucket``.
        filename: Object name without the ``usatoday/`` prefix; ``.txt`` is appended here.
        text: String handed to ``upload_from_string``.
    """
    client = storage.Client.from_service_account_json(gcp_api_key_path)
    destination = 'usatoday/{}.txt'.format(filename)
    client.get_bucket(bucket_name).blob(destination).upload_from_string(text)
    
    
# Load the USA Today article index; rows without a URL cannot be scraped.
news_df = pd.read_csv('{}/news_usatoday.com_11.01_11.15.csv'.format(folder), index_col = 0)
news_df = news_df[news_df['url'].notnull()]
# Exclude links whose URL mentions 'mmajunkie'.
news_df = news_df[~news_df['url'].str.contains('mmajunkie')]
api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
for index, row in news_df.iterrows():
    # Fallback label so the except branch never hits an unbound or stale name.
    metadata = 'row {}'.format(index)
    try:
        # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
        published = row['publishedAt']
        date_part = published.split("T", 1)[0].replace('-', '.')
        time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

        source = row['source']
        metadata = '{}_{}.{}'.format(source, date_part, time_part)

        # get the text; bounded wait, and HTTP error pages are not parsed as articles
        resp = requests.get(row['url'], timeout=30)
        resp.raise_for_status()
        document = BeautifulSoup(resp.text, 'html.parser')

        paragraphs = document.find_all('p', {'class': 'gnt_ar_b_p'})

        text_list = []
        for para in paragraphs:
            # skip paragraphs that have a <strong> or <em> element as a direct child
            if para.select('p > strong') or para.select('p > em'):
                continue
            para_text = para.get_text()
            if para_text not in text_list:
                text_list.append(para_text)

        full_text = ' '.join(text_list)

        if len(full_text) > 0:
            print('uploading document {} from df'.format(metadata))
            upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata), full_text)
        else:
            print('document {} was empty and was skipped'.format(metadata))
        t.sleep(random.uniform(1,3))
    except Exception as err:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
        print('error in document {} and skipping: {!r}'.format(metadata, err))


In [None]:
# NY POST
from google.cloud import storage
# Load the NY Post article index and keep only the rows that carry a URL.
news_df = pd.read_csv('{}/news_nypost.com_11.01_11.15.csv'.format(folder), index_col=0).dropna(subset=['url'])


%env GOOGLE_APPLICATION_CREDENTIALS="/Desktop/Coding/NewsScrape/covid_news_scrape_gcp_key.json"
def upload_text(gcp_api_key_path, bucket_name, filename, text):
    """Write ``text`` to the object ``nypost/<filename>.txt`` in the given bucket.

    Args:
        gcp_api_key_path: Path to the service-account JSON key used to build the client.
        bucket_name: Name of the bucket fetched with ``get_bucket``.
        filename: Object name without the ``nypost/`` prefix; ``.txt`` is appended here.
        text: String handed to ``upload_from_string``.
    """
    client = storage.Client.from_service_account_json(gcp_api_key_path)
    destination = 'nypost/{}.txt'.format(filename)
    client.get_bucket(bucket_name).blob(destination).upload_from_string(text)
    
# Scrape every NY Post article and upload its text to the bucket.
api_key_path = 'covid_news_scrape_gcp_key.json'
bucket = 'fan_project_news_stories'
for index, row in news_df.iterrows():
    # Fallback label so the except branch never hits an unbound or stale name.
    metadata = 'row {}'.format(index)
    try:
        # construct the datetime metadata from an ISO-like "<date>T<time>Z" stamp
        published = row['publishedAt']
        date_part = published.split("T", 1)[0].replace('-', '.')
        time_part = published.split("T", 1)[1].split("Z", 1)[0].replace(':', '.')

        source = row['source']
        metadata = '{}_{}.{}'.format(source, date_part, time_part)

        # get the text; bounded wait, and HTTP error pages are not parsed as articles
        resp = requests.get(row['url'], timeout=30)
        resp.raise_for_status()
        document = BeautifulSoup(resp.text, 'html.parser')

        # Separate name for the article container instead of rebinding `document`.
        content_div = document.find('div', {'class': 'entry-content entry-content-read-more'})
        # A page without the expected container yields no paragraphs and is
        # reported as empty below instead of failing with AttributeError.
        paragraphs = content_div.find_all('p') if content_div is not None else []

        text_list = []
        for para in paragraphs:
            # skip paragraphs that have a <strong> or <em> element as a direct child
            if para.select('p > strong') or para.select('p > em'):
                continue
            para_text = para.get_text()
            if para_text not in text_list:
                text_list.append(para_text)

        full_text = ' '.join(text_list)

        if len(full_text) > 0:
            print('uploading document {} from df'.format(metadata))
            upload_text(api_key_path, bucket, '{metadata}.txt'.format(metadata=metadata), full_text)
        else:
            print('document {} was empty and was skipped'.format(metadata))
        t.sleep(random.uniform(1,3))
    except Exception as err:
        # `Exception`, not a bare except, so Ctrl-C / kernel interrupt still stops the run.
        print('error in document {} and skipping: {!r}'.format(metadata, err))