# Development

This script is for development of other functions - just for simplicity of execution etc. Later the code should be moved to *football_functions*, and then deleted from here.

## Block 0: Initial packages and definitions

Just a block to define some stuff that we will probably never be changing

In [1]:
# Base packages for running the script
import sys, datetime
from pathlib import Path

# Choose the project root that matches the operating system we are running on
if sys.platform != 'linux':
    HOME_PATH = Path('c:/Users/amathewl/Desktop/3_Personal_projects/football/')
else:
    HOME_PATH = Path('/home/andreas/Desktop/Projects/Football/')

# Neither machine currently uses a proxy
proxy_settings = None

# Folders inside the project, all hanging off the data directory
data_loc = HOME_PATH.joinpath('Data')
organ_loc = data_loc.joinpath('00_Organisation')
html_loc = data_loc.joinpath('01_HTML')
story_loc = data_loc.joinpath('02_Stories')

# Today's date stamp in YYYY_MM_DD form, shared by the functions below
date_today = '{:%Y_%m_%d}'.format(datetime.datetime.today())

In [2]:
# Define a logger for development
from football_functions.generic import default_logger

# Logger handed to the functions in this notebook; it is built from the data folder,
# today's date stamp and the name 'development'
dev_logger = default_logger.get_logger(data_loc, date_today, 'development')

## Function to do the full process, but all in one go

Goal is to trim down the full process and do everything in "one go", only saving the final pickle and no HTML. This would be something we would use on a server online where space is more important. The process would be:

* Load in the base URLs

* Loop over each URL - starting with pulling the HTML

* Then with the HTML in memory, look for sublinks

* Then we loop over the sublinks (or the original HTML if it is "not" a sublink type URL)

* Pull the headlines from the sublinks - loop over these and get the story

* Finally save the headline and everything into a pickle

In [3]:
import os, json, pandas as pd

## Function definitions

Need a function to do each step as well as pull them all together.

I think we want to have the initial file load in our grand function - and then scrape the HTML elsewhere

In [4]:
import re, time, random, requests

def request_html(url, proxy, logger, timeout = 30):
    """
    Function that is fed a URL and then tries to pull from it, retrying once after a
    5 second pause if the request raises an error

    Parameters:
        url: the address to download
        proxy: handed to requests.get as its `proxies` argument (None for no proxy)
        logger: logger used to report progress and failures
        timeout: seconds to wait for the server on each attempt, so that a stalled
            connection raises (and is retried) instead of freezing the whole scrape

    Returns a 2-tuple:
        (html_text, 'No error') when the page was downloaded
        (url, '404 error') when the server answered with a 404
        (url, exception) when both attempts raised a requests exception

    Note that only a 404 status is treated as a failure - any other status code is
    returned as 'No error' together with whatever body the server sent.
    """
    max_attempts = 2

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, proxies = proxy, timeout = timeout)
        except requests.exceptions.RequestException as error:
            # Out of attempts - give the URL back together with the error
            if attempt == max_attempts:
                logger.error('Still not working')
                return (url, error)

            # Otherwise wait a little in case we were rejected, then try again
            logger.warning('Have not managed to pull from {} due to a {}'.format(url, error))
            logger.debug('Sleeping for 5 seconds to see if works again')
            time.sleep(5)
            continue

        # A 404 means we reached the server fine, but the page itself does not exist
        if response.status_code == 404:
            logger.error('404 Error')
            return (url, '404 error')

        html_text = response.text
        logger.debug('Successfully pulled HTML from {}'.format(url))
        return (html_text, 'No error')

def process_url(url, proxy, logger):
    """
    A function that is given a URL and then checks it over to see if worth scraping
    If valid, will proceed to scrape - so all scraping goes here

    Parameters:
        url: the address to check and download
        proxy: proxy settings, passed on untouched to request_html
        logger: logger used to report progress and rejected links

    Returns a 2-tuple:
        whatever request_html returns when the URL is valid
        (url, 'Invalid URL') when the URL is rejected - note we hand the original URL back

    A URL is accepted when, after removing surrounding whitespace, it starts with
    'http', contains 'www.', has no whitespace inside it and does not contain 'fake_link'.
    """
    logger.info('Scraping URL {}'.format(url))

    # Remove surrounding whitespace BEFORE validating - the pattern does not allow any
    # whitespace, so stripping after the match (as was done previously) could never help
    clean_url = url.strip()
    valid_url = re.match(r'^http\S*www\.\S*$', clean_url)

    if not valid_url or 'fake_link' in clean_url:
        logger.warning('The link found is not valid\n {}\n'.format(url))
        return (url, 'Invalid URL')

    # Always sleep a bit before requesting, just to stop us being rejected so much
    time.sleep(random.uniform(1, 2))

    return request_html(valid_url.group(0), proxy, logger)

In [5]:
import football_functions.reduced.source_specific.bbc.process_html as bbc
import football_functions.reduced.source_specific.dailymail.process_html as dailymail
import football_functions.reduced.source_specific.mirror.process_html as mirror
import football_functions.reduced.source_specific.guardian.process_html as guardian
import football_functions.reduced.source_specific.skysports.process_html as skysports
import football_functions.reduced.source_specific.telegraph.process_html as telegraph

def find_suburls(baseurl_html, base_url, domain, logger):
    """
    Work out which pages should be scraped for headlines, given a downloaded base page

    For bbc, dailymail, mirror and theguardian the base page is only treated as an index
    of suburls when its URL contains a marker specific to that domain; in that case the
    domain's own get_suburls does the extraction. In every other case the base URL
    itself is returned as the single "suburl".

    Returns a list of links (which may be empty if the extractor finds nothing).
    """
    # Text that must appear in the base URL before we go looking for suburls in it
    index_markers = {
        'bbc': 'teams',
        'dailymail': 'sport_football',
        'mirror': 'sport_football',
        'theguardian': 'teams',
    }
    marker = index_markers.get(domain)

    # Anything else is not a suburl type page, so we would just scrape it directly
    if marker is None or marker not in base_url:
        logger.info('Returning the original URL as only suburl')
        return [base_url]

    # Pick the source specific module for this domain
    if domain == 'bbc':
        source = bbc
    elif domain == 'dailymail':
        source = dailymail
    elif domain == 'mirror':
        source = mirror
    else:
        source = guardian

    found_links = list(source.get_suburls(baseurl_html, logger))

    if not found_links:
        logger.warning('Have found {} links from {}'.format(len(found_links), domain))

    return found_links

In [6]:
def extract_headlines(suburl_html, sub_url, domain, logger):
    """
    Function that will extract headlines from the HTML previously downloaded for suburls

    Parameters:
        suburl_html: the HTML text of the page to pull headlines from
        sub_url: the URL the HTML came from - used to set the flag some sources need
        domain: which source the page belongs to (bbc, dailymail, theguardian, mirror,
            skysports or telegraph)
        logger: logger passed on to the source specific extractor

    Returns whatever the source specific extract_headlines returns.

    Raises:
        ValueError: if the domain is not one of the supported sources (previously this
            surfaced as a confusing UnboundLocalError on the return line)
    """

    # Now do source specific stuff - some sources take a flag derived from the URL
    # NOTE(review): the meaning of each flag lives in the source modules - confirm there
    if domain == 'bbc':
        modifier = 'football_teams' in sub_url
        return bbc.extract_headlines(suburl_html, modifier, logger)

    if domain == 'dailymail':
        modifier = 'football_index' not in sub_url
        return dailymail.extract_headlines(suburl_html, modifier, logger)

    if domain == 'theguardian':
        return guardian.extract_headlines(suburl_html, logger)

    if domain == 'mirror':
        return mirror.extract_headlines(suburl_html, logger)

    if domain == 'skysports':
        modifier = 'regional' in sub_url
        return skysports.extract_headlines(suburl_html, modifier, logger)

    if domain == 'telegraph':
        return telegraph.extract_headlines(suburl_html, logger)

    raise ValueError('No headline extractor for domain {!r} (from {})'.format(domain, sub_url))

In [7]:
def get_story(story_html, domain, logger):
    """
    Function to take the HTML and then get the story text according to the domain

    Parameters:
        story_html: the HTML text of the article page
        domain: which source the page belongs to (bbc, dailymail, theguardian, mirror,
            skysports or telegraph)
        logger: logger passed on to the source specific get_text

    Returns a dictionary with the keys story_text, story_author, story_date,
    story_twitter and story_keywords, filled from positions 0 to 4 of what the source
    specific get_text returns.

    Raises:
        ValueError: if the domain is not one of the supported sources (previously this
            surfaced as a confusing UnboundLocalError)
    """

    if domain == 'bbc':
        source = bbc
    elif domain == 'dailymail':
        source = dailymail
    elif domain == 'theguardian':
        source = guardian
    elif domain == 'mirror':
        source = mirror
    elif domain == 'skysports':
        source = skysports
    elif domain == 'telegraph':
        source = telegraph
    else:
        raise ValueError('No story extractor for domain {!r}'.format(domain))

    story_details = source.get_text(story_html, logger)

    story_dic = {
        # Put a full stop wherever a lower case letter or digit runs straight into a
        # capital letter - presumably where two sentences were glued together
        'story_text' : re.sub('([a-z0-9])([A-Z])', r'\1.\2', story_details[0]),
        'story_author' : story_details[1],
        'story_date' : story_details[2],
        'story_twitter' : story_details[3],
        'story_keywords' : story_details[4],
    }

    return story_dic

In [8]:
def check_duplicates(article_link, past_dates):
    """
    Function for checking for duplicates and stopping us processing the same stories over and over

    The file name of a saved story is built purely from its URL, so a story counts as a
    duplicate when a file with the same name already sits in one of the past date folders.
    This is important to do BEFORE we move to scrape the story HTML, as scraping is the
    step that really takes longest.

    1. Build the file name from the link
    2. Look for that file name in each of the past date folders
    3. If it is there, we exit saying it is a duplicate

    Parameters:
        article_link: the URL of the story
        past_dates: iterable of pathlib paths, one per date folder to search

    Returns a 2-tuple:
        (True, full path of the existing file as a string) when a duplicate is found
        (False, file name only) when it is not - this is the name to save under
    """
    # Take everything after the first '/' that is followed by a non-'/' character and
    # then a non-'w' character - for a typical 'https://www.site.com/a/b' link this is
    # the path 'a/b'. NOTE(review): the second class only rules out the single letter
    # 'w' (it was written as [^www]), so a path segment such as '/sw/' is skipped too;
    # left as is because changing it would rename files that are already saved.
    url_reg = re.search(r'/([^/][^w].*)', article_link)

    # Sometimes get weird URLs of other websites
    if url_reg:
        url_extension = url_reg.group(1).replace('/', '_')
    else:
        url_extension = re.sub(r'[^A-Za-z0-9]', '_', article_link)

    # Keep only letters, digits and underscores. The range is spelt A-Za-z on purpose:
    # 'A-z' also covers [ \ ] ^ and `, which would leak into the file name
    json_name = re.sub(r'[^A-Za-z0-9_]+', '', url_extension) + '.json'

    # Then move through the previous dates and try to find the link
    for past_date in past_dates:
        candidate_link = past_date / json_name

        # If it exists, return true and also tag on date just for ease
        if candidate_link.exists():
            return True, str(candidate_link)

    return False, json_name

Things left to consider:

* Handling errors / not being able to grab links
* Being frozen out when pulling HTML
* Extra logging - especially info and debugging
* Make sure that we are getting similar / equal results - need to check behaviour for fake links **NEED TO CHECK THAT WE PROPERLY NAME ARTICLES THAT KEEP THE SAME HEADLINE BUT UPDATE - these need to be reviewed in general and declared as special links**
* Checking out duplicates
* Make everything more Pathlib like
* Instead of saving end result as a dictionary with pickle, maybe it would be better to save as a json, so we could also view it directly and more agnostic

In [None]:
import re

def _load_base_urls(baseurl_loc):
    """Read the base URL file and return its non-blank lines without surrounding whitespace."""
    with baseurl_loc.open(mode = 'r') as list_file:
        return [line.strip() for line in list_file if line.strip()]


def _scrape_and_save_story(headline, domain, previous_dates, story_path, proxy, logger):
    """Scrape the story behind one headline and save it as JSON, unless it is a duplicate or fails to download."""
    article_link = headline['article_link']

    # Before doing anything - want to make sure that the headline is not a duplicate
    is_duplicate, json_name = check_duplicates(article_link, previous_dates)

    if is_duplicate:
        logger.warning('Have found a duplicate link for file named {}'.format(json_name))
        return

    # First step for good headlines is to pull the HTML - third scrape
    story_html = process_url(article_link, proxy, logger)

    if story_html[1] != 'No error':
        logger.warning('Have found an error in the story HTML for {}'.format(article_link))
        return

    # Then get the text and add it on to our headline
    story_details = get_story(story_html[0], domain, logger)

    for key in ('story_text', 'story_author', 'story_date', 'story_twitter', 'story_keywords'):
        headline[key] = story_details[key]

    # And finally save
    story_path.mkdir(parents = True, exist_ok = True)

    with (story_path / json_name).open(mode = 'w') as json_file:
        json.dump(headline, json_file, indent = 4)


def full_process(baseurl_loc, proxy, logger, save_path, date_today):
    """
    A function that will carry out the full, reduced process of getting the stories

    For every base URL listed in the file we: download it, find its suburls, download
    each suburl, extract the headlines, and then for each headline that has not been
    saved before download the story and save headline + story as one JSON file under
    save_path / <domain> / <date_today>. No HTML is saved.

    Anything that fails along the way (a URL with no recognisable domain, a download
    error, a duplicate story) is logged and skipped, and we carry on with the next one.

    Parameters:
        baseurl_loc: pathlib path of the text file holding one base URL per line
        proxy: proxy settings passed on to every download
        logger: logger used throughout
        save_path: pathlib path of the folder that holds one sub folder per domain
        date_today: name of the date folder that new stories are saved into

    Returns nothing.
    """
    logger.info('Loading in the base URLs...')
    baseurl_list = _load_base_urls(baseurl_loc)

    logger.info('Have found {}'.format(len(baseurl_list)))

    # We loop over the loaded in base urls
    for base_url in baseurl_list:
        # Get the domain for source specific stuff
        domain_match = re.search(r'^.*www\.(.*?)\..*', base_url)

        if domain_match is None:
            logger.warning('Could not work out the domain of {} - skipping it'.format(base_url))
            continue

        domain = domain_match.group(1)

        # Get the previous dates for this domain, to check later. The folder will not
        # exist the first time we see a domain, and today's folder is always included
        # so that a story reached through two suburls in the same run is only scraped once
        domain_path = save_path / domain
        story_path = domain_path / date_today
        previous_dates = list(domain_path.iterdir()) if domain_path.is_dir() else []

        if story_path not in previous_dates:
            previous_dates.append(story_path)

        # Just start a log to get things going
        logger.info('Starting the process for {} which is from {}'.format(base_url, domain))

        # Get the HTML for that URL - first scrape
        baseurl_html = process_url(base_url, proxy, logger)

        if baseurl_html[1] != 'No error':
            logger.warning('Have found an error in the baseline HTML')
            continue

        # Then we will feed it into the suburl extractor
        logger.info('Finding the suburls')
        suburl_list = find_suburls(baseurl_html[0], base_url, domain, logger)

        # So we loop over the sub_urls that we found, pull the HTML and then pull out the headlines - second scrape
        for sub_url in suburl_list:
            if sub_url == base_url:
                # The base page is its own "suburl" and we only just downloaded it
                suburl_html = baseurl_html
            else:
                suburl_html = process_url(sub_url, proxy, logger)

            if suburl_html[1] != 'No error':
                logger.warning('Have found an error in the suburl HTML for {}'.format(sub_url))
                continue

            # Now pull out the headlines - HAVE TO CHECK WHAT HAPPENS TO SPECIAL NON-SUBURL TYPE LINKS
            suburl_headlines = extract_headlines(suburl_html[0], sub_url, domain, logger)

            # Now we should loop over the headlines and pull out the story - finally saving
            logger.info('Now looking at the headlines')
            for headline_id in suburl_headlines:
                _scrape_and_save_story(
                    suburl_headlines[headline_id], domain, previous_dates, story_path, proxy, logger
                )

In [None]:
# Run the whole process over the list of news sources, saving into the stories folder
full_process(
    baseurl_loc = organ_loc / 'news_sources.txt',
    proxy = proxy_settings,
    logger = dev_logger,
    save_path = story_loc,
    date_today = date_today,
)