In [1]:
# pyscience imports
import os
import sys
import glob
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases no longer ship the old names, so use whichever name this
# installation provides (falling back to the legacy name on old Matplotlib).
_darkgrid_style = "seaborn-v0_8-darkgrid"
if _darkgrid_style not in plt.style.available:
    _darkgrid_style = "seaborn-darkgrid"
plt.style.use(_darkgrid_style)
# plt.style.use("dark_background")
sns.set(style="ticks", context="talk")
# %matplotlib inline
# run for jupyter notebook
# Display the value of every top-level expression in a cell, not only the last.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
#%%

In [2]:
import requests
import lxml.html as lh
import bs4 as bs
import urllib.request
import io
# HTTP headers sent with the requests.get() calls made by the scrapers:
# a desktop-browser User-Agent plus an X-Requested-With marker.
header = {}
header["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36"
header["X-Requested-With"] = "XMLHttpRequest"

In [3]:
# Project directory; the notebook changes into it so that the relative output
# folders and files created by later cells end up under this path.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
root = '/home/alal/Dropbox/1_Research/league-tables-scraper'
%cd $root

/mnt/d/Dropbox/Dropbox/1_Research/league-tables-scraper


# General Setup 

Get the relevant URLs from [league321](http://league321.com).

# Scotland 

## [Scotland league history](http://www.league321.com/scotland-football.html)

In [3]:
# Names given to the 13 columns kept from each scraped SPFL table:
# club, games played, five home_* columns, five away_* columns, points.
tablecols = [
    'club',
    'played',
    *('home_w', 'home_d', 'home_l', 'home_f', 'home_a'),
    *('away_w', 'away_d', 'away_l', 'away_f', 'away_a'),
    'pts',
]

In [4]:
def spfl_scraper(year, colnames = tablecols, baseurl = 'https://spfl.co.uk/league/premiership/archive/'):
    """
    Download one archived league table from the SPFL site as a DataFrame.

    Parameters
    ----------
    year : int
        Season label. It is stored in the ``year`` column and selects the
        archive page ``baseurl + str(1999 - year)``.
    colnames : list of str
        Names for the (at most 13) columns kept from the table.
    baseurl : str
        URL prefix of the archive pages.

    Returns
    -------
    pandas.DataFrame
        The first table on the page, without its first data row, without any
        columns beyond the 13th and without all-empty rows, plus a ``year``
        column. If ``colnames`` does not fit the table, the scraped column
        names are kept and 'Unexpected Table Structure' is printed.

    Raises
    ------
    requests.RequestException
        If the request fails or times out.
    ValueError
        If ``pd.read_html`` finds no table in the response.
    """
    # NOTE(review): the 1999 offset reflects how the site numbers its archive
    # pages -- confirm it still matches the live URL scheme.
    suffix = str(1999 - year)
    url = baseurl + suffix
    # timeout so a stalled connection cannot hang the whole scraping run
    r = requests.get(url, headers=header, timeout=30)
    # read_html expects a path/URL/file-like object; passing the literal HTML
    # string is deprecated in recent pandas, so wrap it in StringIO.
    dfs = pd.read_html(io.StringIO(r.text), header = 1)
    df = dfs[0]  # only keep first table
    # drop first row (positional slice also copes with an empty table)
    df = df.iloc[1:]
    # drop extra columns
    if len(df.columns) > 13:
        df = df.iloc[:, :13]
    # drop rows with all missing values; copy so the assignments below act on
    # an independent frame instead of a slice (no SettingWithCopyWarning)
    df = df.dropna(how='all').copy()
    # try naming columns; a length mismatch is reported but not fatal
    try:
        df.columns = colnames
    except (ValueError, TypeError):
        print('Unexpected Table Structure')
    df['year'] = year
    return df

In [5]:
# Scrape seasons 1950 through 1998: one request per season, results kept in
# the same order as `seasons`.
seasons = [*range(1950, 1999)]
result = map(spfl_scraper, seasons)
tables = [table for table in result]

In [6]:
# Show the current directory, create the Scottish output folder if it does not
# exist yet, and change into it so the CSVs written next are saved there.
%pwd
%mkdir scottish_league_tables -p
%cd scottish_league_tables

'/home/alal/Dropbox/1_Research/league-tables-scraper'

/home/alal/Dropbox/1_Research/league-tables-scraper/scottish_league_tables


In [7]:
# Save each scraped table as scottish_league_<season>.csv in the current directory.
for season, table in zip(seasons, tables):
    fn = ''.join(['scottish_league_', str(season), '.csv'])
    table.to_csv(fn)

In [8]:
# Step back out of the output folder to its parent directory.
%cd ..

/home/alal/Dropbox/1_Research/league-tables-scraper


# England 

## [England league history](http://www.rsssf.com/engpaul/fla/league.html)

In [4]:
# Column names applied, left to right, to the league table parsed by epl_scraper.
cols = ['Pos', 'Team', 'Played']
cols += ['home_w', 'home_d', 'home_l', 'home_f', 'home_a']
cols += ['away_w', 'away_d', 'away_l', 'away_f', 'away_a']
cols.append('Pts')

# URL prefix of the per-season English league pages on rsssf.com.
rsssf_base = 'http://www.rsssf.com/engpaul/fla/'

In [5]:
# %cd $root
# Create the English output folder if it does not exist yet and change into
# it; epl_scraper writes its .txt/.csv files relative to the current directory.
%mkdir english_league_tables -p
%cd english_league_tables

/mnt/d/Dropbox/Dropbox/1_Research/league-tables-scraper/english_league_tables


In [87]:
def epl_scraper(y1, baseurl = rsssf_base, colnames = cols):
    """
    Scrape one season's league table from RSSSF and save it to disk.

    Fetches ``baseurl + '<y1>-<yy>.html'`` (``yy`` = last two digits of the
    season's end year), takes the first ``<pre>`` block and keeps the lines
    from the one starting with ' 1.' up to the next empty line (or the end of
    the block). The cleaned lines are written to
    ``english_league_<end year>.txt`` and, when pandas can parse them as
    fixed-width text, to ``english_league_<end year>.csv`` (the csv may be
    buggy). Both files go to the current working directory.

    Parameters
    ----------
    y1 : int
        Calendar year in which the season starts, e.g. 1950 for 1950-51.
    baseurl : str
        URL prefix of the season pages.
    colnames : list of str
        Column names for the parsed table.

    Returns
    -------
    list of str or None
        None when the csv was written; the cleaned table lines when building
        the csv failed, so they can be fixed by hand.

    Raises
    ------
    requests.RequestException
        If the request fails or times out.
    IndexError
        If the page contains no ``<pre>`` block.
    ValueError
        If no line of the block starts with ' 1.'.
    """
    end_year = y1 + 1
    # Two-digit end year. Same value as the former `y1 % 1900 + 1` for
    # 1909-1998; outside that range it stays two digits and is zero-padded.
    # NOTE(review): the 'YYYY-YY.html' naming for those other seasons is an
    # assumption -- confirm against the site before relying on it.
    y2 = end_year % 100
    url = baseurl + '{0}-{1:02d}'.format(y1, y2) + '.html'
    # timeout so a stalled connection cannot hang the whole scraping run
    r = requests.get(url, headers=header, timeout=30)
    page_content = bs.BeautifulSoup(r.content, "html.parser")
    pre_blocks = page_content.find_all("pre")
    lines = pre_blocks[0].text.replace('\r','').split('\n')
    # the table starts at the row ranked ' 1.' and ends at the next empty line
    starts = [x[0:3] for x in lines]
    startat = starts.index(' 1.')
    try:
        stopat = starts.index('', startat)
    except ValueError:
        # no blank line after the table: it runs to the end of the block
        stopat = len(lines)
    table = lines[startat:stopat]
    # replace pesky characters that break things
    table = [x.replace('.', '').replace('&amp;', ' ')
             .replace('*', ' ').replace('+', ' ')
             .rstrip() for x in table]
    # filename (full end year; equals the former '19' + str(y2) for 1909-1998)
    fn = 'english_league_' + str(end_year)
    with open(fn + '.txt', 'w') as f:
        f.write('\n'.join(table))
    try:
        leaguetable = pd.read_fwf(fn + '.txt', header = None, names = colnames)
        # NOTE(review): the year column holds the two-digit end year (e.g. 51),
        # unlike the four-digit year in the file name -- kept as in the original.
        leaguetable['year'] = y2
        leaguetable.to_csv(fn + '.csv')
    except Exception:
        # str() is required: y1 is an int, and int + str raised a TypeError
        # here, which hid this message and skipped the fallback return
        print(str(y1) + ' has oddly formatted table. rewrite csv manually')
        return table
    return None

In [88]:
# Scrape the seasons starting in 1950 through 1994. A failure in one season is
# reported (labelled by the season's end year) and the loop continues.
# Catching Exception instead of a bare except keeps a keyboard/kernel
# interrupt able to stop the loop.
seasons = list(range(1950, 1995, 1))
for s in seasons:
    try:
        epl_scraper(s)
    except Exception as err:
        # include the cause so failed seasons can be diagnosed
        print(str(s+1) + ' scraping exception', repr(err))