In [98]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

class HTMLTableParser:

    def parse_url(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        return [self.parse_html_table(table) for table in soup.find_all('table')]  

    def parse_html_table(self, table):
        n_columns = 0
        n_rows=0
        column_names = []

        # Find number of rows and columns
        # we also find the column titles if we can
        for row in table.find_all('tr'):

            # Determine the number of rows in the table
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows+=1
                if n_columns == 0:
                    # Set the number of columns for our table
                    n_columns = len(td_tags)

            # Handle column names if we find them
            th_tags = row.find_all('th') 
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())

        # Safeguard on Column Titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            column_names = column_names + [""]*(n_columns - len(column_names))

        columns = column_names if len(column_names) > 0 else range(0,n_columns)
        df = pd.DataFrame(columns = columns,
                          index= range(0,n_rows))
        row_marker = 0
        for row in table.find_all('tr'):
            column_marker = 0
            columns = row.find_all('td')
            for column in columns:
                df.iat[row_marker,column_marker] = column.get_text()
                column_marker += 1
            if len(columns) > 0:
                row_marker += 1

        # Convert to float if possible
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

In [132]:
def get_new_team_name(league,team_name):
    return {
        "eng-premier-league": team_name.split(" FC")[0].split(" AFC")[0].split("AFC ")[-1]
    }.get(league,team_name)

In [99]:
hp = HTMLTableParser()

url = "http://www.worldfootball.net/schedule/eng-premier-league-2006-2007-spieltag/1"

all_tables = hp.parse_url(url)

all_tables[3].columns = ['#', '', 'Team', 'M.', 'W', 'D', 'L', 'goals', 'Dif.', 'Pt.']
all_tables[3]

Unnamed: 0,#,Unnamed: 2,Team,M.,W,D,L,goals,Dif.,Pt.
0,1.0,\n\n,\nManchester United\n,1.0,1.0,0.0,0.0,5:1,4.0,3.0
1,2.0,\n\n,\nChelsea FC\n,1.0,1.0,0.0,0.0,3:0,3.0,3.0
2,,\n\n,\nPortsmouth FC\n,1.0,1.0,0.0,0.0,3:0,3.0,3.0
3,4.0,\n\n,\nWest Ham United\n,1.0,1.0,0.0,0.0,3:1,2.0,3.0
4,5.0,\n\n,\nBolton Wanderers\n,1.0,1.0,0.0,0.0,2:0,2.0,3.0
5,6.0,\n\n,\nReading FC\n,1.0,1.0,0.0,0.0,3:2,1.0,3.0
6,7.0,\n\n,\nEverton FC\n,1.0,1.0,0.0,0.0,2:1,1.0,3.0
7,,\n\n,\nNewcastle United\n,1.0,1.0,0.0,0.0,2:1,1.0,3.0
8,9.0,\n\n,\nArsenal FC\n,1.0,0.0,1.0,0.0,1:1,0.0,1.0
9,,\n\n,\nAston Villa\n,1.0,0.0,1.0,0.0,1:1,0.0,1.0


In [100]:
leagues = ["eng-premier-league","esp-primera-division","ita-serie-a","fra-ligue-1",
           "ned-eredivisie","bundesliga","pol-ekstraklasa","sui-super-league"]
seasons = ['2008/2009', '2009/2010', '2010/2011', '2011/2012', '2012/2013', '2013/2014', '2014/2015', '2015/2016']
fetures = ["Pts", "Norm_Pts", "Position","Goals_Diff","Pos_Goals","Neg_Goals"]
teams_per_season = {}
stages  = {}

for league in leagues:
    url = "http://www.worldfootball.net/schedule/" + league + "-2008-2009-spieltag/1"
    table = hp.parse_url(url)[3]
    table.columns = ['#', '', 'Team', 'M.', 'W', 'D', 'L', 'goals', 'Dif.', 'Pt.']
    teams_per_season[league] = len(table["Team"])
    num_of_stages = (teams_per_season[league]-1)*2
    stages[league] = range(1, num_of_stages+1)

In [101]:
table_dic = {}

for league in leagues:
    for season in seasons:
        temp_num_of_stages = stages[league]
        for stage in temp_num_of_stages:
            url = "http://www.worldfootball.net/schedule/" + league + "-" + season.replace("/","-") + "-spieltag/" + str(stage)
            table = hp.parse_url(url)[3]
            table.columns = ['#', '', 'Team', 'M.', 'W', 'D', 'L', 'goals', 'Dif.', 'Pt.']
            table = table[["Team", "Pt.", "Dif.", "goals"]]
            table["Team"] = table["Team"].apply(lambda x: x.split("\n")[1])
            table.sort_values("Team", ascending = True, inplace = True)
            table.reset_index(inplace = True)
            table = table[['Team', 'Pt.', "Dif.", "goals", 'index']]
            table.columns = ['Team', 'Pt.', "Dif.", "goals", 'position']
            table["position"] = table["position"] + 1
            table_dic[(league,season,stage)] = table

In [133]:
all_teams   = {}
new_teams   = {}
all_stages  = {}
all_seasons = {}

for league in leagues:
    all_teams[league]   = []
    new_teams[league]   = []
    
    all_seasons[league] = seasons * teams_per_season[league]
    all_stages[league]  = (list(stages[league])+[0]) * len(fetures)
    all_fetures = fetures * len(all_stages[league])

    for season in seasons:
        all_teams[league] = all_teams[league] + list(table_dic[(league,season,1)]["Team"].unique())
    
    all_seasons[league].sort()
    all_stages[league].sort()

    temp_all_teams = all_teams[league]
    for team in temp_all_teams:
        new_teams[league].append(get_new_team_name(league,team))

all_teams = new_teams

In [134]:
table_per_stage = {}

for league in leagues:
    index_arrays   = [all_seasons[league], all_teams[league]]
    columns_arrays = [all_stages[league], all_fetures]

    index_tuples   = list(zip(*index_arrays))
    columns_tuples = list(zip(*columns_arrays))

    index   = pd.MultiIndex.from_tuples(index_tuples, names = ['Season', 'Team'])
    columns = pd.MultiIndex.from_tuples(columns_tuples, names = ['Stage', ''])

    table_per_stage[league] = pd.DataFrame(np.zeros((len(all_teams[league]),len(all_stages[league]))), index = index, columns = columns)

In [135]:
for league in leagues:
    for season in seasons:
        temp_num_of_stages = stages[league]
        for stage in temp_num_of_stages:
            table_per_stage[league][stage]["Pts"][season]        = table_dic[(league,season,stage)]["Pt."]
            table_per_stage[league][stage]["Norm_Pts"][season]   = table_dic[(league,season,stage)]["Pt."]/table_dic[(league,season,stage)]["Pt."].max()
            table_per_stage[league][stage]["Position"][season]   = table_dic[(league,season,stage)]["position"]
            table_per_stage[league][stage]["Goals_Diff"][season] = table_dic[(league,season,stage)]["Dif."]
            table_per_stage[league][stage]["Pos_Goals"][season]  = table_dic[(league,season,stage)]["goals"].apply(lambda x: x.split(":")[0])
            table_per_stage[league][stage]["Neg_Goals"][season]  = table_dic[(league,season,stage)]["goals"].apply(lambda x: x.split(":")[1])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [136]:
table_per_stage["ned-eredivisie"]

Unnamed: 0_level_0,Stage,0,0,0,0,0,0,1,1,1,1,...,33,33,33,33,34,34,34,34,34,34
Unnamed: 0_level_1,Unnamed: 1_level_1,Pts,Norm_Pts,Position,Goals_Diff,Pos_Goals,Neg_Goals,Pts,Norm_Pts,Position,Goals_Diff,...,Position,Goals_Diff,Pos_Goals,Neg_Goals,Pts,Norm_Pts,Position,Goals_Diff,Pos_Goals,Neg_Goals
Season,Team,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2008/2009,ADO Den Haag,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0
2008/2009,AFC Ajax,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0
2008/2009,AZ Alkmaar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,80.0,0.0,0.0,0.0,0.0,0.0
2008/2009,De Graafschap,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0
2008/2009,FC Groningen,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,56.0,0.0,0.0,0.0,0.0,0.0
2008/2009,FC Twente,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,69.0,0.0,0.0,0.0,0.0,0.0
2008/2009,FC Utrecht,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0
2008/2009,FC Volendam,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,0.0
2008/2009,Feyenoord,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,45.0,0.0,0.0,0.0,0.0,0.0
2008/2009,Heracles Almelo,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0


In [126]:
for league in leagues:
    table_per_stage[league].to_csv("databases/table_per_stage/" + league + ".csv")