In [4]:
import pandas as pd
import requests
import os
from datetime import datetime

<h1>Téléchargement des données par saison de foot</h1>

In [2]:
# Working directory
path = os.getcwd()
path_dataset = os.path.join(path,"Datasets")

In [3]:
# Top 5 Leagues
leagues = {"E0":"England", 
           "I1":"Italy", 
           "D1":"Germany", 
           "SP1":"Spain", 
           "F1":"France"}

In [7]:
def season_list(number_season=1):
    seasons = []
    LIMIT_SAISON = 2005
    year = datetime.today().year
    # limit the max old season to 2005-2006
    if (year - number_season) < LIMIT_SAISON:
        number_season = year - LIMIT_SAISON
    #Generate season ex: ["0809",..,"1819","1920","2021",...]
    for i in range(number_season):
        year_before = year-1
        season = str(year_before)[2:] + str(year)[2:]
        seasons.append(season)
        year -= 1
    return seasons

In [11]:
# URL source dataset
url = "https://www.football-data.co.uk/mmz4281"

In [44]:
def download_datasets(league, season):
    season = str(season)
    pathdir = os.path.join(path_dataset, leagues[league])
    # Create Dataset directory if not exist
    if os.path.isdir(pathdir) == False:
        os.makedirs(pathdir)

    file_url = league+"csv"
    full_url = os.path.join(url,season,file_url)
    response = requests.get(full_url)
    file_data = os.path.join(pathdir, season+".csv")
    open(file_data, "wb").write(response.content)

In [171]:
# Download Dataset of 15 last season by top 5 leagues
seasons = season_list(15)
n_season = len(seasons)
print("Download of {} last season by League".format(n_season))

for league in leagues:
    name = leagues[league]
    n_iter = 1
    for season in seasons:
        log = "Season downloaded: {}/{} for {} League".format(n_iter, n_season, name)
        download_datasets(league, season)
        print(log,end="\r")
        n_iter += 1
    print("")

Download of 15 last season by League
Season downloaded: 15/15 for England League
Season downloaded: 15/15 for Italy League
Season downloaded: 15/15 for Germany League
Season downloaded: 15/15 for Spain League
Season downloaded: 15/15 for France League


<h1>Traitement et sélection des colonnes exploitées</h1>

In [172]:
def float2int(df):
    df = df.dropna(axis=0)
    for index, typeof in df.dtypes.items():
        if typeof == "float64":
            df[index] = df[index].astype(int)
    return df

In [174]:
# Selected columns
cols = ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 
        'HS', 'AS', 'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']

for dirs, _, files in os.walk(path_dataset):
    for f in files:
        path_file = dirs.split('/')[-1]
        outfile = os.path.join(dirs, path_file+"-"+f)
        file = os.path.join(dirs,f)
        df = pd.read_csv(file)
        try:
            df = df[cols]
            df = float2int(df)
        except:
            print(file)
        s = f.split('.')
        df['Season'] = str(s[0][:2])+"-"+str(s[0][2:])
        df.to_csv(outfile, index=None)
        os.remove(os.path.join(dirs,f))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[index] = df[index].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Season'] = str(s[0][:2])+"-"+str(s[0][2:])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[index] = df[index].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

#### Cas d'étude sur la saison premier league 2011/2012. Qui fut partie des meilleures saisons disputées ces dernières années

In [193]:
# Choose one dataset for exploring and analysis data
infile = os.path.join(path_dataset,"England/England-1112.csv")
df = pd.read_csv(infile)

In [194]:
team_home = dict(df.groupby('HomeTeam').size()).keys()
team_away = dict(df.groupby('AwayTeam').size()).keys()
teams = set(team_home) | set(team_away)

cols = ["Points","Games", "Win", "Draw", "Loss", "Scored","Conceded","DiffGoal"]
classement = pd.DataFrame(data=0, index=teams, columns=cols)
WIN, DRAW = 3, 1

for index, rows in df.iterrows():
    home_team = rows['HomeTeam']
    away_team = rows['AwayTeam']
    home_scored = rows['FTHG']
    home_conceded = rows['FTAG']
    ftr = rows['FTR']  
    # Goal for Home team and Away
    classement["Games"][home_team] += 1
    classement["Scored"][home_team] += home_scored
    classement["Conceded"][home_team] += home_conceded
    classement["DiffGoal"][home_team] += home_scored - home_conceded
    classement["Games"][away_team] += 1
    classement["Scored"][away_team] +=  home_conceded
    classement["Conceded"][away_team] += home_scored
    classement["DiffGoal"][away_team] += home_conceded - home_scored
    
    if ftr == "H":
        classement["Points"][home_team] += WIN
        classement['Win'][home_team] += 1
        classement['Loss'][away_team] += 1
    elif ftr == "A":
        classement["Points"][away_team] += WIN
        classement['Win'][away_team] += 1
        classement['Loss'][home_team] += 1
    elif ftr == "D":
        classement["Points"][home_team] += DRAW
        classement["Points"][away_team] += DRAW
        classement['Draw'][home_team] += 1
        classement['Draw'][away_team] += 1
classement.sort_values(["Points","DiffGoal"], ascending=False, inplace=True)

In [195]:
def is_champion(df):
    MIN_MATCH = 10
    teams = len(df.index)
    matchs = (teams-1)*2
    # Pour eviter le cas des debuts de championnat on fixe un minimum de 18 matchs
    if matchs < MIN_MATCH:
        matchs = MIN_MATCH
    first_team_point = df["Points"].values[0]
    first_team = df.index[0]
    recap = df[["Points","Games"]][1:]
    result = True
    for team, row in recap.iterrows():
        possible_point = (matchs - row['Games']) * WIN
        if first_team_point <= possible_point + row['Points'] and matchs != row['Games']:
            result = False
            break
    return result, first_team

In [196]:
classement

Unnamed: 0,Points,Games,Win,Draw,Loss,Scored,Conceded,DiffGoal
Man City,89,38,28,5,5,93,29,64
Man United,89,38,28,5,5,89,33,56
Arsenal,70,38,21,7,10,74,49,25
Tottenham,69,38,20,9,9,66,41,25
Newcastle,65,38,19,8,11,56,51,5
Chelsea,64,38,18,10,10,65,46,19
Everton,56,38,15,11,12,50,40,10
Liverpool,52,38,14,10,14,47,40,7
Fulham,52,38,14,10,14,48,51,-3
West Brom,47,38,13,8,17,45,52,-7


In [197]:
is_champion(classement)

(True, 'Man City')

In [198]:
def matrix_score(df):
    teams = classement.index
    matrix = pd.DataFrame(data="", index=teams, columns=teams)
    for index, row in df.iterrows():
        result = str(row['FTHG'])+"-"+ str(row['FTAG'])
        matrix[row['HomeTeam']][row['AwayTeam']] = result
    return matrix

In [199]:
matrix_score(df)

Unnamed: 0,Man City,Man United,Arsenal,Tottenham,Newcastle,Chelsea,Everton,Liverpool,Fulham,West Brom,Swansea,Norwich,Sunderland,Stoke,Wigan,Aston Villa,QPR,Bolton,Blackburn,Wolves
Man City,,1-6,1-0,1-5,0-2,2-1,1-0,1-1,2-2,0-0,1-0,1-6,1-0,1-1,0-1,0-1,2-3,2-3,0-4,0-2
Man United,1-0,,1-2,1-3,3-0,3-3,0-1,1-1,0-5,1-2,0-1,1-2,0-1,1-1,1-0,0-1,0-2,0-5,0-2,0-5
Arsenal,1-0,8-2,,2-1,0-0,3-5,0-1,1-2,2-1,2-3,3-2,1-2,1-2,1-1,0-4,1-2,2-1,0-0,4-3,0-3
Tottenham,3-2,3-0,5-2,,2-2,0-0,1-0,0-0,1-3,1-3,1-1,0-2,0-0,2-1,1-2,1-1,1-0,1-4,1-2,0-2
Newcastle,3-1,1-1,2-1,5-0,,0-2,3-1,3-1,5-2,1-3,0-2,4-2,0-1,1-3,4-0,1-1,0-0,0-2,0-2,1-2
Chelsea,2-1,3-1,0-0,1-1,0-3,,2-0,4-1,1-1,1-0,1-1,0-0,1-2,0-0,1-1,2-4,1-0,1-5,0-1,1-2
Everton,2-0,4-4,1-0,2-0,2-1,3-1,,3-0,1-3,0-1,0-2,2-2,1-1,1-1,1-1,1-1,1-1,0-2,0-1,0-0
Liverpool,3-0,2-1,0-2,4-0,2-0,1-2,0-2,,1-0,0-2,1-0,0-3,1-0,1-0,0-0,0-2,3-2,3-1,2-3,0-3
Fulham,3-0,1-0,1-1,2-0,2-1,1-1,4-0,0-1,,0-0,2-0,1-1,0-0,2-0,0-2,1-0,0-1,0-3,3-1,2-0
West Brom,4-0,2-0,3-0,1-0,2-3,2-1,2-0,0-1,1-1,,3-0,0-1,2-2,1-2,1-1,1-2,1-1,2-2,1-2,1-5


In [201]:
# Matrix goal difference by match season based on matrix score
def matrix_diff_score(diffgoal):
    matrix_df = matrix_score(df)
    for i, row in matrix_df.iterrows():
        for u,score in row.items():
            if u == i:
                matrix_df[u][i] = "-"
            if score != "":
                tmp = score.split('-')
                d = int(tmp[0]) - int(tmp[1])
                if abs(d) == diffgoal:
                    matrix_df[u][i] = score
                else:
                    matrix_df[u][i] = ""
    return matrix_df

In [202]:
matrix_diff_score(3)

Unnamed: 0,Man City,Man United,Arsenal,Tottenham,Newcastle,Chelsea,Everton,Liverpool,Fulham,West Brom,Swansea,Norwich,Sunderland,Stoke,Wigan,Aston Villa,QPR,Bolton,Blackburn,Wolves
Man City,-,,,,,,,,,,,,,,,,,,,
Man United,,-,,,3-0,,,,,,,,,,,,,,,
Arsenal,,,-,,,,,,,,,,,,,,,,,0-3
Tottenham,,3-0,5-2,-,,,,,,,,,,,,,,1-4,,
Newcastle,,,,,-,,,,5-2,,,,,,,,,,,
Chelsea,,,,,0-3,-,,4-1,,,,,,,,,,,,
Everton,,,,,,,-,3-0,,,,,,,,,,,,
Liverpool,3-0,,,,,,,-,,,,0-3,,,,,,,,0-3
Fulham,3-0,,,,,,,,-,,,,,,,,,0-3,,
West Brom,,,3-0,,,,,,,-,3-0,,,,,,,,,
