### Nettoyage : suppression ou traitement des valeurs manquantes et incohérentes.

In [465]:
import pandas as pd
import os
import matplotlib.pyplot as plt

In [466]:
# Directory layout: raw per-team folders in, grouped CSVs out.
raw_teams_path = '../data/raw/teams'
processed_path = '../data/processed/'
processed_grouped_data = os.path.join(processed_path, 'grouped-teams')

# os.listdir returns entries in arbitrary order and also lists plain files,
# so keep only sub-directories and sort them for a reproducible row order.
teams = sorted(
    entry for entry in os.listdir(raw_teams_path)
    if os.path.isdir(os.path.join(raw_teams_path, entry))
)

- Grouper les dataframes

In [467]:
# Collect one frame per team and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row each time.
player_frames = []
match_frames = []

os.makedirs(processed_grouped_data, exist_ok=True)

for team_name in teams:
    team_dir = os.path.join(raw_teams_path, team_name)

    player_df = pd.read_csv(os.path.join(team_dir, 'players.csv'))
    player_df['team'] = team_name  # tag rows with the folder they came from
    player_frames.append(player_df)

    match_df = pd.read_csv(os.path.join(team_dir, 'matches.csv'))
    match_df['team'] = team_name
    match_frames.append(match_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
players_dataframes = (
    pd.concat(player_frames, ignore_index=True) if player_frames else pd.DataFrame()
)
matches_dataframes = (
    pd.concat(match_frames, ignore_index=True) if match_frames else pd.DataFrame()
)

# Write the grouped files once, after every team has been read.
# Nothing is written when no team was found.
if player_frames:
    players_dataframes.to_csv(os.path.join(processed_grouped_data, 'players.csv'), index=False)
    matches_dataframes.to_csv(os.path.join(processed_grouped_data, 'matches.csv'), index=False)

    



##### Chargement des dataframes

In [468]:
# Read the two grouped CSV files back from the grouped-teams folder.
players_dataframe, matches_dataframe = (
    pd.read_csv(os.path.join(processed_grouped_data, csv_name))
    for csv_name in ('players.csv', 'matches.csv')
)

##### Identification des valeurs manquantes

- Joueurs

In [469]:
# Number of missing values in each column of the players table.
players_missing_counts = players_dataframe.isnull().sum()
print('-- Valeur manquantes (joueurs):')
print(players_missing_counts, '\n')


-- Valeur manquantes (joueurs):
Player      0
Nation      6
Pos         0
Age         5
MP          0
Starts      0
Min       128
90s       128
Gls       128
Ast       128
G+A       128
G-PK      128
PK        128
PKatt     128
CrdY      128
CrdR      128
team        0
dtype: int64 



In [470]:
# Number of missing values in each column of the matches table.
matches_missing_counts = matches_dataframe.isnull().sum()
print('-- Valeur manquantes (matches):')
print(matches_missing_counts, '\n')


-- Valeur manquantes (matches):
Date               0
Time               0
Comp               0
Round              0
Day                0
Venue              0
Result             0
GF                 0
GA                 0
Opponent           0
xG               126
xGA              126
Poss              22
Attendance         6
Captain            0
Formation          0
Opp Formation      0
Referee            5
team               0
dtype: int64 



##### Nettoyage : traitement des valeurs manquantes

- Remplacer par 0 les statistiques des joueurs qui n'ont jamais joué de match

In [471]:
# Rows with zero matches played (MP == 0) get 0 in every column
# from 'Min' through 'CrdR' (label slice, both ends included).
never_played = players_dataframe['MP'].eq(0)
players_dataframe.loc[never_played, 'Min':'CrdR'] = 0

- Conversion en entier

In [472]:
def _strip_thousands_separator(raw_value):
    """Turn a string such as '12,345' into an int; return any other value unchanged."""
    if type(raw_value) == str:
        return int(raw_value.replace(',', ''))
    return raw_value


matches_dataframe['Attendance'] = matches_dataframe['Attendance'].map(_strip_thousands_separator)

- Remplacer par le mode, la médiane ou la moyenne

In [473]:
# Players: most frequent nation and median age stand in for missing values.
most_common_nation = players_dataframe['Nation'].mode()[0]
players_dataframe['Nation'] = players_dataframe['Nation'].fillna(most_common_nation)

median_age = players_dataframe['Age'].median()
players_dataframe['Age'] = players_dataframe['Age'].fillna(median_age)

# Matches: each of these columns is filled with its own median.
for median_filled_column in ('Poss', 'Attendance'):
    column_median = matches_dataframe[median_filled_column].median()
    matches_dataframe[median_filled_column] = matches_dataframe[median_filled_column].fillna(column_median)

- Remplacer par "Unknown"

In [474]:
# Matches without a recorded referee get the placeholder label.
referee_filled = matches_dataframe['Referee'].fillna(value="Unknown")
matches_dataframe['Referee'] = referee_filled

- Remplacer par la moyenne de l'équipe

In [475]:
def _fill_with_rounded_mean(values):
    """Fill the gaps of one team's series with that series' mean, rounded to 1 decimal."""
    rounded_mean = round(values.mean(), 1)
    return values.fillna(rounded_mean)


# Apply the per-team fill to both expected-goals columns.
for expected_goals_column in ('xG', 'xGA'):
    per_team = matches_dataframe.groupby('team')[expected_goals_column]
    matches_dataframe[expected_goals_column] = per_team.transform(_fill_with_rounded_mean)

- Après le traitement des valeurs manquantes

In [476]:
# Re-count the missing values per players column after the fills above.
players_missing_after = players_dataframe.isnull().sum()
print('-- Valeur manquantes (joueurs):')
print(players_missing_after, '\n')


-- Valeur manquantes (joueurs):
Player    0
Nation    0
Pos       0
Age       0
MP        0
Starts    0
Min       0
90s       0
Gls       0
Ast       0
G+A       0
G-PK      0
PK        0
PKatt     0
CrdY      0
CrdR      0
team      0
dtype: int64 



In [477]:
# Re-count the missing values per matches column after the fills above.
matches_missing_after = matches_dataframe.isnull().sum()
print('-- Valeur manquantes (matches):')
print(matches_missing_after, '\n')

-- Valeur manquantes (matches):
Date             0
Time             0
Comp             0
Round            0
Day              0
Venue            0
Result           0
GF               0
GA               0
Opponent         0
xG               0
xGA              0
Poss             0
Attendance       0
Captain          0
Formation        0
Opp Formation    0
Referee          0
team             0
dtype: int64 



##### Standardisation:

- Types de colonnes

In [478]:
# Show the dtype of every players column.
players_column_types = players_dataframe.dtypes
print(players_column_types)

Player     object
Nation     object
Pos        object
Age       float64
MP          int64
Starts      int64
Min        object
90s       float64
Gls       float64
Ast       float64
G+A       float64
G-PK      float64
PK        float64
PKatt     float64
CrdY      float64
CrdR      float64
team       object
dtype: object


In [479]:
# Show the dtype of every matches column.
matches_column_types = matches_dataframe.dtypes
print(matches_column_types)

Date              object
Time              object
Comp              object
Round             object
Day               object
Venue             object
Result            object
GF                object
GA                object
Opponent          object
xG               float64
xGA              float64
Poss             float64
Attendance       float64
Captain           object
Formation         object
Opp Formation     object
Referee           object
team              object
dtype: object


- Table joueurs

In [480]:
# 'Min' is an object column: render every value as text, drop the ','
# separators, then cast the whole column in one vectorised step.
players_dataframe['Min'] = (
    players_dataframe['Min']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype('int64')
)

# Cast the float columns to integers. astype truncates toward zero like
# int() and raises if a NaN is still present.
integer_columns = ['Age', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt', 'CrdY', 'CrdR']
for integer_column in integer_columns:
    players_dataframe[integer_column] = players_dataframe[integer_column].astype('int64')


- Table matches

In [481]:
# Garde la premiere partie (si le temps a plusieurs parties)
def _first_token(text):
    """Return the part of the text before the first space."""
    return text.split(' ')[0]


def _drop_two_char_prefix(opponent_name):
    """Drop the first three characters when the first word is two characters long."""
    if len(opponent_name.split(' ')[0]) == 2:
        return opponent_name[3:]
    return opponent_name


def _attendance_as_number(raw_value):
    """Strip ',' from string values and convert to int; return other values unchanged."""
    if type(raw_value) == str:
        return int(raw_value.replace(',', ''))
    return raw_value


def _without_diamond(formation):
    """Remove the '◆' marker from a formation string."""
    return formation.replace('◆', '')


# Keep only the first part of the time when it has several parts.
matches_dataframe['Time'] = matches_dataframe['Time'].map(_first_token)

# The opponent name sometimes starts with extra characters.
matches_dataframe['Opponent'] = matches_dataframe['Opponent'].map(_drop_two_char_prefix)

# Remove the comma and convert to integer.
matches_dataframe['Attendance'] = matches_dataframe['Attendance'].map(_attendance_as_number)

# Remove the extra character when present.
for formation_column in ('Formation', 'Opp Formation'):
    matches_dataframe[formation_column] = matches_dataframe[formation_column].map(_without_diamond)

# Fill the remaining gaps: median for Attendance and Poss, rounded mean for xG and xGA.
attendance_median = int(matches_dataframe['Attendance'].median())
matches_dataframe['Attendance'] = matches_dataframe['Attendance'].fillna(attendance_median)

possession_median = matches_dataframe['Poss'].median()
matches_dataframe['Poss'] = matches_dataframe['Poss'].fillna(possession_median)

for expected_goals_column in ('xG', 'xGA'):
    overall_mean = round(matches_dataframe[expected_goals_column].mean(), 1)
    matches_dataframe[expected_goals_column] = matches_dataframe[expected_goals_column].fillna(overall_mean)

# Convert the two filled columns to integers.
for whole_number_column in ('Attendance', 'Poss'):
    matches_dataframe[whole_number_column] = matches_dataframe[whole_number_column].map(int)


- Traiter les lignes où le résultat est un match nul : garder le nombre de tirs au but et changer les résultats

In [482]:
# Rows whose 'GA' value has two space-separated parts carry a second number
# in parentheses. Compute the mask once and reuse it for the read and the write.
shootout_mask = matches_dataframe['GA'].str.split(' ').str.len() == 2
shootout_columns = ['Result', 'GF', 'GA']

# Work on an explicit copy so the edits below never touch a view of the frame.
small_df = matches_dataframe.loc[shootout_mask, shootout_columns].copy()

# Keep only the number between the parentheses of the last token.
for goals_column in ('GF', 'GA'):
    small_df[goals_column] = small_df[goals_column].apply(lambda x: int(x.split(" ")[-1][1:-1]))

# Vectorised comparison: "W" when GF > GA, otherwise "L". Unlike a row-wise
# apply(axis=1), this also yields a plain Series when no row matched.
small_df['Result'] = (small_df['GF'] > small_df['GA']).map({True: "W", False: "L"})

# Write the corrected values back only when there is something to update.
# NOTE(review): rows outside the mask keep their original 'GF'/'GA' values,
# so both columns stay object dtype here. Confirm that is intended.
if shootout_mask.any():
    matches_dataframe.loc[shootout_mask, shootout_columns] = small_df

- Suppression des matchs en double : on ne garde que les lignes où `Venue` vaut `Home`

In [483]:
# Keep only the rows whose 'Venue' is "Home".
is_home_row = matches_dataframe["Venue"].eq("Home")
matches_dataframe = matches_dataframe[is_home_row]

##### Sauvegarder les résultats

In [484]:
# Output folder for the cleaned tables; created on first run.
processed_path = '../data/processed/'
processed_cleaned_data = os.path.join(processed_path, 'cleaned-teams')
os.makedirs(processed_cleaned_data, exist_ok=True)

# Write each cleaned table to its CSV file, without the index column.
for cleaned_frame, file_suffix in (
    (players_dataframe, '/players.csv'),
    (matches_dataframe, '/matches.csv'),
):
    cleaned_frame.to_csv(processed_cleaned_data + file_suffix, index=False)