In [168]:
!pip install -r requirements.txt



In [169]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time

In [170]:
import functions as fc

In [171]:
url_ffe = 'http://www.echecs.asso.fr/'
suffix_all_clubs = 'ListeClubs.aspx?Action=CLUBCOMITE&ComiteRef='
suffix_all_players_club = 'ListeJoueurs.aspx?Action=JOUEURCLUBREF&ClubRef='

## Faire liste clubs

In [172]:
def get_infos_club_from_row(row):
    fields = row.find_all('td')
    departement = fields[0].text.strip()
    commune = fields[1].text.strip()
    club = fields[2].text.strip()
    try:
        suffix_club = fields[2].find('a').get('href')
        ref_club = ''.join(c for c in suffix_club if c.isdigit())
    except:
        suffix_club = None
        ref_club = None
    return [club, ref_club, suffix_club, commune, departement]

In [173]:
def get_all_clubs_in_departement(departement):
    
    liste_clubs_departement = []
    
    url = url_ffe + suffix_all_clubs + "{:02d}".format(departement)
    request_text = requests.get(url).text
    soup = BeautifulSoup(request_text, 'html.parser')
    
    try:
        zone = soup.find('div', {'class':'page-mid'})
        tableau = zone.find('table')
        rows = tableau.find_all('tr')

        for i in range(1,len(rows)):
            liste_clubs_departement.append(get_infos_club_from_row(rows[i]))
    except:
        pass
    return liste_clubs_departement

In [174]:
def get_all_clubs_france():
    liste_clubs = []
    for departement in tqdm(range(100)):
        liste_clubs += get_all_clubs_in_departement(departement)
        time.sleep(0.5)
    return liste_clubs

In [175]:
liste_clubs = get_all_clubs_france()

100%|██████████| 100/100 [01:02<00:00,  1.60it/s]


In [176]:
df_clubs = pd.DataFrame(liste_clubs, columns = ['club_name', 'club_reference', 'suffix_url', 'commune', 'departement'])

In [177]:
df_clubs

Unnamed: 0,club_name,club_reference,suffix_url,commune,departement
0,La Tour de Bage,2833,FicheClub.aspx?Ref=2833,BAGE LA VILLE,01
1,Club d'Echecs de Belley,3232,FicheClub.aspx?Ref=3232,BELLEY,01
2,Cercle d'Echecs Bressan,1168,FicheClub.aspx?Ref=1168,BOURG EN BRESSE,01
3,La Tour de la Dombes,1171,FicheClub.aspx?Ref=1171,CHATILLON SUR CHALARONNE,01
4,Amicale Echecs,2725,FicheClub.aspx?Ref=2725,GEX,01
...,...,...,...,...,...
812,C.E. Air France Siege Roissy,934,FicheClub.aspx?Ref=934,ROISSY EN FRANCE,95
813,L'Echiquier de Santeuil,2506,FicheClub.aspx?Ref=2506,SANTEUIL,95
814,L'Echiquéenne de Survilliers,2185,FicheClub.aspx?Ref=2185,SURVILLIERS,95
815,Le Pion Passé Vaurealien,2810,FicheClub.aspx?Ref=2810,VAUREAL,95


## Faire liste joueurs

In [178]:
def get_infos_player_from_row(row):
    fields = row.find_all('td')
    elo_lent = int(''.join(c for c in fields[4].text if c.isdigit()))
    elo_rapide = int(''.join(c for c in fields[5].text if c.isdigit()))
    elo_blitz = int(''.join(c for c in fields[6].text if c.isdigit()))
    category = fields[7].text.strip()
    age = category[:3]
    sex = category[-1]
    club_name = fields[9].text.strip()
    return [elo_lent, elo_rapide, elo_blitz, category, age, sex, club_name]

In [179]:
def get_all_players_in_club(ref_club):
    
    liste_players_clubs = []
    
    url = url_ffe + suffix_all_players_club + str(ref_club)
    request_text = requests.get(url).text
    soup = BeautifulSoup(request_text, 'html.parser')
    
    try:
        zone = soup.find('div', {'class':'page-mid'})
        tableau = zone.find('table')
        rows = tableau.find_all('tr')
        
        for i in range(1,len(rows)):
            liste_players_clubs.append(get_infos_player_from_row(rows[i]))
    except:
        pass
    return liste_players_clubs

In [180]:
def get_all_players_france():
    liste_players = []
    for club in tqdm(df_clubs['club_reference'].drop_duplicates()):
        liste_players += get_all_players_in_club(club)
        time.sleep(0.5)
    return liste_players

In [181]:
liste_players = get_all_players_france()

100%|██████████| 817/817 [09:58<00:00,  1.37it/s]


In [182]:
df_players = pd.DataFrame(liste_players, columns = ['elo_lent', 'elo_rapide', 'elo_blitz', 'category', 'age', 'sex', 'club_name'])

In [183]:
df_players

Unnamed: 0,elo_lent,elo_rapide,elo_blitz,category,age,sex,club_name
0,1009,960,960,PouM,Pou,M,La Tour de Bage
1,1199,1199,1199,SenF,Sen,F,La Tour de Bage
2,1199,1260,1260,BenM,Ben,M,La Tour de Bage
3,1009,799,799,PouM,Pou,M,La Tour de Bage
4,1199,999,999,BenF,Ben,F,La Tour de Bage
...,...,...,...,...,...,...,...
21822,1099,999,999,PupF,Pup,F,Carre Clay - Chess Boxing
21823,1399,1199,1199,SenM,Sen,M,Carre Clay - Chess Boxing
21824,1199,999,999,BenF,Ben,F,Carre Clay - Chess Boxing
21825,1399,1199,1199,JunF,Jun,F,Carre Clay - Chess Boxing


## Jointure

In [184]:
df = pd.merge(df_players, df_clubs, on='club_name')

In [187]:
df

Unnamed: 0,elo_lent,elo_rapide,elo_blitz,category,age,sex,club_name,club_reference,suffix_url,commune,departement
0,1009,960,960,PouM,Pou,M,La Tour de Bage,2833,FicheClub.aspx?Ref=2833,BAGE LA VILLE,01
1,1199,1199,1199,SenF,Sen,F,La Tour de Bage,2833,FicheClub.aspx?Ref=2833,BAGE LA VILLE,01
2,1199,1260,1260,BenM,Ben,M,La Tour de Bage,2833,FicheClub.aspx?Ref=2833,BAGE LA VILLE,01
3,1009,799,799,PouM,Pou,M,La Tour de Bage,2833,FicheClub.aspx?Ref=2833,BAGE LA VILLE,01
4,1199,999,999,BenF,Ben,F,La Tour de Bage,2833,FicheClub.aspx?Ref=2833,BAGE LA VILLE,01
...,...,...,...,...,...,...,...,...,...,...,...
20575,1099,999,999,PupF,Pup,F,Carre Clay - Chess Boxing,3256,FicheClub.aspx?Ref=3256,VILLIERS LE BEL,95
20576,1399,1199,1199,SenM,Sen,M,Carre Clay - Chess Boxing,3256,FicheClub.aspx?Ref=3256,VILLIERS LE BEL,95
20577,1199,999,999,BenF,Ben,F,Carre Clay - Chess Boxing,3256,FicheClub.aspx?Ref=3256,VILLIERS LE BEL,95
20578,1399,1199,1199,JunF,Jun,F,Carre Clay - Chess Boxing,3256,FicheClub.aspx?Ref=3256,VILLIERS LE BEL,95


In [193]:
df[int(df['elo_lent']) > 2400]

TypeError: cannot convert the series to <class 'int'>