# Scraping FIS athlete data

The following cell scrapes the FIS ID (xcode), name, nationality, birthdate, gender and club of every ski-jumping athlete listed on the FIS website, and writes one CSV file per results page.

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 26 16:50:15 2021

@author: dominicbolton
"""

import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd

## The biographies search shows at most 1000 athletes per request, so the
## 8000+ athletes are fetched as 9 pages that differ only in the offset value.
URL_STEM = ('https://www.fis-ski.com/DB/ski-jumping/biographies.html'
            '?lastname=&firstname=&sectorcode=JP&gendercode=&birthyear=&skiclub=&skis='
            '&nationcode=&fiscode=&status=&search=true&limit=1000&offset=')

## Offsets exactly as they appear in the query strings (the first page uses '000').
PAGE_OFFSETS = ['000'] + [str(start) for start in range(1000, 9000, 1000)]

URLs = [URL_STEM + offset for offset in PAGE_OFFSETS]

## Per-page names, kept so any cell that refers to a single page still works.
URL_1, URL_2, URL_3, URL_4, URL_5, URL_6, URL_7, URL_8, URL_9 = URLs

## CSS class strings that select each column's <div> on a results page.
## They are identical for every page, so they are defined once, outside the loop.
xcode_class_str = "g-lg-3 g-md-3 g-sm-3 g-xs-4 justify-left flex-sm-wrap flex-xs-wrap"
name_class_str = "g-lg g-md g-sm g-xs justify-left flex-sm-wrap flex-xs-wrap"
nation_class_str = "g-lg-1 g-md-2 g-sm-3 g-xs-3 justify-left"
birthdate_class_str = "g-lg-2 g-md-2 g-sm-3 g-xs-6 justify-right pr-lg-1 pr-md-1 pr-sm-1"
gender_class_str = "gender__inner"
club_class_str = "clip-sm"

## loop through them all and create a csv for each page
for i, url in enumerate(URLs):

    # Bound the wait and fail loudly on an HTTP error; otherwise an error page
    # would parse to zero rows and an empty CSV would be written silently.
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    soup = bs(page.content, 'html.parser')

    xcodes = soup.find_all("div", {"class": xcode_class_str})
    names = soup.find_all("div", {"class": name_class_str})
    nations = soup.find_all("div", {"class": nation_class_str})
    birthdates = soup.find_all("div", {"class": birthdate_class_str})
    genders = soup.find_all("div", {"class": gender_class_str})
    clubs = soup.find_all("div", {"class": club_class_str})

    # The xcode cell's text splits into whitespace-separated tokens. When there
    # are exactly two, the second is kept; otherwise the first is kept. A cell
    # with no text at all becomes NaN instead of raising IndexError.
    xcode_ls = []
    for xcode in xcodes:
        tokens = xcode.text.split()
        if len(tokens) == 2:
            xcode_ls.append(tokens[1])
        elif tokens:
            xcode_ls.append(tokens[0])
        else:
            xcode_ls.append(np.nan)
    xcodes = xcode_ls

    # Names are stripped like the other plain-text columns.
    names = [name.text.strip() for name in names]
    nations = [nation.text.strip() for nation in nations]
    genders = [gender.text.strip() for gender in genders]

    # Series.replace returns a new Series, so the result must be assigned back
    # for the '\xa0' placeholders to actually become NaN. These two columns are
    # deliberately not stripped first: str.strip() would turn '\xa0' into ''.
    birthdates = pd.Series([birthdate.text for birthdate in birthdates]).replace('\xa0', np.nan)
    clubs = pd.Series([club.text for club in clubs]).replace('\xa0', np.nan)

    Athlete_df = pd.DataFrame({'xcode':xcodes, 'name': names, 'nationality': nations,
                               'birthdate': birthdates, 'gender': genders, 'club': clubs})

    # The row index is written as the first (unnamed) CSV column; the compile
    # step that reads these files back drops it as 'Unnamed: 0'.
    Athlete_df.to_csv('athlete_data_{}.csv'.format(i))


Read the per-page CSV files back in and compile them into a single CSV file.

In [8]:
# Read the nine per-page files (athlete_data_0.csv .. athlete_data_8.csv).
list_dataframes = [pd.read_csv('athlete_data_{}.csv'.format(page_no))
                   for page_no in range(9)]

# Stack the pages under one fresh 0..n-1 index. The 'Unnamed: 0' column is the
# per-page row index that to_csv wrote into each file; read_csv loads it as an
# ordinary column, so renumbering the index does not remove it and it has to
# be dropped explicitly.
All_Athletes = (
    pd.concat(list_dataframes, ignore_index=True)
      .drop(columns=['Unnamed: 0'])
)

All_Athletes.to_csv('all_athlete_data.csv')