In [12]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import os

In [13]:
# Archive landing page of the 2014 edition; its <option> elements are read
# further down to discover the path of every tournament.
url = "http://www.fifa.com/worldcup/archive/brazil2014/index.html"

In [14]:
# Download the landing page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops the notebook on a 4xx/5xx
# answer instead of silently parsing an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [15]:
# Ignore the first <option>, then cut the last 10 characters off each value
# (presumably a trailing "index.html" — confirm against the page) so that
# only the tournament path remains.
edition_options = soup.find_all('option')[1:]
tokens = [option['value'][:-10] for option in edition_options]
tokens

['/worldcup/archive/brazil2014/',
 '/worldcup/archive/southafrica2010/',
 '/worldcup/archive/germany2006/',
 '/worldcup/archive/koreajapan2002/',
 '/worldcup/archive/france1998/',
 '/worldcup/archive/usa1994/',
 '/worldcup/archive/italy1990/',
 '/worldcup/archive/mexico1986/',
 '/worldcup/archive/spain1982/',
 '/worldcup/archive/argentina1978/',
 '/worldcup/archive/germany1974/',
 '/worldcup/archive/mexico1970/',
 '/worldcup/archive/england1966/',
 '/worldcup/archive/chile1962/',
 '/worldcup/archive/sweden1958/',
 '/worldcup/archive/switzerland1954/',
 '/worldcup/archive/brazil1950/',
 '/worldcup/archive/france1938/',
 '/worldcup/archive/italy1934/',
 '/worldcup/archive/uruguay1930/']

## Create directories for storing results

In [16]:
# Root folder under which the per-tournament output folders are created.
PATH = 'World-Cup-DataBase'

In [17]:
"""/italy1934/ -> /1934--italy/"""
def parse_names(token):
    """Convert an edition slug into its folder name.

    The first and last characters of ``token`` are discarded (the slashes in
    ``/italy1934/``). The last four remaining characters are then moved to
    the front, joined to the rest with ``--`` and wrapped in slashes again.

    Example: ``/italy1934/`` becomes ``/1934--italy/``.
    """
    slug = token[1:-1]
    year, host = slug[-4:], slug[:-4]
    return '/{}--{}/'.format(year, host)

In [194]:
"""Create directory"""

# Make the statistics/, matches/ and teams/ folders of every tournament.
for token in tokens:
    # token[17:] drops the 17-character '/worldcup/archive' prefix, leaving
    # e.g. '/italy1934/', which parse_names turns into '/1934--italy/'.
    edition_dir = PATH + parse_names(token[17:])
    for subfolder in ('statistics', 'matches', 'teams'):
        # exist_ok=True replaces the separate "exists?" check: no race between
        # check and creation, and re-running the cell is harmless.
        os.makedirs(edition_dir + subfolder, exist_ok=True)

# 1. Statistics

## Player top goals


In [468]:
"""Value to be extracted: CountryCodes, Players, Goals, MinutesPlayed, MatchesPlayed"""
def player_top_goals():
    """Save every tournament's top-scorer table as ``player_top_goals.csv``.

    For each entry of the global ``tokens`` list the page
    ``statistics/players/goal-scored.html`` is downloaded, the first
    ``<table>`` on it is read row by row, and the result is written into that
    tournament's ``statistics/`` folder under ``PATH``.

    Columns written: CountryCode, Player, Goals, MinutesPlayed, MatchesPlayed.
    Every value is kept as the raw text found in the page.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (time-out, connection failure or HTTP error status).
        IndexError: if the page has no table or a row lacks an expected cell.
    """
    for token in tokens:
        URL = 'http://www.fifa.com' + token + 'statistics/players/goal-scored.html'
        Countries = []
        Players = []
        Goals = []
        Minutes = []
        Matches = []

        # The timeout keeps one stalled request from hanging the whole run.
        r = requests.get(URL, timeout=30)
        # Stop on 4xx/5xx rather than trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find_all('table')[0]

        # The first row is skipped (presumably the table header).
        for row in table.find_all('tr')[1:]:
            # One cell provides both values: its text is stored as the player
            # name and the alt text of its <img> as the country code.
            player_cell = row.find_all('td', {'class': 'tbl-playername teamname-nolink'})[0]
            Countries.append(player_cell.img['alt'])
            Players.append(player_cell.text)
            Minutes.append(row.find_all('td', {'class': 'tbl-minp'})[0].text)
            Goals.append(row.find_all('td', {'class': 'tbl-goalfor'})[0].text)
            Matches.append(row.find_all('td', {'class': 'tbl-mp'})[0].text)

        df = pd.DataFrame({
                'CountryCode': Countries,
                'Player': Players,
                'Goals': Goals,
                'MinutesPlayed': Minutes,
                'MatchesPlayed': Matches
        })

        # token[17:] drops the '/worldcup/archive' prefix, e.g. '/italy1934/'.
        directory = PATH + parse_names(token[17:]) + 'statistics/'
        df.to_csv(directory + 'player_top_goals.csv', index=False)

player_top_goals()

## Team top goals

In [20]:
"""Value to be extracted: Country, CountryCode, Goals, GoalsAgainst, MatchesPlayed"""
def team_top_goals():
    """Save every tournament's team scoring table as ``team_top_goals.csv``.

    For each entry of the global ``tokens`` list the page
    ``statistics/teams/goal-scored.html`` is downloaded, the first ``<table>``
    on it is read row by row, and the result is written into that
    tournament's ``statistics/`` folder under ``PATH``.

    Columns written: Country, CountryCode, Goals, GoalsAgainst, MatchesPlayed.
    Every value is kept as the raw text found in the page, except that any
    country name containing 'Ivoire' is stored as 'Ivory Coast'. Two rows are
    added by hand: Canada (1986) and Trinidad and Tobago (2006).

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (time-out, connection failure or HTTP error status).
        IndexError: if the page has no table or a row lacks an expected cell.
    """
    for token in tokens:
        URL = 'http://www.fifa.com' + token + 'statistics/teams/goal-scored.html'
        Countries = []
        CountryCodes = []
        Goals = []
        GoalsAg = []
        Matches = []

        # The timeout keeps one stalled request from hanging the whole run.
        r = requests.get(URL, timeout=30)
        # Stop on 4xx/5xx rather than trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find_all('table')[0]

        # The first row is skipped (presumably the table header).
        for row in table.find_all('tr')[1:]:
            country = row.find_all('span', {'class': 't-nText'})[0].text
            if 'Ivoire' in country:
                country = 'Ivory Coast'
            Countries.append(country)
            CountryCodes.append(row.find_all('span', {'class': 't-nTri'})[0].text)
            Goals.append(row.find_all('td', {'class': 'tbl-goalfor'})[0].text)
            GoalsAg.append(row.find_all('td', {'class': 'tbl-goalagainst'})[0].text)
            Matches.append(row.find_all('td', {'class': 'tbl-mp'})[0].text)

        # token[-5:-1] is the four-digit year at the end of e.g. '.../mexico1986/'.
        year = token[-5:-1]
        # Special case for Canada in 1986: its row is not shown on the website
        # http://www.fifa.com/worldcup/archive/mexico1986/statistics/teams/goal-scored.html
        if year == '1986':
            Countries.append('Canada')
            CountryCodes.append('CAN')
            Goals.append('0')
            GoalsAg.append('5')
            Matches.append('3')
        # Same for Trinidad and Tobago in 2006.
        if year == '2006':
            Countries.append('Trinidad and Tobago')
            CountryCodes.append('TRI')
            Goals.append('0')
            GoalsAg.append('4')
            Matches.append('3')

        df = pd.DataFrame({
                'Country': Countries,
                'CountryCode' : CountryCodes,
                'Goals': Goals,
                'GoalsAgainst' : GoalsAg,
                'MatchesPlayed': Matches
        })

        # token[17:] drops the '/worldcup/archive' prefix, e.g. '/italy1934/'.
        directory = PATH + parse_names(token[17:]) + 'statistics/'
        df.to_csv(directory + 'team_top_goals.csv', index=False)

team_top_goals()

## Player top cards

In [470]:
"""Value to be extracted: CountryCodes, Players, Yellow Cards, Red Cards, MatchesPlayed"""
def player_top_cards():
    """Save every tournament's player disciplinary table as ``player_top_cards.csv``.

    For each entry of the global ``tokens`` list the page
    ``statistics/players/disciplinary.html`` is downloaded, the first
    ``<table>`` on it is read row by row, and the result is written into that
    tournament's ``statistics/`` folder under ``PATH``.

    Columns written: CountryCode, Player, YellowCards, RedCards, MatchesPlayed.
    Every value is kept as the raw text found in the page.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (time-out, connection failure or HTTP error status).
        IndexError: if the page has no table or a row lacks an expected cell.
    """
    for token in tokens:
        URL = 'http://www.fifa.com' + token + 'statistics/players/disciplinary.html'
        Countries = []
        Players = []
        Yellows = []
        Reds = []
        Matches = []

        # The timeout keeps one stalled request from hanging the whole run.
        r = requests.get(URL, timeout=30)
        # Stop on 4xx/5xx rather than trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find_all('table')[0]

        # The first row is skipped (presumably the table header).
        for row in table.find_all('tr')[1:]:
            # One cell provides both values: its text is stored as the player
            # name and the alt text of its <img> as the country code.
            player_cell = row.find_all('td', {'class': 'tbl-playername teamname-nolink'})[0]
            Countries.append(player_cell.img['alt'])
            Players.append(player_cell.text)
            Yellows.append(row.find_all('td', {'class': 'tbl-yc'})[0].text)
            Reds.append(row.find_all('td', {'class': 'tbl-rc'})[0].text)
            Matches.append(row.find_all('td', {'class': 'tbl-mp'})[0].text)

        df = pd.DataFrame({
                'CountryCode': Countries,
                'Player': Players,
                'YellowCards': Yellows,
                'RedCards': Reds,
                'MatchesPlayed': Matches
        })

        # token[17:] drops the '/worldcup/archive' prefix, e.g. '/italy1934/'.
        directory = PATH + parse_names(token[17:]) + 'statistics/'
        df.to_csv(directory + 'player_top_cards.csv', index=False)

player_top_cards()

## Team top cards

In [471]:
"""Value to be extracted: Country, Country Code, Yellow Cards, Red Cards, MatchesPlayed"""
def team_top_cards():
    """Save every tournament's team disciplinary table as ``team_top_cards.csv``.

    For each entry of the global ``tokens`` list the page
    ``statistics/teams/disciplinary.html`` is downloaded, the first
    ``<table>`` on it is read row by row, and the result is written into that
    tournament's ``statistics/`` folder under ``PATH``.

    Columns written: Country, CountryCode, YellowCards, RedCards,
    MatchesPlayed. Every value is kept as the raw text found in the page,
    except that any country name containing 'Ivoire' is stored as
    'Ivory Coast'.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (time-out, connection failure or HTTP error status).
        IndexError: if the page has no table or a row lacks an expected cell.
    """
    for token in tokens:
        URL = 'http://www.fifa.com' + token + 'statistics/teams/disciplinary.html'
        Countries = []
        CountryCodes = []
        Yellows = []
        Reds = []
        Matches = []

        # The timeout keeps one stalled request from hanging the whole run.
        r = requests.get(URL, timeout=30)
        # Stop on 4xx/5xx rather than trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find_all('table')[0]

        # The first row is skipped (presumably the table header).
        for row in table.find_all('tr')[1:]:
            country = row.find_all('span', {'class': 't-nText'})[0].text
            if 'Ivoire' in country:
                country = 'Ivory Coast'
            Countries.append(country)
            CountryCodes.append(row.find_all('span', {'class': 't-nTri'})[0].text)
            # NOTE(review): the player disciplinary table is read with
            # 'tbl-yc'; confirm 'tbl-y' is really the class used on team pages.
            Yellows.append(row.find_all('td', {'class': 'tbl-y'})[0].text)
            Reds.append(row.find_all('td', {'class': 'tbl-rc'})[0].text)
            Matches.append(row.find_all('td', {'class': 'tbl-mp'})[0].text)

        df = pd.DataFrame({
                'Country': Countries,
                'CountryCode':CountryCodes,
                'YellowCards': Yellows,
                'RedCards' : Reds,
                'MatchesPlayed': Matches
        })

        # token[17:] drops the '/worldcup/archive' prefix, e.g. '/italy1934/'.
        directory = PATH + parse_names(token[17:]) + 'statistics/'
        df.to_csv(directory + 'team_top_cards.csv', index=False)

team_top_cards()

# 2. Matches

In [49]:
"""Data to be extracted: Home team, Away team, Scores, Attendance, Penalty, City, Date"""
def _team_name_and_code(team_div):
    """Return ``(name, code)`` read from a 't home' or 't away' div of a match report."""
    name_spans = team_div.find_all('span', {'class': 't-nText '})
    if not name_spans:
        # Fall back to the 't-nText kern' span when the plain one is absent.
        name_spans = team_div.find_all('span', {'class': 't-nText kern'})
    name = name_spans[0].text
    if 'Ivoire' in name:
        name = 'Ivory Coast'
    code = team_div.find_all('span', {'class': 't-nTri'})[0].text
    return name, code


def _shootout_score(report, description):
    """Return the penalty shoot-out score text of a match report, or '' if there is none."""
    # Group / first-stage matches are never checked for a shoot-out.
    if 'Group match' in description or 'First stage' in description:
        return ''
    reasonwin = report.find_all('span', {'class': 'text-reasonwin'})[0].text
    if 'penalties' not in reasonwin:
        return ''
    # Keep only what sits between the first '(' and the next ')'.
    return reasonwin.split('(')[1].split(')')[0]


def _date_city_attendance(report, token):
    """Return ``(date, city, attendance)`` strings for one match report."""
    if 'brazil2014' in token or 'southafrica2010' in token:
        date = report.find('div', attrs={'class':'mh-i-datetime'})
        city = report.find('span', attrs={'class':'mh-i-venue'})
        # Attendance cannot be extracted from FIFA.com for these two editions,
        # so every match receives the same figure (noted in the original code
        # as the average from FIFA.com).
        attendance = '52918' if 'brazil2014' in token else '49669'
        # Keep the text before the first '-' and drop its final character.
        return date.text.split('-')[0][:-1], city.text, attendance

    table = report.find('table', attrs={'class':'table match-data'})
    first_row = table.find('tbody').find_all('tr')[0]
    cols = [cell.text.strip() for cell in first_row.find_all('td')]
    # cols = ['26 June 2006', 'Cologne /FIFA World Cup Stadium, Cologne', '21:00', '45000']
    return cols[0], cols[1].split('/')[0][:-1], cols[-1]


def get_match_details():
    """Scrape every match report of every tournament into ``matches.csv``.

    For each entry of the global ``tokens`` list the ``matches/`` page is
    downloaded, the links inside its 'match-list-date anchor' divs are
    collected, and each linked match report is downloaded and parsed. The
    result is written into that tournament's ``matches/`` folder under
    ``PATH``.

    Columns written: HomeTeam, HomeCode, AwayTeam, AwayCode, HomeScore,
    AwayScore, Attendance, Description, City, Date, Penalty, MatchReport.
    When a shoot-out score is found it is stored in Penalty and also appended
    in parentheses to HomeScore / AwayScore. For the 2010 and 2014 editions
    the Attendance column holds one fixed figure per tournament.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (time-out, connection failure or HTTP error status).
        IndexError: if an expected element is missing from a page.
    """
    for token in tokens:
        URL = 'http://www.fifa.com' + token + 'matches/'
        HomeTeams   = []
        HomeCodes   = []
        AwayTeams   = []
        AwayCodes   = []
        HomeScore   = []
        AwayScore   = []
        Attendance  = []
        Description = []
        City        = []
        Date        = []
        Penalty     = []
        Links       = []

        # The timeout keeps one stalled request from hanging the whole run.
        r = requests.get(URL, timeout=30)
        # Stop on 4xx/5xx rather than trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        # Collect the links of all match reports.
        match_reports_link = []
        for matches in soup.find_all('div', {'class': 'match-list-date anchor'}):
            for link in matches.find_all('a'):
                match_reports_link.append('https://www.fifa.com' + link['href'])

        # Scrape each match report.
        for link in match_reports_link:
            req = requests.get(link, timeout=30)
            req.raise_for_status()
            sou = BeautifulSoup(req.text, 'html.parser')

            # The round description tells group matches from knockout ones.
            description = sou.find_all('div', {'class': 'mh-i-round'})[0].text
            penalty = _shootout_score(sou, description)

            hname, hcode = _team_name_and_code(sou.find_all('div', {'class': 't home'})[0])
            aname, acode = _team_name_and_code(sou.find_all('div', {'class': 't away'})[0])

            # Scores: the score text is expected to split into exactly two parts.
            home, away = sou.find_all('span', {'class': 's-scoreText'})[0].text.split('-')
            if penalty != '':
                shootout = penalty.split('-')
                home += '(' + shootout[0] + ')'
                away += '(' + shootout[1] + ')'

            date, city, attendance = _date_city_attendance(sou, token)

            Links.append(link)
            Description.append(description)
            Penalty.append(penalty)
            HomeTeams.append(hname)
            HomeCodes.append(hcode)
            AwayTeams.append(aname)
            AwayCodes.append(acode)
            HomeScore.append(home)
            AwayScore.append(away)
            Date.append(date)
            City.append(city)
            Attendance.append(attendance)

        df = pd.DataFrame({
              'HomeTeam'   :  HomeTeams ,
              'HomeCode'   :  HomeCodes ,
              'AwayTeam'   :  AwayTeams ,
              'AwayCode'   :  AwayCodes ,
              'HomeScore'  :  HomeScore ,
              'AwayScore'  :  AwayScore ,
              'Attendance' :  Attendance,
              'Description':  Description,
              'City'       :  City      ,
              'Date'       :  Date   ,
              'Penalty'    :  Penalty,
              'MatchReport':  Links
        })

        # token[17:] drops the '/worldcup/archive' prefix, e.g. '/italy1934/'.
        directory = PATH + parse_names(token[17:]) + 'matches/'
        df.to_csv(directory + 'matches.csv', index=False)

get_match_details()

In [50]:
# Spot-check the scraped 2014 file; show only the first rows instead of
# dumping the whole frame into the notebook.
df = pd.read_csv(PATH + '/2014--brazil/matches/matches.csv')
df.head(10)


Unnamed: 0,Attendance,AwayCode,AwayScore,AwayTeam,City,Date,Description,HomeCode,HomeScore,HomeTeam,MatchReport,Penalty
0,52918,CRO,1,Croatia,Sao Paulo (BRA),12 Jun 2014,First stage - Group A,BRA,3,Brazil,https://www.fifa.com/worldcup/matches/round=25...,
1,52918,CMR,0,Cameroon,Natal (BRA),13 Jun 2014,First stage - Group A,MEX,1,Mexico,https://www.fifa.com/worldcup/matches/round=25...,
2,52918,NED,5,Netherlands,Salvador (BRA),13 Jun 2014,First stage - Group B,ESP,1,Spain,https://www.fifa.com/worldcup/matches/round=25...,
3,52918,AUS,1,Australia,Cuiaba (BRA),13 Jun 2014,First stage - Group B,CHI,3,Chile,https://www.fifa.com/worldcup/matches/round=25...,
4,52918,GRE,0,Greece,Belo Horizonte (BRA),14 Jun 2014,First stage - Group C,COL,3,Colombia,https://www.fifa.com/worldcup/matches/round=25...,
5,52918,CRC,3,Costa Rica,Fortaleza (BRA),14 Jun 2014,First stage - Group D,URU,1,Uruguay,https://www.fifa.com/worldcup/matches/round=25...,
6,52918,ITA,2,Italy,Manaus (BRA),14 Jun 2014,First stage - Group D,ENG,1,England,https://www.fifa.com/worldcup/matches/round=25...,
7,52918,JPN,1,Japan,Recife (BRA),14 Jun 2014,First stage - Group C,CIV,2,Ivory Coast,https://www.fifa.com/worldcup/matches/round=25...,
8,52918,ECU,1,Ecuador,Brasilia (BRA),15 Jun 2014,First stage - Group E,SUI,2,Switzerland,https://www.fifa.com/worldcup/matches/round=25...,
9,52918,HON,0,Honduras,Porto Alegre (BRA),15 Jun 2014,First stage - Group E,FRA,3,France,https://www.fifa.com/worldcup/matches/round=25...,


# 3. Teams

In [463]:
def get_player_infos():
    """Scrape the squad list of every team of every tournament into ``players.csv``.

    For each entry of the global ``tokens`` list the ``teams/`` page is
    downloaded and the 'map-item' links inside its first 'team-map' div are
    followed: the edition year and team id are cut out of each link's href to
    build the squad-list URL, which is then downloaded and parsed. The result
    is written into that tournament's ``teams/`` folder under ``PATH``.

    Columns written: Country, CountryCode, Player, Age, Position. Any country
    name containing 'Ivoire' is stored as 'Ivory Coast'.

    Raises:
        requests.exceptions.RequestException: if the teams page cannot be
            fetched, or any request times out or fails to connect.
        IndexError: if the teams page has no 'team-map' div, or a link's href
            or a player entry lacks an expected part.
    """
    for token in tokens:
        URL = 'http://www.fifa.com' + token + 'teams/'
        Country = []
        CountryCode = []
        Player = []
        Age = []
        Position = []

        # The timeout keeps one stalled request from hanging the whole run.
        r = requests.get(URL, timeout=30)
        # Stop on 4xx/5xx rather than trying to parse an error page.
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')

        all_teams_link = soup.find_all('div', {'class': 'team-map'})[0].find_all('a', {'class':'map-item'})

        # The 2014 squad pages are read with a different player-div class than
        # all other editions; the choice depends only on the tournament.
        if 'brazil2014' in token:
            squad_class = 'p p-i p-i-prt-1'
        else:
            squad_class = 'p p-i-no'

        for link in all_teams_link:
            countryCode = link['id']
            country = link['title']
            if 'Ivoire' in country:
                country = 'Ivory Coast'

            href_parts = link['href'].split('/')
            edition = href_parts[3][-4:]  # year: last four characters of the 4th path part
            teamID  = href_parts[5][5:]   # team id: 6th path part without its first five characters
            squad_link = 'http://www.fifa.com/worldcup/archive/edition=' + edition + '/library/teams/team=' + teamID + '/_players/_players_list.html'

            # No status check here: a squad page that fails to load simply
            # contributes no players, exactly as before.
            req = requests.get(squad_link, timeout=30)
            sou = BeautifulSoup(req.text, 'html.parser')

            for s in sou.find_all('div', {'class': squad_class}):
                Country.append(country)
                CountryCode.append(countryCode)
                Player.append(s['data-player-name'])
                Position.append(s.find_all('span', {'class': 'p-i-fieldpos'})[0].text)
                Age.append(s.find_all('div', {'class': 'p-ag age'})[0].text)

        df = pd.DataFrame({
              'Country'       :  Country ,
              'CountryCode'   :  CountryCode ,
              'Player'       :  Player ,
              'Age'         :  Age ,
              'Position'   :  Position
        })

        # token[17:] drops the '/worldcup/archive' prefix, e.g. '/italy1934/'.
        directory = PATH + parse_names(token[17:]) + 'teams/'
        df.to_csv(directory + 'players.csv', index=False)

get_player_infos()

In [2]:
# NOTE(review): scratch value; in the visible notebook this global `a` is only
# read by the `str(a)` cell that follows. Consider deleting both cells.
a = 5

In [4]:
# NOTE(review): scratch check of str() on an int; nothing in the scraper
# depends on it. Consider deleting this cell.
str(a)

'5'