# Data Collection

In [134]:
import pandas as pd

---

### Results DF

In [135]:
results = pd.read_csv('./data/results.csv')
results.head()
# data from kaggle:  https://www.kaggle.com/martj42/international-football-results-from-1872-to-2017?select=results.csv

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [136]:
results.shape

(43045, 9)

In [137]:
results[results['date'] < '1992-12-31'].shape  # <- 1st date of FIFA rankings

(17489, 9)

In [138]:
results.drop(results[results['date'] < '1992-12-31'].index, inplace=True)

In [139]:
results.shape

(25556, 9)

In [140]:
results.reset_index(inplace=True)

In [141]:
results.drop(columns='index', inplace=True)
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False


---

### Upcoming CONCACAF Fixtures DF

In [142]:
qualifying_fixtures = pd.read_csv('./data/concacaf_wcq_fixtures.csv')
qualifying_fixtures.tail()
# data from FIFA site:  https://www.fifa.com/tournaments/mens/worldcup/qatar2022/qualifiers/concacaf

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
54,2022-03-29,Panama,Canada,,,FIFA World Cup qualification,,,
55,2022-03-29,Costa Rica,United States,,,FIFA World Cup qualification,,,
56,,,,,,,,,
57,,,,,,,,,
58,,,,,,,,,


In [143]:
qualifying_fixtures[qualifying_fixtures['date'].isnull()]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
56,,,,,,,,,
57,,,,,,,,,
58,,,,,,,,,


In [144]:
qualifying_fixtures.drop(index=qualifying_fixtures[qualifying_fixtures['date'].isnull()].index, inplace=True)

In [145]:
qualifying_fixtures.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,2021-09-02,Canada,Honduras,1.0,1.0,FIFA World Cup qualification,Toronto,Canada,False
1,2021-09-02,Panama,Costa Rica,0.0,0.0,FIFA World Cup qualification,Panama City,Panama,False
2,2021-09-02,Mexico,Jamaica,2.0,1.0,FIFA World Cup qualification,Mexico City,Mexico,False
3,2021-09-02,El Salvador,United States,0.0,0.0,FIFA World Cup qualification,San Salvador,El Salvador,False
4,2021-09-05,Jamaica,Panama,0.0,3.0,FIFA World Cup qualification,Kingston,Jamaica,False


---

### Current Table DF

In [146]:
table = pd.read_csv('./data/current_table_12.14.2021.csv')
table
# data from FIFA site:  https://www.fifa.com/tournaments/mens/worldcup/qatar2022/qualifiers/concacaf

Unnamed: 0,rank,team_name,matches_played,wins,draws,losses,goals_for,goals_against,goal_differential,points
0,1,Canada,8,4,4,0,13,5,8,16
1,2,USA,8,4,3,1,12,5,7,15
2,3,Mexico,8,4,2,2,11,7,4,14
3,4,Panama,8,4,2,2,11,9,2,14
4,5,Costa Rica,8,2,3,3,6,7,-1,9
5,6,Jamaica,8,1,4,3,6,10,-4,7
6,7,El Salvador,8,1,3,4,4,10,-6,6
7,8,Honduras,8,0,3,5,5,15,-10,3


---

### World Rankings DF

In [147]:
past_rankings = pd.read_csv('./data/fifa_ranking_up_to_2021-05-27.csv')
past_rankings.head()
# data from kaggle:  https://www.kaggle.com/cashncarry/fifaworldranking

Unnamed: 0,id,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,43818,82,Iraq,IRQ,15,0,0,AFC,12/31/1992
1,43873,107,Mozambique,MOZ,9,0,0,CAF,12/31/1992
2,43816,108,Indonesia,IDN,9,0,0,AFC,12/31/1992
3,1882218,109,Antigua and Barbuda,ATG,8,0,0,CONCACAF,12/31/1992
4,43820,110,Jordan,JOR,8,0,0,AFC,12/31/1992


In [148]:
past_rankings = past_rankings[['rank', 'country_full', 'rank_date']]
past_rankings.head()

Unnamed: 0,rank,country_full,rank_date
0,82,Iraq,12/31/1992
1,107,Mozambique,12/31/1992
2,108,Indonesia,12/31/1992
3,109,Antigua and Barbuda,12/31/1992
4,110,Jordan,12/31/1992


In [149]:
past_rankings.shape

(63055, 3)

In [150]:
recent_rankings = pd.read_csv('./data/fifa_ranking_2021-08-12_to_2021-11-19.csv')
recent_rankings.head()
# data from FIFA:  https://www.fifa.com/fifa-world-ranking/

Unnamed: 0,rank,country_full,rank_date
0,1.0,Belgium,2021-08-12
1,2.0,Brazil,2021-08-12
2,3.0,France,2021-08-12
3,4.0,England,2021-08-12
4,5.0,Italy,2021-08-12


In [151]:
recent_rankings.shape

(1678, 3)

In [152]:
rankings = pd.concat([past_rankings, recent_rankings])
rankings.shape

(64733, 3)

---

# Combining and Cleaning the Data

### Results Table

In [153]:
rankings.dropna(inplace=True)

In [154]:
results['home_rank'] = 0
results['away_rank'] = 0

results['datetime'] = pd.to_datetime(results['date'])
rankings['datetime'] = pd.to_datetime(rankings['rank_date'])

In [155]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank,datetime
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True,0,0,1993-01-01
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False,0,0,1993-01-02
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False,0,0,1993-01-02
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True,0,0,1993-01-03
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False,0,0,1993-01-03


In [156]:
results.shape

(25556, 12)

In [157]:
# Dropping games for teams that are not FIFA members and do not have a FIFA ranking

non_fifa_teams = ['Martinique', 'French Guiana', 'Guadeloupe', 'Guernsey', 'Ynys Môn', 'Isle of Wight', 'Åland Islands',
                  'Jersey', 'Isle of Man', 'Greenland', 'Shetland', 'Basque Country', 'Sint Maarten', 'Zanzibar', 
                  'Canary Islands', 'Frøya', 'Hitra', 'Catalonia', 'Corsica', 'Brittany', 'Palau', 'Gotland', 
                  'Saare County', 'Rhodes', 'Micronesia', 'Andalusia', 'Kernow', 'Saint Martin', 'Orkney', 'Monaco', 
                  'Tuvalu', 'Sark', 'Alderney', 'Kiribati', 'Mayotte', 'Western Isles', 'Falkland Islands', 'Réunion', 
                  'Galicia', 'Northern Cyprus', 'Republic of St. Pauli', 'Găgăuzia', 'Tibet', 'Occitania', 'Sápmi', 
                  'Silesia', 'Northern Mariana Islands', 'Menorca', 'Provence', 'Arameans Suryoye', 'Padania', 
                  'Iraqi Kurdistan', 'Gozo', 'Bonaire', 'Western Sahara', 'Raetia', 'Darfur', 'Tamil Eelam', 'Abkhazia', 
                  'Saint Pierre and Miquelon', 'Artsakh', 'Madrid', 'Vatican City', 'Ellan Vannin', 'South Ossetia', 
                  'County of Nice', 'Székely Land', 'Romani people', 'Felvidék', 'Chagos Islands', 
                  'United Koreans in Japan', 'Somaliland', 'Western Armenia', 'Barawa', 'Kárpátalja', 'Yorkshire', 
                  'Panjab', 'Cascadia', 'Matabeleland', 'Kabylia', 'Parishes of Jersey', 'Saint Helena', 'Chameria', 
                  'Wallis Islands and Futuna', 'Crimea']

for team in non_fifa_teams:
    results.drop(results[results['home_team'].str.contains(team)].index, inplace=True)
    results.drop(results[results['away_team'].str.contains(team)].index, inplace=True)

In [158]:
teams = ['United Arab Emirates', 'Netherlands', 'Guyana', 'Cayman Islands', 'Thailand', 'Sri Lanka','Vietnam', 
         'Latvia', 'Macau', 'Bahrain', 'Algeria', 'Dominican Republic', 'Fiji', 'Slovakia', 'Anguilla', 'Dominica', 
         'Czech Republic', 'Azerbaijan', 'Turkmenistan', 'Gibraltar', 'Georgia', 'North Macedonia', 'Kazakhstan', 
         'Uzbekistan', 'Tajikistan', 'Papua New Guinea', 'Samoa', 'Tonga', 'Montserrat', 'Montenegro', 
         'Serbia and Montenegro', 'Curaçao', 'Cook Islands', 'Guam', 'New Caledonia', 'Bosnia and Herzegovina', 
         'Andorra', 'British Virgin Islands', 'Palestine', 'US Virgin Islands', 'São Tomé and Príncipe', 'Bhutan', 
         'Afghanistan', 'Kosovo', 'South Sudan', 'Moldova', 'Armenia', 'American Samoa', 'Djibouti', 'Eritrea', 
         'Belize', 'Turks and Caicos Islands', 'Comoros']

rank_dates = ['1993-08-08 00:00:00', '1993-08-08 00:00:00', '1993-08-08 00:00:00', '1993-08-08 00:00:00', 
              '1993-08-08 00:00:00', '1993-08-08 00:00:00', '1993-08-08 00:00:00', '1993-08-08 00:00:00', 
              '1993-08-08 00:00:00', '1993-08-08 00:00:00','1993-08-08 00:00:00', '1993-08-08 00:00:00', 
              '1993-08-08 00:00:00', '1993-11-19 00:00:00', '1997-05-14 00:00:00', '1995-02-20 00:00:00', 
              '1994-03-15 00:00:00', '1994-06-14 00:00:00', '1994-10-25 00:00:00', '2016-09-15 00:00:00', 
              '1994-03-15 00:00:00', '1994-05-17 00:00:00', '1994-11-22 00:00:00', '1994-10-25 00:00:00', 
              '1994-11-22 00:00:00', '1996-09-25 00:00:00', '1996-11-20 00:00:00', '1996-11-20 00:00:00', 
              '1999-03-24 00:00:00', '2007-06-13 00:00:00', '2002-12-18 00:00:00', '2007-06-13 00:00:00', 
              '1996-11-20 00:00:00', '1996-08-28 00:00:00', '2004-06-09 00:00:00', '1996-05-22 00:00:00', 
              '1996-11-20 00:00:00', '1997-05-14 00:00:00', '1998-12-23 00:00:00', '1999-03-24 00:00:00', 
              '1998-08-19 00:00:00', '2000-08-09 00:00:00', '2003-01-15 00:00:00', '2016-07-14 00:00:00', 
              '2012-08-08 00:00:00', '1994-05-17 00:00:00', '1994-06-14 00:00:00', '1998-10-21 00:00:00', 
              '1994-12-20 00:00:00', '1998-10-21 00:00:00', '1995-12-19 00:00:00', '1999-03-24 00:00:00', 
              '2006-12-18 00:00:00']

team_and_rankdate = list(zip(teams, rank_dates))

for team in team_and_rankdate:
    results.drop(results[(results['home_team'] == team[0]) & (results['datetime'] < team[1])].index, inplace=True)
    results.drop(results[(results['away_team'] == team[0]) & (results['datetime'] < team[1])].index, inplace=True)

In [159]:
# Ensuring country name matches in rankings and results dataframes

rankings.loc[rankings[rankings['country_full'].str.contains('Congo')].index, 'country_full'] = 'Congo'
results.loc[results[results['home_team'].str.contains('Congo')].index, 'home_team'] = 'Congo'
results.loc[results[results['away_team'].str.contains('Congo')].index, 'away_team'] = 'Congo'

rankings.loc[rankings[rankings['country_full'].str.contains('Swaziland')].index, 'country_full'] = 'Eswatini'
rankings.loc[rankings[rankings['country_full'].str.contains('Eswatini')].index, 'country_full'] = 'Eswatini'
results.loc[results[results['home_team'].str.contains('Eswatini')].index, 'home_team'] = 'Eswatini'
results.loc[results[results['away_team'].str.contains('Eswatini')].index, 'away_team'] = 'Eswatini'

rankings.loc[rankings[rankings['country_full'].str.contains('Korea DPR')].index, 'country_full'] = 'North Korea'
results.loc[results[results['home_team'].str.contains('North Korea')].index, 'home_team'] = 'North Korea'
results.loc[results[results['away_team'].str.contains('North Korea')].index, 'away_team'] = 'North Korea'

rankings.loc[rankings[rankings['country_full'].str.contains('USA')].index, 'country_full'] = 'United States'
results.loc[results[results['home_team'].str.contains('USA')].index, 'home_team'] = 'United States'
results.loc[results[results['away_team'].str.contains('USA')].index, 'away_team'] = 'United States'

rankings.loc[rankings[rankings['country_full'].str.contains('Ivoire')].index, 'country_full'] = 'Ivory Coast'
results.loc[results[results['home_team'].str.contains('Ivoire')].index, 'home_team'] = 'Ivory Coast'
results.loc[results[results['away_team'].str.contains('Ivoire')].index, 'away_team'] = 'Ivory Coast'

rankings.loc[rankings[rankings['country_full'].str.contains('Kitts and Nevis')].index, 'country_full'] = 'St. Kitts and Nevis'
results.loc[results[results['home_team'].str.contains('Kitts and Nevis')].index, 'home_team'] = 'St. Kitts and Nevis'
results.loc[results[results['away_team'].str.contains('Kitts and Nevis')].index, 'away_team'] = 'St. Kitts and Nevis'

rankings.loc[rankings[rankings['country_full'].str.contains('Korea Republic')].index, 'country_full'] = 'South Korea'
results.loc[results[results['home_team'].str.contains('Korea Republic')].index, 'home_team'] = 'South Korea'
results.loc[results[results['away_team'].str.contains('Korea Republic')].index, 'away_team'] = 'South Korea'

rankings.loc[rankings[rankings['country_full'].str.contains('St. Vin')].index, 'country_full'] = 'St. Vincent and the Grenadines'
results.loc[results[results['home_team'].str.contains('Vincent')].index, 'home_team'] = 'St. Vincent and the Grenadines'
results.loc[results[results['away_team'].str.contains('Vincent')].index, 'away_team'] = 'St. Vincent and the Grenadines'

rankings.loc[rankings[rankings['country_full'].str.contains('Iran')].index, 'country_full'] = 'Iran'
results.loc[results[results['home_team'].str.contains('Iran')].index, 'home_team'] = 'Iran'
results.loc[results[results['away_team'].str.contains('Iran')].index, 'away_team'] = 'Iran'

rankings.loc[rankings[rankings['country_full'].str.contains('Kyrgyz')].index, 'country_full'] = 'Kyrgyzstan'
results.loc[results[results['home_team'].str.contains('Kyrgyz')].index, 'home_team'] = 'Kyrgyzstan'
results.loc[results[results['away_team'].str.contains('Kyrgyz')].index, 'away_team'] = 'Kyrgyzstan'

results.loc[results[(results['home_team'] == 'Serbia') & (results['datetime'] < '2006-07-12 00:00:00')].index, 'home_team'] = 'Serbia and Montenegro'
results.loc[results[(results['away_team'] == 'Serbia') & (results['datetime'] < '2006-07-12 00:00:00')].index, 'away_team'] = 'Serbia and Montenegro'

rankings.loc[rankings[rankings['country_full'].str.contains('Lucia')].index, 'country_full'] = 'St. Lucia'
results.loc[results[results['home_team'].str.contains('Lucia')].index, 'home_team'] = 'St. Lucia'
results.loc[results[results['away_team'].str.contains('Lucia')].index, 'away_team'] = 'St. Lucia'

rankings.loc[rankings[rankings['country_full'].str.contains('Netherlands Antilles')].index, 'country_full'] = 'Curaçao'

rankings.loc[rankings[rankings['country_full'].str.contains('Verde')].index, 'country_full'] = 'Cape Verde'

rankings.loc[rankings[rankings['country_full'].str.contains('Brunei')].index, 'country_full'] = 'Brunei'

results.loc[results[results['home_team'].str.contains('United States Virgin Islands')].index, 'home_team'] = 'US Virgin Islands'
results.loc[results[results['away_team'].str.contains('United States Virgin Islands')].index, 'away_team'] = 'US Virgin Islands'

rankings.loc[rankings[rankings['country_full'].str.contains('Timor-Leste')].index, 'country_full'] = 'East Timor'

results.loc[results[results['home_team'].str.contains('Taiwan')].index, 'home_team'] = 'Chinese Taipei'
results.loc[results[results['away_team'].str.contains('Taiwan')].index, 'away_team'] = 'Chinese Taipei'

results.loc[results[results['home_team'].str.contains('Timor-Leste')].index, 'home_team'] = 'East Timor'
results.loc[results[results['away_team'].str.contains('Timor-Leste')].index, 'away_team'] = 'East Timor'

In [160]:
results_dict = results.to_dict('records')
results_dict[0]

{'date': '1993-01-01',
 'home_team': 'Ghana',
 'away_team': 'Mali',
 'home_score': 1.0,
 'away_score': 1.0,
 'tournament': 'Friendly',
 'city': 'Libreville',
 'country': 'Gabon',
 'neutral': True,
 'home_rank': 0,
 'away_rank': 0,
 'datetime': Timestamp('1993-01-01 00:00:00')}

In [161]:
rankings_dict = rankings.to_dict('records')
rankings_dict[0:2]

[{'rank': 82.0,
  'country_full': 'Iraq',
  'rank_date': '12/31/1992',
  'datetime': Timestamp('1992-12-31 00:00:00')},
 {'rank': 107.0,
  'country_full': 'Mozambique',
  'rank_date': '12/31/1992',
  'datetime': Timestamp('1992-12-31 00:00:00')}]

In [162]:
def home_rank(row):
    try:
        hometeam = row['home_team']
        matchday = row['datetime']

        hometeam_rankings = []
        for i in rankings_dict:
            if i['country_full'] == hometeam:
                hometeam_rankings.append(i)

        previous_rankings = []
        for i in hometeam_rankings:
            if i['datetime'] <= matchday:
                previous_rankings.append(i)
        # print(hometeam), print(matchday)

        return previous_rankings[-1]['rank']

    except:
        print(hometeam), print(matchday)
        return -1

In [163]:
results['home_rank'] = results.apply(home_rank, axis=1)

Serbia and Montenegro
1995-03-31 00:00:00
Serbia and Montenegro
1995-05-31 00:00:00
Serbia and Montenegro
1996-03-27 00:00:00
Serbia and Montenegro
1996-04-24 00:00:00
Serbia and Montenegro
1996-06-02 00:00:00
Serbia and Montenegro
1996-11-10 00:00:00
Serbia and Montenegro
1997-03-12 00:00:00
Serbia and Montenegro
1997-04-30 00:00:00
Serbia and Montenegro
1997-06-08 00:00:00
Serbia and Montenegro
1997-11-15 00:00:00
US Virgin Islands
1998-03-22 00:00:00
Serbia and Montenegro
1998-04-22 00:00:00
Serbia and Montenegro
1998-05-29 00:00:00
Serbia and Montenegro
1998-06-03 00:00:00
Serbia and Montenegro
1998-06-14 00:00:00
Serbia and Montenegro
1998-09-02 00:00:00
Serbia and Montenegro
1998-11-18 00:00:00
Serbia and Montenegro
1999-06-08 00:00:00
Serbia and Montenegro
1999-08-18 00:00:00
Serbia and Montenegro
1999-09-05 00:00:00
Serbia and Montenegro
2000-03-28 00:00:00
Serbia and Montenegro
2000-06-13 00:00:00
Serbia and Montenegro
2000-06-21 00:00:00
Serbia and Montenegro
2001-01-14 00:00

In [164]:
results[results['home_rank'] == -1].shape
# Serbia & Montenegro and US Virgin Islands errors fixed after running drops twice???

(33, 12)

In [165]:
def away_rank(row):
    try:
        awayteam = row['away_team']
        matchday = row['datetime']

        awayteam_rankings = []
        for i in rankings_dict:
            if i['country_full'] == awayteam:
                awayteam_rankings.append(i)

        previous_rankings = []
        for i in awayteam_rankings:
            if i['datetime'] <= matchday:
                previous_rankings.append(i)
        # print(hometeam), print(matchday)

        return previous_rankings[-1]['rank']

    except:
        print(awayteam), print(matchday)
        return -1

In [166]:
results['away_rank'] = results.apply(away_rank, axis=1)

Serbia and Montenegro
1994-12-23 00:00:00
Serbia and Montenegro
1994-12-27 00:00:00
Serbia and Montenegro
1995-02-04 00:00:00
Serbia and Montenegro
1995-09-20 00:00:00
Serbia and Montenegro
1995-11-12 00:00:00
Serbia and Montenegro
1995-11-16 00:00:00
Serbia and Montenegro
1996-05-23 00:00:00
Serbia and Montenegro
1996-05-26 00:00:00
Serbia and Montenegro
1996-10-06 00:00:00
Serbia and Montenegro
1996-12-14 00:00:00
Serbia and Montenegro
1996-12-28 00:00:00
Serbia and Montenegro
1997-02-07 00:00:00
Serbia and Montenegro
1997-04-02 00:00:00
Serbia and Montenegro
1997-06-12 00:00:00
Serbia and Montenegro
1997-06-14 00:00:00
Serbia and Montenegro
1997-06-16 00:00:00
Serbia and Montenegro
1997-08-20 00:00:00
Serbia and Montenegro
1997-09-10 00:00:00
Serbia and Montenegro
1997-10-11 00:00:00
Serbia and Montenegro
1997-10-29 00:00:00
Serbia and Montenegro
1998-01-28 00:00:00
Serbia and Montenegro
1998-02-24 00:00:00
Serbia and Montenegro
1998-03-25 00:00:00
Serbia and Montenegro
1998-06-06 0

In [167]:
results[results['away_rank'] == -1].shape
# Serbia & Montenegro and US Virgin Islands errors fixed after running drops twice???

(61, 12)

In [168]:
#  rankings[rankings['country_full'].str.contains('Serbia and Montenegro')]
#  checks for errors when assigning ranks

In [169]:
#  results[(results['home_team'] == 'Serbia and Montenegro') & (results['datetime'] < '2002-12-18 00:00:00')]
#  checks for errors when assigning ranks

In [170]:
#  results[results['away_team'].str.contains('Wallis Islands and Futuna')]
#  checks for errors when assigning ranks

In [171]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank,datetime
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True,39.0,69.0,1993-01-01
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False,55.0,97.0,1993-01-02
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False,71.0,161.0,1993-01-02
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True,97.0,69.0,1993-01-03
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False,55.0,39.0,1993-01-03


In [172]:
results['neutral'] = results['neutral'].map({True: 1, False: 0})

In [173]:
results.isnull().sum()

date          0
home_team     0
away_team     0
home_score    1
away_score    1
tournament    0
city          0
country       0
neutral       0
home_rank     0
away_rank     0
datetime      0
dtype: int64

In [174]:
results[results['home_score'].isnull()]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank,datetime
25555,2021-11-19,Sri Lanka,Seychelles,,,Mahinda Rajapaksa Cup,Columbo,Sri Lanka,0,204.0,197.0,2021-11-19


In [175]:
results.dropna(inplace=True)

In [176]:
results['home_score'] = results['home_score'].astype(int)
results['away_score'] = results['away_score'].astype(int)
results['home_rank'] = results['home_rank'].astype(int)
results['away_rank'] = results['away_rank'].astype(int)

In [177]:
results['home_team'].nunique()

211

In [178]:
results['away_team'].nunique()

211

In [179]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank,datetime
0,1993-01-01,Ghana,Mali,1,1,Friendly,Libreville,Gabon,1,39,69,1993-01-01
1,1993-01-02,Gabon,Burkina Faso,1,1,Friendly,Libreville,Gabon,0,55,97,1993-01-02
2,1993-01-02,Kuwait,Lebanon,2,0,Friendly,Kuwait City,Kuwait,0,71,161,1993-01-02
3,1993-01-03,Burkina Faso,Mali,1,0,Friendly,Libreville,Gabon,1,97,69,1993-01-03
4,1993-01-03,Gabon,Ghana,2,3,Friendly,Libreville,Gabon,0,55,39,1993-01-03


In [180]:
# dropping games with more than 10 goals scored by one team (decided after EDA after seeing heavy outliers)
results.drop(results[results['home_score'] > 10].index, inplace=True)
results.drop(results[results['away_score'] > 10].index, inplace=True)

In [181]:
results.drop(columns=['city', 'country'], inplace=True)  # neutral column covers this information

In [182]:
results = results.rename(columns={'tournament': 'match_type'})

In [183]:
results.loc[results[results['match_type'] == 'FIFA World Cup qualification'].index, 'match_type'] = 'FIFA_WCQ'
results.loc[results[results['match_type'].str.contains('FIFA World Cup')].index, 'match_type'] = 'FIFA_WC'
results.loc[results[results['match_type'].str.contains('qualification')].index, 'match_type'] = 'Qualifier'
results.loc[results[results['match_type'].str.contains('Cup')].index, 'match_type'] = 'Tournament/Cup_Match'
results.loc[results[results['match_type'].str.contains('Copa')].index, 'match_type'] = 'Tournament/Cup_Match'
results.loc[results[results['match_type'].str.contains('Tournament')].index, 'match_type'] = 'Tournament/Cup_Match'
results.loc[results[results['match_type'].str.contains('Championship')].index, 'match_type'] = 'Tournament/Cup_Match'
results.loc[results[results['match_type'].str.contains('League')].index, 'match_type'] = 'Tournament/Cup_Match'
results.loc[results[results['match_type'].str.contains('Games')].index, 'match_type'] = 'Tournament/Cup_Match'
results.loc[results[results['match_type'].str.contains('UEFA Euro')].index, 'match_type'] = 'Tournament/Cup_Match'
results.loc[results[results['match_type'].str.contains('Festival')].index, 'match_type'] = 'Tournament/Cup_Match'
results.loc[results[results['match_type'].str.contains('Tournoi')].index, 'match_type'] = 'Tournament/Cup_Match'

In [184]:
results['match_type'].value_counts()

Friendly                8589
FIFA_WCQ                5629
Tournament/Cup_Match    5116
Qualifier               4093
FIFA_WC                  436
Name: match_type, dtype: int64

In [185]:
results = pd.get_dummies(results, columns=['match_type'], drop_first=True)

In [186]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,neutral,home_rank,away_rank,datetime,match_type_FIFA_WCQ,match_type_Friendly,match_type_Qualifier,match_type_Tournament/Cup_Match
0,1993-01-01,Ghana,Mali,1,1,1,39,69,1993-01-01,0,1,0,0
1,1993-01-02,Gabon,Burkina Faso,1,1,0,55,97,1993-01-02,0,1,0,0
2,1993-01-02,Kuwait,Lebanon,2,0,0,71,161,1993-01-02,0,1,0,0
3,1993-01-03,Burkina Faso,Mali,1,0,1,97,69,1993-01-03,0,1,0,0
4,1993-01-03,Gabon,Ghana,2,3,0,55,39,1993-01-03,0,1,0,0


In [187]:
results.drop(results[(results['datetime'] > '2021-09-01 00:00:00')].index, inplace=True)
# These results contain matches we are looking to predict

In [188]:
results.to_csv('./data/cleaned_results_and_rankings.csv', index=False)

---

### Upcoming Fixtures

In [189]:
qualifying_fixtures.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,2021-09-02,Canada,Honduras,1.0,1.0,FIFA World Cup qualification,Toronto,Canada,False
1,2021-09-02,Panama,Costa Rica,0.0,0.0,FIFA World Cup qualification,Panama City,Panama,False
2,2021-09-02,Mexico,Jamaica,2.0,1.0,FIFA World Cup qualification,Mexico City,Mexico,False
3,2021-09-02,El Salvador,United States,0.0,0.0,FIFA World Cup qualification,San Salvador,El Salvador,False
4,2021-09-05,Jamaica,Panama,0.0,3.0,FIFA World Cup qualification,Kingston,Jamaica,False


In [190]:
qualifying_fixtures['home_rank'] = 0
qualifying_fixtures['away_rank'] = 0

qualifying_fixtures['datetime'] = pd.to_datetime(qualifying_fixtures['date'])

In [191]:
qualifying_fixtures['home_rank'] = qualifying_fixtures.apply(home_rank, axis=1)

In [192]:
qualifying_fixtures['away_rank'] = qualifying_fixtures.apply(away_rank, axis=1)

In [193]:
qualifying_fixtures['datetime'].head()

0   2021-09-02
1   2021-09-02
2   2021-09-02
3   2021-09-02
4   2021-09-05
Name: datetime, dtype: datetime64[ns]

In [194]:
qualifying_fixtures[qualifying_fixtures['home_score'].isnull()]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank,datetime
32,2022-01-26,Jamaica,Mexico,,,FIFA World Cup qualification,,,,57.0,14.0,2022-01-26
33,2022-01-26,Honduras,Canada,,,FIFA World Cup qualification,,,,76.0,40.0,2022-01-26
34,2022-01-26,United States,El Salvador,,,FIFA World Cup qualification,,,,123.0,69.0,2022-01-26
35,2022-01-26,Costa Rica,Panama,,,FIFA World Cup qualification,,,,49.0,63.0,2022-01-26
36,2022-01-29,Honduras,El Salvador,,,FIFA World Cup qualification,,,,76.0,69.0,2022-01-29
37,2022-01-29,Panama,Jamaica,,,FIFA World Cup qualification,,,,63.0,57.0,2022-01-29
38,2022-01-29,Canada,United States,,,FIFA World Cup qualification,,,,40.0,123.0,2022-01-29
39,2022-01-29,Mexico,Costa Rica,,,FIFA World Cup qualification,,,,14.0,49.0,2022-01-29
40,2022-02-01,Jamaica,Costa Rica,,,FIFA World Cup qualification,,,,57.0,49.0,2022-02-01
41,2022-02-01,United States,Honduras,,,FIFA World Cup qualification,,,,123.0,76.0,2022-02-01
