# Data Collection

In [559]:
import pandas as pd

---

### Results DF

In [560]:
results = pd.read_csv('./data/results.csv')
results.head()
# data from kaggle:  https://www.kaggle.com/martj42/international-football-results-from-1872-to-2017?select=results.csv

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [561]:
results.shape

(43045, 9)

In [562]:
results[results['date'] < '1992-12-31'].shape  # <- 1st date of FIFA rankings

(17489, 9)

In [563]:
results.drop(results[results['date'] < '1992-12-31'].index, inplace=True)

In [564]:
results.shape

(25556, 9)

In [565]:
results.reset_index(inplace=True)

In [566]:
results.drop(columns='index', inplace=True)
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False


---

### Upcoming Fixtures DF

In [567]:
qualifying_fixtures = pd.read_csv('./data/concacaf_wcq_fixtures.csv')
qualifying_fixtures.tail()
# data from FIFA site:  https://www.fifa.com/tournaments/mens/worldcup/qatar2022/qualifiers/concacaf

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
54,2022-03-29,Panama,Canada,,,FIFA World Cup qualification,,,
55,2022-03-29,Costa Rica,United States,,,FIFA World Cup qualification,,,
56,,,,,,,,,
57,,,,,,,,,
58,,,,,,,,,


In [568]:
qualifying_fixtures[qualifying_fixtures['date'].isnull()]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
56,,,,,,,,,
57,,,,,,,,,
58,,,,,,,,,


In [569]:
qualifying_fixtures.drop(index=qualifying_fixtures[qualifying_fixtures['date'].isnull()].index, inplace=True)

---

### Current Table DF

In [570]:
table = pd.read_csv('./data/current_table_12.14.2021.csv')
table
# data from FIFA site:  https://www.fifa.com/tournaments/mens/worldcup/qatar2022/qualifiers/concacaf

Unnamed: 0,rank,team_name,matches_played,wins,draws,losses,goals_for,goals_against,goal_differential,points
0,1,Canada,8,4,4,0,13,5,8,16
1,2,USA,8,4,3,1,12,5,7,15
2,3,Mexico,8,4,2,2,11,7,4,14
3,4,Panama,8,4,2,2,11,9,2,14
4,5,Costa Rica,8,2,3,3,6,7,-1,9
5,6,Jamaica,8,1,4,3,6,10,-4,7
6,7,El Salvador,8,1,3,4,4,10,-6,6
7,8,Honduras,8,0,3,5,5,15,-10,3


---

### World Rankings DF

In [571]:
past_rankings = pd.read_csv('./data/fifa_ranking-2021-05-27.csv')
past_rankings.head()
# data from kaggle:  https://www.kaggle.com/cashncarry/fifaworldranking

Unnamed: 0,id,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,43818.0,82,Iraq,IRQ,15.0,0.0,0.0,AFC,12/31/1992
1,43873.0,107,Mozambique,MOZ,9.0,0.0,0.0,CAF,12/31/1992
2,43816.0,108,Indonesia,IDN,9.0,0.0,0.0,AFC,12/31/1992
3,1882218.0,109,Antigua and Barbuda,ATG,8.0,0.0,0.0,CONCACAF,12/31/1992
4,43820.0,110,Jordan,JOR,8.0,0.0,0.0,AFC,12/31/1992


In [572]:
past_rankings = past_rankings[['rank', 'country_full', 'rank_date']]
past_rankings.head()

Unnamed: 0,rank,country_full,rank_date
0,82,Iraq,12/31/1992
1,107,Mozambique,12/31/1992
2,108,Indonesia,12/31/1992
3,109,Antigua and Barbuda,12/31/1992
4,110,Jordan,12/31/1992


In [573]:
past_rankings.shape

(63056, 3)

In [574]:
recent_rankings = pd.read_csv('./data/fifa_ranking_2021-11-19.csv')
recent_rankings.head()
# data from FIFA:  https://www.fifa.com/fifa-world-ranking/

Unnamed: 0,rank,country_full,rank_date
0,1.0,Belgium,2021-08-12
1,2.0,Brazil,2021-08-12
2,3.0,France,2021-08-12
3,4.0,England,2021-08-12
4,5.0,Italy,2021-08-12


In [575]:
recent_rankings.shape

(1678, 3)

In [576]:
rankings = pd.concat([past_rankings, recent_rankings])
rankings.shape

(64734, 3)

---

### Combining and Cleaning the Data

In [577]:
rankings.dropna(inplace=True)

In [588]:
results['home_rank'] = 0

In [589]:
results['away_rank'] = 0

In [590]:
results['datetime'] = pd.to_datetime(results['date'])
rankings['datetime'] = pd.to_datetime(rankings['rank_date'])

In [591]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank,datetime
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True,0,0,1993-01-01
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False,0,0,1993-01-02
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False,0,0,1993-01-02
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True,0,0,1993-01-03
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False,0,0,1993-01-03


In [611]:
results.shape

(25548, 12)

In [776]:
results.drop(results[(results['home_team'] == 'United Arab Emirates') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 8 rows of games for UAE because they do not have a rank prior to 1993-08-08 in the dataframe

results.drop(results[(results['home_team'] == 'Netherlands') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 3 rows of games for Netherlands because they do not have a rank prior to 1993-08-08 in the dataframe

results.drop(results[results['home_team'] == 'Martinique'].index, inplace=True)
# Martinique is not a member of FIFA and therefore will have no rankings

results.drop(results[(results['home_team'] == 'Guyana') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 3 rows of games for Guyana because they do not have a rank prior to 1993-08-08 in the dataframe

results.drop(results[(results['home_team'] == 'Cayman Islands') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 1 row of games for Cayman Islands because they do not have a rank prior to 1993-08-08 in the dataframe

results.drop(results[results['home_team'] == 'French Guiana'].index, inplace=True)
# French Guiana is not a member of FIFA and therefore will have no rankings

results.drop(results[(results['home_team'] == 'Slovakia') & (results['datetime'] < '1993-11-19 00:00:00')].index, inplace=True)
# dropping 2 rows of games for Slovakia because they do not have a rank prior to 1993-11-19 in the dataframe

results.drop(results[(results['home_team'] == 'Thailand') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 5 rows of games for Thailand because they do not have a rank prior to 1993-08-08 in the dataframe

results.drop(results[(results['home_team'] == 'Anguilla') & (results['datetime'] < '1997-05-14 00:00:00')].index, inplace=True)
# dropping 3 rows of games for Angiulla because they do not have a rank prior to 1997-05-14 in the dataframe

results.drop(results[(results['home_team'] == 'Sri Lanka') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 4 rows of games for Sri Lanka because they do not have a rank prior to 1993-08-08 in the dataframe

results.drop(results[(results['home_team'] == 'Vietnam') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 4 rows of games for Vietnam because they do not have a rank prior to 1993-08-08 in the dataframe

results.drop(results[results['home_team'] == 'Guadeloupe'].index, inplace=True)
# Guadeloupe is not a member of FIFA and therefore will have no rankings

results.drop(results[(results['home_team'] == 'Latvia') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 5 rows of games for Latvia because they do not have a rank prior to 1993-08-08 in the dataframe

results.drop(results[(results['home_team'] == 'Dominica') & (results['datetime'] < '1995-02-20 00:00:00')].index, inplace=True)
# dropping 3 rows of games for Dominica because they do not have a rank prior to 1995-02-20 in the dataframe

results.drop(results[(results['home_team'] == 'Czech Republic') & (results['datetime'] < '1994-03-15 00:00:00')].index, inplace=True)
# dropping 3 rows of games for Czech Republic because they do not have a rank prior to 1994-03-15 in the dataframe

results.drop(results[(results['home_team'] == 'Macau') & (results['datetime'] < '1993-08-08 00:00:00')].index, inplace=True)
# dropping 3 rows of games for Latvia because they do not have a rank prior to 1993-08-08 in the dataframe

In [777]:
# Ensuring country name matches in rankings and results dataframes

rankings.loc[rankings[rankings['country_full'].str.contains('Congo')].index, 'country_full'] = 'Congo'
results.loc[results[results['home_team'].str.contains('Congo')].index, 'home_team'] = 'Congo'

rankings.loc[rankings[rankings['country_full'].str.contains('Swaziland')].index, 'country_full'] = 'Eswatini'
rankings.loc[rankings[rankings['country_full'].str.contains('Eswatini')].index, 'country_full'] = 'Eswatini'
results.loc[results[results['home_team'].str.contains('Eswatini')].index, 'home_team'] = 'Eswatini'

rankings.loc[rankings[rankings['country_full'].str.contains('Korea DPR')].index, 'country_full'] = 'North Korea'
results.loc[results[results['home_team'].str.contains('North Korea')].index, 'home_team'] = 'North Korea'

rankings.loc[rankings[rankings['country_full'].str.contains('USA')].index, 'country_full'] = 'United States'
results.loc[results[results['home_team'].str.contains('USA')].index, 'home_team'] = 'United States'

rankings.loc[rankings[rankings['country_full'].str.contains('Ivoire')].index, 'country_full'] = 'Ivory Coast'
results.loc[results[results['home_team'].str.contains('Ivoire')].index, 'home_team'] = 'Ivory Coast'

rankings.loc[rankings[rankings['country_full'].str.contains('Kitts and Nevis')].index, 'country_full'] = 'St. Kitts and Nevis'
results.loc[results[results['home_team'].str.contains('Kitts and Nevis')].index, 'home_team'] = 'St. Kitts and Nevis'

rankings.loc[rankings[rankings['country_full'].str.contains('Korea Republic')].index, 'country_full'] = 'South Korea'
results.loc[results[results['home_team'].str.contains('Korea Republic')].index, 'home_team'] = 'South Korea'

rankings.loc[rankings[rankings['country_full'].str.contains('St. Vin')].index, 'country_full'] = 'St. Vincent and the Grenadines'
results.loc[results[results['home_team'].str.contains('Vincent')].index, 'home_team'] = 'St. Vincent and the Grenadines'

In [778]:
results_dict = results.to_dict('records')
results_dict[0]

{'date': '1993-01-01',
 'home_team': 'Ghana',
 'away_team': 'Mali',
 'home_score': 1.0,
 'away_score': 1.0,
 'tournament': 'Friendly',
 'city': 'Libreville',
 'country': 'Gabon',
 'neutral': True,
 'home_rank': 0,
 'away_rank': 0,
 'datetime': Timestamp('1993-01-01 00:00:00')}

In [779]:
rankings_dict = rankings.to_dict('records')
rankings_dict[0:2]

[{'rank': 82.0,
  'country_full': 'Iraq',
  'rank_date': '12/31/1992',
  'datetime': Timestamp('1992-12-31 00:00:00')},
 {'rank': 107.0,
  'country_full': 'Mozambique',
  'rank_date': '12/31/1992',
  'datetime': Timestamp('1992-12-31 00:00:00')}]

In [780]:
# row = results_dict[-1]
def home_rank(row):
    hometeam = row['home_team']
    hometeam

    matchday = row['datetime']
    matchday

    hometeam_rankings = []
    for i in rankings_dict:
        if i['country_full'] == hometeam:
            hometeam_rankings.append(i)

    hometeam_rankings

    previous_rankings = []
    for i in hometeam_rankings:
        if i['datetime'] <= matchday:
            previous_rankings.append(i)
    print(hometeam), print(matchday), print(previous_rankings)
    previous_rankings

    return previous_rankings[-1]['rank']

In [781]:
results.apply(home_rank, axis=1)

Ghana
1993-01-01 00:00:00
[{'rank': 39.0, 'country_full': 'Ghana', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Gabon
1993-01-02 00:00:00
[{'rank': 55.0, 'country_full': 'Gabon', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Kuwait
1993-01-02 00:00:00
[{'rank': 71.0, 'country_full': 'Kuwait', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Burkina Faso
1993-01-03 00:00:00
[{'rank': 97.0, 'country_full': 'Burkina Faso', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Gabon
1993-01-03 00:00:00
[{'rank': 55.0, 'country_full': 'Gabon', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Uganda
1993-01-08 00:00:00
[{'rank': 92.0, 'country_full': 'Uganda', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Sierra Leone
1993-01-09 00:00:00
[{'rank': 79.0, 'country_full': 'Sierra Leone', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-

Costa Rica
1993-02-19 00:00:00
[{'rank': 37.0, 'country_full': 'Costa Rica', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Finland
1993-02-20 00:00:00
[{'rank': 44.0, 'country_full': 'Finland', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Estonia
1993-02-21 00:00:00
[{'rank': 132.0, 'country_full': 'Estonia', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Finland
1993-02-21 00:00:00
[{'rank': 44.0, 'country_full': 'Finland', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
United States
1993-02-21 00:00:00
[{'rank': 106.0, 'country_full': 'United States', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}, {'rank': 24.0, 'country_full': 'United States', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Argentina
1993-02-24 00:00:00
[{'rank': 10.0, 'country_full': 'Argentina', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-

Algeria
1993-04-09 00:00:00
[{'rank': 30.0, 'country_full': 'Algeria', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Egypt
1993-04-09 00:00:00
[{'rank': 21.0, 'country_full': 'Egypt', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
North Korea
1993-04-09 00:00:00
[{'rank': 7.0, 'country_full': 'North Korea', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}, {'rank': 77.0, 'country_full': 'North Korea', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Oman
1993-04-09 00:00:00
[{'rank': 116.0, 'country_full': 'Oman', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Qatar
1993-04-09 00:00:00
[{'rank': 52.0, 'country_full': 'Qatar', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
South Africa
1993-04-10 00:00:00
[{'rank': 124.0, 'country_full': 'South Africa', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Canad

Republic of Ireland
1993-04-28 00:00:00
[{'rank': 6.0, 'country_full': 'Republic of Ireland', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
South Korea
1993-04-28 00:00:00
[{'rank': 99.0, 'country_full': 'South Korea', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}, {'rank': 49.0, 'country_full': 'South Korea', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Norway
1993-04-28 00:00:00
[{'rank': 14.0, 'country_full': 'Norway', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Poland
1993-04-28 00:00:00
[{'rank': 20.0, 'country_full': 'Poland', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Portugal
1993-04-28 00:00:00
[{'rank': 33.0, 'country_full': 'Portugal', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-31 00:00:00')}]
Russia
1993-04-28 00:00:00
[{'rank': 8.0, 'country_full': 'Russia', 'rank_date': '12/31/1992', 'datetime': Timestamp('1992-12-

IndexError: list index out of range

In [774]:
rankings[rankings['country_full'].str.contains('Macau')]

Unnamed: 0,rank,country_full,rank_date,datetime
224,167.0,Macau,8/8/1993,1993-08-08
388,166.0,Macau,9/23/1993,1993-09-23
554,166.0,Macau,10/22/1993,1993-10-22
719,167.0,Macau,11/19/1993,1993-11-19
936,166.0,Macau,12/23/1993,1993-12-23
...,...,...,...,...
62952,182.0,Macau,5/27/2021,2021-05-27
182,183.0,Macau,2021-08-12,2021-08-12
391,182.0,Macau,2021-09-16,2021-09-16
601,182.0,Macau,2021-10-21,2021-10-21


In [775]:
results[(results['home_team'] == 'Macau') & (results['datetime'] < '1993-08-08 00:00:00')]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank,datetime
269,1993-05-01,Macau,Saudi Arabia,0.0,6.0,FIFA World Cup qualification,Kuala Lumpur,Malaysia,True,0,0,1993-05-01
279,1993-05-03,Macau,Kuwait,1.0,10.0,FIFA World Cup qualification,Kuala Lumpur,Malaysia,True,0,0,1993-05-03
311,1993-05-18,Macau,Malaysia,0.0,5.0,FIFA World Cup qualification,Ta'if,Saudi Arabia,True,0,0,1993-05-18


In [723]:
results[results['home_team'] == 'Guadeloupe'].shape

(68, 12)