# Data Collection

In [17]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup

---

### Results DF

In [18]:
results = pd.read_csv('./data/results.csv')
results.head()
# data from kaggle:  https://www.kaggle.com/martj42/international-football-results-from-1872-to-2017?select=results.csv

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [19]:
results.shape

(43045, 9)

In [165]:
results[results['date'] < '1992-12-31'].shape  # <- 1st date of FIFA rankings

(17489, 12)

In [167]:
results.drop(results[results['date'] < '1992-12-31'].index, inplace=True)

In [168]:
results.shape

(25556, 12)

In [218]:
results.reset_index(inplace=True)

In [220]:
results.drop(columns='index', inplace=True)
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,datetime,home_rank,away_rank
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True,1993-01-01,0,0
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False,1993-01-02,0,0
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False,1993-01-02,0,0
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True,1993-01-03,0,0
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False,1993-01-03,0,0


---

### Upcoming Fixtures DF

In [221]:
qualifying_fixtures = pd.read_csv('./data/concacaf_wcq_fixtures.csv')
qualifying_fixtures.head()
# data from FIFA site:  https://www.fifa.com/tournaments/mens/worldcup/qatar2022/qualifiers/concacaf

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,2021-09-02,Canada,Honduras,1.0,1.0,FIFA World Cup qualification,Toronto,Canada,False
1,2021-09-02,Panama,Costa Rica,0.0,0.0,FIFA World Cup qualification,Panama City,Panama,False
2,2021-09-02,Mexico,Jamaica,2.0,1.0,FIFA World Cup qualification,Mexico City,Mexico,False
3,2021-09-02,El Salvador,United States,0.0,0.0,FIFA World Cup qualification,San Salvador,El Salvador,False
4,2021-09-05,Jamaica,Panama,0.0,3.0,FIFA World Cup qualification,Kingston,Jamaica,False


---

### Current Table DF

In [222]:
table = pd.read_csv('./data/current_table_12.14.2021.csv')
table
# data from FIFA site:  https://www.fifa.com/tournaments/mens/worldcup/qatar2022/qualifiers/concacaf

Unnamed: 0,rank,team_name,matches_played,wins,draws,losses,goals_for,goals_against,goal_differential,points
0,1,Canada,8,4,4,0,13,5,8,16
1,2,USA,8,4,3,1,12,5,7,15
2,3,Mexico,8,4,2,2,11,7,4,14
3,4,Panama,8,4,2,2,11,9,2,14
4,5,Costa Rica,8,2,3,3,6,7,-1,9
5,6,Jamaica,8,1,4,3,6,10,-4,7
6,7,El Salvador,8,1,3,4,4,10,-6,6
7,8,Honduras,8,0,3,5,5,15,-10,3


---

### World Rankings DF

In [223]:
past_rankings = pd.read_csv('./data/fifa_ranking-2021-05-27.csv')
past_rankings.head()
# data from kaggle:  https://www.kaggle.com/cashncarry/fifaworldranking

Unnamed: 0,id,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,43818,82,Iraq,IRQ,15,0,0,AFC,1992-12-31
1,43873,107,Mozambique,MOZ,9,0,0,CAF,1992-12-31
2,43816,108,Indonesia,IDN,9,0,0,AFC,1992-12-31
3,1882218,109,Antigua and Barbuda,ATG,8,0,0,CONCACAF,1992-12-31
4,43820,110,Jordan,JOR,8,0,0,AFC,1992-12-31


In [224]:
past_rankings = rankings[['rank', 'country_full', 'rank_date']]
past_rankings.head()

Unnamed: 0,rank,country_full,rank_date
0,82.0,Iraq,1992-12-31
1,107.0,Mozambique,1992-12-31
2,108.0,Indonesia,1992-12-31
3,109.0,Antigua and Barbuda,1992-12-31
4,110.0,Jordan,1992-12-31


In [225]:
past_rankings.shape

(64734, 3)

In [226]:
recent_rankings = pd.read_csv('./data/fifa_ranking_2021-11-19.csv')
recent_rankings.head()
# data from FIFA:  https://www.fifa.com/fifa-world-ranking/

Unnamed: 0,rank,country_full,rank_date
0,1.0,Belgium,2021-08-12
1,2.0,Brazil,2021-08-12
2,3.0,France,2021-08-12
3,4.0,England,2021-08-12
4,5.0,Italy,2021-08-12


In [227]:
recent_rankings.shape

(1678, 3)

In [228]:
rankings = pd.concat([past_rankings, recent_rankings])
rankings.shape

(66412, 3)

---

### Combining the Data

In [229]:
rankings.dropna(inplace=True)

In [230]:
Colombia_rankings = rankings[rankings['country_full'] == 'Colombia'].copy()

In [231]:
results.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,datetime,home_rank,away_rank
25551,2021-11-16,Venezuela,Peru,1.0,2.0,FIFA World Cup qualification,Caracas,Venezuela,False,2021-11-16,0,0
25552,2021-11-16,Colombia,Paraguay,0.0,0.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-11-16,0,0
25553,2021-11-16,Argentina,Brazil,0.0,0.0,FIFA World Cup qualification,San Juan,Argentina,False,2021-11-16,0,0
25554,2021-11-16,Chile,Ecuador,0.0,2.0,FIFA World Cup qualification,Santiago,Chile,False,2021-11-16,0,0
25555,2021-11-19,Sri Lanka,Seychelles,,,Mahinda Rajapaksa Cup,Columbo,Sri Lanka,False,2021-11-19,0,0


In [232]:
Colombia_results = results[results['home_team'] == 'Colombia'].copy()

In [233]:
Colombia_results['datetime'] = pd.to_datetime(Colombia_results['date'])
Colombia_rankings['datetime'] = pd.to_datetime(Colombia_rankings['rank_date'])

In [234]:
Colombia_rankings.tail()

Unnamed: 0,rank,country_full,rank_date,datetime
645,16.0,Colombia,2021-11-19,2021-11-19
14,15.0,Colombia,2021-08-12,2021-08-12
225,16.0,Colombia,2021-09-16,2021-09-16
435,16.0,Colombia,2021-10-21,2021-10-21
645,16.0,Colombia,2021-11-19,2021-11-19


In [235]:
Colombia_results.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,datetime,home_rank,away_rank
24950,2021-06-20,Colombia,Peru,1.0,2.0,Copa América,Goiânia,Brazil,True,2021-06-20,0,0
25242,2021-09-09,Colombia,Chile,3.0,1.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-09-09,0,0
25341,2021-10-10,Colombia,Brazil,0.0,0.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-10-10,0,0
25404,2021-10-14,Colombia,Ecuador,0.0,0.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-10-14,0,0
25552,2021-11-16,Colombia,Paraguay,0.0,0.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-11-16,0,0


In [237]:
Colombia_rankings[Colombia_rankings.index == 62906]

Unnamed: 0,rank,country_full,rank_date,datetime
62906,15.0,Colombia,2021-05-27,2021-05-27


In [239]:
# Colombia_results.loc[Colombia_results.index==42439, 'home_rank'] = Colombia_rankings[Colombia_rankings['datetime'] < Colombia_results.loc[42439, 'datetime']].tail(1)['rank'].values[0]

In [240]:
results['home_rank'] = 0

In [241]:
results['away_rank'] = 0

In [242]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,datetime,home_rank,away_rank
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True,1993-01-01,0,0
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False,1993-01-02,0,0
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False,1993-01-02,0,0
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True,1993-01-03,0,0
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False,1993-01-03,0,0


In [243]:
results_dict = results.to_dict('records')
results_dict[0:2]

[{'date': '1993-01-01',
  'home_team': 'Ghana',
  'away_team': 'Mali',
  'home_score': 1.0,
  'away_score': 1.0,
  'tournament': 'Friendly',
  'city': 'Libreville',
  'country': 'Gabon',
  'neutral': True,
  'datetime': Timestamp('1993-01-01 00:00:00'),
  'home_rank': 0,
  'away_rank': 0},
 {'date': '1993-01-02',
  'home_team': 'Gabon',
  'away_team': 'Burkina Faso',
  'home_score': 1.0,
  'away_score': 1.0,
  'tournament': 'Friendly',
  'city': 'Libreville',
  'country': 'Gabon',
  'neutral': False,
  'datetime': Timestamp('1993-01-02 00:00:00'),
  'home_rank': 0,
  'away_rank': 0}]

In [244]:
rankings_dict = rankings.to_dict('records')
rankings_dict[0:2]

[{'rank': 82.0, 'country_full': 'Iraq', 'rank_date': '1992-12-31'},
 {'rank': 107.0, 'country_full': 'Mozambique', 'rank_date': '1992-12-31'}]

In [198]:
Colombia_rankings[Colombia_rankings['datetime'] < Colombia_results.loc[42439, 'datetime']].tail(1)['rank'].values

array([15.])

In [247]:
results['datetime'] = pd.to_datetime(results['date'])
rankings['datetime'] = pd.to_datetime(rankings['rank_date'])

In [259]:
results.iloc[0]

date                   1993-01-01
home_team                   Ghana
away_team                    Mali
home_score                    1.0
away_score                    1.0
tournament               Friendly
city                   Libreville
country                     Gabon
neutral                      True
datetime      1993-01-01 00:00:00
home_rank                       0
away_rank                       0
Name: 0, dtype: object

In [275]:
def home_rank():
    rankings_dates = []
    ranks = []
    for g in results:
        hometeam = results['home_team']
        matchday = results['datetime']
        for i in rankings:
            if (i['country_full'] == hometeam) and (matchday > i['datetime']):
                rankings_dates.append(i['datetime'])
            current_rank_date = max(rankings_dates)
            if (i['datetime'] == current_rank_date) and (i['country_full'] == hometeam):
                ranks.append(i['rank'])
    return ranks

In [276]:
results['home_rank'].map(home_rank())

TypeError: string indices must be integers

In [296]:
def home_rank(game):
    try:
        matchday = game['datetime']
        hometeam = game['home_team']
        hometeam_rankings = rankings[rankings['country_full'] == hometeam]
        latest_rank = hometeam_rankings[hometeam_rankings['datetime'] <= matchday]['datetime'].tail(1).values[0]
        current_rank = rankings[rankings['datetime'] == latest_rank]['rank']
    except:
        current_rank = 0
    return current_rank

In [297]:
results['home_rank'] = results.apply(home_rank, axis=1)

ValueError: cannot reindex from a duplicate axis

In [None]:
results.head(60)

In [293]:
rankings[rankings['country_full'].str.contains('Botswana')]

Unnamed: 0,rank,country_full,rank_date,datetime
73,139.0,Botswana,1992-12-31,1992-12-31
257,135.0,Botswana,1993-08-08,1993-08-08
400,136.0,Botswana,1993-09-23,1993-09-23
566,137.0,Botswana,1993-10-22,1993-10-22
731,137.0,Botswana,1993-11-19,1993-11-19
...,...,...,...,...
778,149.0,Botswana,2021-11-19,2021-11-19
148,149.0,Botswana,2021-08-12,2021-08-12
359,150.0,Botswana,2021-09-16,2021-09-16
568,149.0,Botswana,2021-10-21,2021-10-21
