# Data Collection

In [12]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup

---

### Results DF

In [13]:
results = pd.read_csv('./data/results.csv')
results.head()
# data from kaggle:  https://www.kaggle.com/martj42/international-football-results-from-1872-to-2017?select=results.csv

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False


In [14]:
results.shape

(43045, 9)

In [15]:
results[results['date'] < '1992-12-31'].shape  # <- 1st date of FIFA rankings

(17489, 9)

In [16]:
results.drop(results[results['date'] < '1992-12-31'].index, inplace=True)

In [17]:
results.shape

(25556, 9)

In [18]:
results.reset_index(inplace=True)

In [19]:
results.drop(columns='index', inplace=True)
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False


---

### Upcoming Fixtures DF

In [20]:
qualifying_fixtures = pd.read_csv('./data/concacaf_wcq_fixtures.csv')
qualifying_fixtures.tail()
# data from FIFA site:  https://www.fifa.com/tournaments/mens/worldcup/qatar2022/qualifiers/concacaf

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
54,2022-03-29,Panama,Canada,,,FIFA World Cup qualification,,,
55,2022-03-29,Costa Rica,United States,,,FIFA World Cup qualification,,,
56,,,,,,,,,
57,,,,,,,,,
58,,,,,,,,,


In [23]:
qualifying_fixtures[qualifying_fixtures['date'].isnull()]

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
56,,,,,,,,,
57,,,,,,,,,
58,,,,,,,,,


In [24]:
qualifying_fixtures.drop(index=qualifying_fixtures[qualifying_fixtures['date'].isnull()].index, inplace=True)

---

### Current Table DF

In [25]:
table = pd.read_csv('./data/current_table_12.14.2021.csv')
table
# data from FIFA site:  https://www.fifa.com/tournaments/mens/worldcup/qatar2022/qualifiers/concacaf

Unnamed: 0,rank,team_name,matches_played,wins,draws,losses,goals_for,goals_against,goal_differential,points
0,1,Canada,8,4,4,0,13,5,8,16
1,2,USA,8,4,3,1,12,5,7,15
2,3,Mexico,8,4,2,2,11,7,4,14
3,4,Panama,8,4,2,2,11,9,2,14
4,5,Costa Rica,8,2,3,3,6,7,-1,9
5,6,Jamaica,8,1,4,3,6,10,-4,7
6,7,El Salvador,8,1,3,4,4,10,-6,6
7,8,Honduras,8,0,3,5,5,15,-10,3


---

### World Rankings DF

In [26]:
past_rankings = pd.read_csv('./data/fifa_ranking-2021-05-27.csv')
past_rankings.head()
# data from kaggle:  https://www.kaggle.com/cashncarry/fifaworldranking

Unnamed: 0,id,rank,country_full,country_abrv,total_points,previous_points,rank_change,confederation,rank_date
0,43818,82,Iraq,IRQ,15,0,0,AFC,1992-12-31
1,43873,107,Mozambique,MOZ,9,0,0,CAF,1992-12-31
2,43816,108,Indonesia,IDN,9,0,0,AFC,1992-12-31
3,1882218,109,Antigua and Barbuda,ATG,8,0,0,CONCACAF,1992-12-31
4,43820,110,Jordan,JOR,8,0,0,AFC,1992-12-31


In [27]:
past_rankings = past_rankings[['rank', 'country_full', 'rank_date']]
past_rankings.head()

Unnamed: 0,rank,country_full,rank_date
0,82,Iraq,1992-12-31
1,107,Mozambique,1992-12-31
2,108,Indonesia,1992-12-31
3,109,Antigua and Barbuda,1992-12-31
4,110,Jordan,1992-12-31


In [28]:
past_rankings.shape

(63054, 3)

In [29]:
recent_rankings = pd.read_csv('./data/fifa_ranking_2021-11-19.csv')
recent_rankings.head()
# data from FIFA:  https://www.fifa.com/fifa-world-ranking/

Unnamed: 0,rank,country_full,rank_date
0,1.0,Belgium,2021-08-12
1,2.0,Brazil,2021-08-12
2,3.0,France,2021-08-12
3,4.0,England,2021-08-12
4,5.0,Italy,2021-08-12


In [30]:
recent_rankings.shape

(1678, 3)

In [31]:
rankings = pd.concat([past_rankings, recent_rankings])
rankings.shape

(64732, 3)

---

### Combining the Data

In [32]:
rankings.dropna(inplace=True)

In [33]:
Colombia_rankings = rankings[rankings['country_full'] == 'Colombia'].copy()

In [34]:
results.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
25551,2021-11-16,Venezuela,Peru,1.0,2.0,FIFA World Cup qualification,Caracas,Venezuela,False
25552,2021-11-16,Colombia,Paraguay,0.0,0.0,FIFA World Cup qualification,Barranquilla,Colombia,False
25553,2021-11-16,Argentina,Brazil,0.0,0.0,FIFA World Cup qualification,San Juan,Argentina,False
25554,2021-11-16,Chile,Ecuador,0.0,2.0,FIFA World Cup qualification,Santiago,Chile,False
25555,2021-11-19,Sri Lanka,Seychelles,,,Mahinda Rajapaksa Cup,Columbo,Sri Lanka,False


In [35]:
Colombia_results = results[results['home_team'] == 'Colombia'].copy()

In [36]:
Colombia_results['datetime'] = pd.to_datetime(Colombia_results['date'])
Colombia_rankings['datetime'] = pd.to_datetime(Colombia_rankings['rank_date'])

In [37]:
Colombia_rankings.tail()

Unnamed: 0,rank,country_full,rank_date,datetime
62906,15.0,Colombia,2021-05-27,2021-05-27
14,15.0,Colombia,2021-08-12,2021-08-12
225,16.0,Colombia,2021-09-16,2021-09-16
435,16.0,Colombia,2021-10-21,2021-10-21
645,16.0,Colombia,2021-11-19,2021-11-19


In [38]:
Colombia_results.tail()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,datetime
24950,2021-06-20,Colombia,Peru,1.0,2.0,Copa América,Goiânia,Brazil,True,2021-06-20
25242,2021-09-09,Colombia,Chile,3.0,1.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-09-09
25341,2021-10-10,Colombia,Brazil,0.0,0.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-10-10
25404,2021-10-14,Colombia,Ecuador,0.0,0.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-10-14
25552,2021-11-16,Colombia,Paraguay,0.0,0.0,FIFA World Cup qualification,Barranquilla,Colombia,False,2021-11-16


In [39]:
Colombia_rankings[Colombia_rankings.index == 62906]

Unnamed: 0,rank,country_full,rank_date,datetime
62906,15.0,Colombia,2021-05-27,2021-05-27


In [40]:
# Colombia_results.loc[Colombia_results.index==24950, 'home_rank'] = Colombia_rankings[Colombia_rankings['datetime'] < Colombia_results.loc[24950, 'datetime']].tail(1)['rank'].values[0]

In [41]:
Colombia_rankings[Colombia_rankings['datetime'] < Colombia_results.loc[24950, 'datetime']].tail(1)['rank'].values

array([15.])

In [42]:
results['home_rank'] = 0

In [43]:
results['away_rank'] = 0

In [44]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True,0,0
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False,0,0
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False,0,0
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True,0,0
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False,0,0


In [45]:
results_dict = results.to_dict('records')
results_dict[0:2]

[{'date': '1993-01-01',
  'home_team': 'Ghana',
  'away_team': 'Mali',
  'home_score': 1.0,
  'away_score': 1.0,
  'tournament': 'Friendly',
  'city': 'Libreville',
  'country': 'Gabon',
  'neutral': True,
  'home_rank': 0,
  'away_rank': 0},
 {'date': '1993-01-02',
  'home_team': 'Gabon',
  'away_team': 'Burkina Faso',
  'home_score': 1.0,
  'away_score': 1.0,
  'tournament': 'Friendly',
  'city': 'Libreville',
  'country': 'Gabon',
  'neutral': False,
  'home_rank': 0,
  'away_rank': 0}]

In [46]:
rankings_dict = rankings.to_dict('records')
rankings_dict[0:2]

[{'rank': 82.0, 'country_full': 'Iraq', 'rank_date': '1992-12-31'},
 {'rank': 107.0, 'country_full': 'Mozambique', 'rank_date': '1992-12-31'}]

In [47]:
results['datetime'] = pd.to_datetime(results['date'])
rankings['datetime'] = pd.to_datetime(rankings['rank_date'])

In [48]:
results.iloc[0]

date                   1993-01-01
home_team                   Ghana
away_team                    Mali
home_score                    1.0
away_score                    1.0
tournament               Friendly
city                   Libreville
country                     Gabon
neutral                      True
home_rank                       0
away_rank                       0
datetime      1993-01-01 00:00:00
Name: 0, dtype: object

In [49]:
def home_rank():
    rankings_dates = []
    ranks = []
    for g in results:
        hometeam = results['home_team']
        matchday = results['datetime']
        for i in rankings:
            if (i['country_full'] == hometeam) and (matchday > i['datetime']):
                rankings_dates.append(i['datetime'])
            current_rank_date = max(rankings_dates)
            if (i['datetime'] == current_rank_date) and (i['country_full'] == hometeam):
                ranks.append(i['rank'])
    return ranks

In [50]:
results['home_rank'].map(home_rank())

TypeError: string indices must be integers

In [None]:
def home_rank(game):
    try:
        matchday = game['datetime']
        hometeam = game['home_team']
        hometeam_rankings = rankings[rankings['country_full'] == hometeam]
        latest_rank = max(hometeam_rankings[hometeam_rankings['datetime'] <= matchday]['datetime'])
        current_rank = rankings[rankings['datetime'] == latest_rank]['rank']
    except:
        current_rank = 0
    return current_rank

In [51]:
results['home_rank'] = results.apply(home_rank, axis=1)

TypeError: home_rank() takes 0 positional arguments but 1 was given

In [52]:
results.head()

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral,home_rank,away_rank,datetime
0,1993-01-01,Ghana,Mali,1.0,1.0,Friendly,Libreville,Gabon,True,0,0,1993-01-01
1,1993-01-02,Gabon,Burkina Faso,1.0,1.0,Friendly,Libreville,Gabon,False,0,0,1993-01-02
2,1993-01-02,Kuwait,Lebanon,2.0,0.0,Friendly,Kuwait City,Kuwait,False,0,0,1993-01-02
3,1993-01-03,Burkina Faso,Mali,1.0,0.0,Friendly,Libreville,Gabon,True,0,0,1993-01-03
4,1993-01-03,Gabon,Ghana,2.0,3.0,Friendly,Libreville,Gabon,False,0,0,1993-01-03


In [53]:
rankings.head()

Unnamed: 0,rank,country_full,rank_date,datetime
0,82.0,Iraq,1992-12-31,1992-12-31
1,107.0,Mozambique,1992-12-31,1992-12-31
2,108.0,Indonesia,1992-12-31,1992-12-31
3,109.0,Antigua and Barbuda,1992-12-31,1992-12-31
4,110.0,Jordan,1992-12-31,1992-12-31


In [54]:
results['home_rank'].value_counts()

0    25556
Name: home_rank, dtype: int64

In [55]:
results.shape

(25556, 12)