In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
plt.rcParams['figure.figsize'] = (30,15)
import seaborn as sns
import patsy
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer

# Data cleaning

Data source: https://www.kaggle.com/martj42/international-football-results-from-1872-to-2017/data?select=results.csv

In [2]:
bigData = pd.read_csv('results.csv')

In [3]:
bigData.head(10)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Glasgow,Scotland,False
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,London,England,False
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Glasgow,Scotland,False
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,London,England,False
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Glasgow,Scotland,False
5,1876-03-25,Scotland,Wales,4.0,0.0,Friendly,Glasgow,Scotland,False
6,1877-03-03,England,Scotland,1.0,3.0,Friendly,London,England,False
7,1877-03-05,Wales,Scotland,0.0,2.0,Friendly,Wrexham,Wales,False
8,1878-03-02,Scotland,England,7.0,2.0,Friendly,Glasgow,Scotland,False
9,1878-03-23,Scotland,Wales,9.0,0.0,Friendly,Glasgow,Scotland,False


In [4]:
bigData.drop(['city', 'country', 'neutral'], axis=1, inplace=True)

In [5]:
bigData.head(10)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament
0,1872-11-30,Scotland,England,0.0,0.0,Friendly
1,1873-03-08,England,Scotland,4.0,2.0,Friendly
2,1874-03-07,Scotland,England,2.0,1.0,Friendly
3,1875-03-06,England,Scotland,2.0,2.0,Friendly
4,1876-03-04,Scotland,England,3.0,0.0,Friendly
5,1876-03-25,Scotland,Wales,4.0,0.0,Friendly
6,1877-03-03,England,Scotland,1.0,3.0,Friendly
7,1877-03-05,Wales,Scotland,0.0,2.0,Friendly
8,1878-03-02,Scotland,England,7.0,2.0,Friendly
9,1878-03-23,Scotland,Wales,9.0,0.0,Friendly


Determine who is the winner by comparing goals scored home vs away

In [6]:
winner = []

for i in range(len(bigData['home_team'])):
    if bigData['home_score'][i] > bigData['away_score'][i]:
        winner.append(bigData['home_team'][i])
    elif bigData['home_score'][i] < bigData['away_score'][i]:
        winner.append(bigData['away_team'][i])
    else:
        winner.append('Draw')
        
bigData['winning_team'] = winner

In [7]:
#pos for home win, neg for away win
bigData['goal_diff'] = bigData['home_score'] - bigData['away_score']

In [8]:
modified = bigData
modified.head(10)

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,winning_team,goal_diff
0,1872-11-30,Scotland,England,0.0,0.0,Friendly,Draw,0.0
1,1873-03-08,England,Scotland,4.0,2.0,Friendly,England,2.0
2,1874-03-07,Scotland,England,2.0,1.0,Friendly,Scotland,1.0
3,1875-03-06,England,Scotland,2.0,2.0,Friendly,Draw,0.0
4,1876-03-04,Scotland,England,3.0,0.0,Friendly,Scotland,3.0
5,1876-03-25,Scotland,Wales,4.0,0.0,Friendly,Scotland,4.0
6,1877-03-03,England,Scotland,1.0,3.0,Friendly,Scotland,-2.0
7,1877-03-05,Wales,Scotland,0.0,2.0,Friendly,Scotland,-2.0
8,1878-03-02,Scotland,England,7.0,2.0,Friendly,Scotland,5.0
9,1878-03-23,Scotland,Wales,9.0,0.0,Friendly,Scotland,9.0


In [9]:
modified.drop(['date', 'home_score', 'away_score', 'tournament', 'winning_team'], inplace=True, axis=1)

In [10]:
modified.head(10)

Unnamed: 0,home_team,away_team,goal_diff
0,Scotland,England,0.0
1,England,Scotland,2.0
2,Scotland,England,1.0
3,England,Scotland,0.0
4,Scotland,England,3.0
5,Scotland,Wales,4.0
6,England,Scotland,-2.0
7,Wales,Scotland,-2.0
8,Scotland,England,5.0
9,Scotland,Wales,9.0


Format how to feed the model. We will feed team matches as input to see effect on outcomes of matches
<br>Matches will be coded as 2 = home win, 1 = draw, 0 = away win

In [11]:
output = []

for i in range(len(modified['goal_diff'])):
    if modified['goal_diff'][i] > 0:
        output.append(2)
    elif modified['goal_diff'][i] < 0:
        output.append(0)
    else:
        output.append(1)
        
modified['winning_team'] = output
#2 for home win, 1 for tie, 0 for away win

In [12]:
modified.drop(['goal_diff'], axis=1, inplace=True)

In [13]:
modified.head(10)

Unnamed: 0,home_team,away_team,winning_team
0,Scotland,England,1
1,England,Scotland,2
2,Scotland,England,2
3,England,Scotland,1
4,Scotland,England,2
5,Scotland,Wales,2
6,England,Scotland,0
7,Wales,Scotland,0
8,Scotland,England,2
9,Scotland,Wales,2


Use pd.get_dummies() as a workaround for one-hot encoding, and to make the team parinings a continuous binary input

In [14]:
final = pd.get_dummies(modified, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

X = final.drop(['winning_team'], axis=1)
Y = final['winning_team']
Y = Y.astype(int)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [15]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
score_train = logreg.score(X_train, Y_train)
score_test = logreg.score(X_test, Y_test)

print("Training acc: ", '%.3f'%(score_train))
print("Test acc: ", '%.3f'%(score_test))

Training acc:  0.587
Test acc:  0.562


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Predicting with 2018 World Cup Matches

Data sources:
    
ranking:https://us.soccerway.com/teams/rankings/fifa/?ICID=TN_03_05_01    March 2018
    
fixtures: https://fixturedownload.com/results/fifa-world-cup-2018

In [16]:
# Loading new datasets
ranking = pd.read_csv('fifa_rankings.csv') #as of April 2018
fixtures = pd.read_csv('fixtures.csv') #FIFA World Cup 2018 match list (bracket)

# List for storing the group stage games
pred_set = []

In [17]:
fixtures.head(10)

Unnamed: 0,Round Number,Date,Location,Home Team,Away Team,Group,Result
0,1,14/06/2018 18:00,"Luzhniki Stadium, Moscow",Russia,Saudi Arabia,Group A,
1,1,15/06/2018 15:00,Ekaterinburg Stadium,Egypt,Uruguay,Group A,
2,1,15/06/2018 18:00,Saint Petersburg Stadium,Morocco,Iran,Group B,
3,1,15/06/2018 21:00,"Fisht Stadium, Sochi",Portugal,Spain,Group B,
4,1,16/06/2018 13:00,Kazan Arena,France,Australia,Group C,
5,1,16/06/2018 16:00,"Otkrytiye Arena, Moscow",Argentina,Iceland,Group D,
6,1,16/06/2018 19:00,Saransk Stadium,Peru,Denmark,Group C,
7,1,16/06/2018 22:00,Kaliningrad Stadium,Croatia,Nigeria,Group D,
8,1,17/06/2018 15:00,Samara Stadium,Costa Rica,Serbia,Group E,
9,1,17/06/2018 18:00,"Luzhniki Stadium, Moscow",Germany,Mexico,Group F,


In [18]:
# Add each team's rank
fixtures.insert(1, 'first_position', fixtures['Home Team'].map(ranking.set_index('Team')['Position']))
fixtures.insert(2, 'second_position', fixtures['Away Team'].map(ranking.set_index('Team')['Position']))

# Take the 48 round 1 world stage games
fixtures = fixtures.iloc[:48, :]
fixtures.tail()

Unnamed: 0,Round Number,first_position,second_position,Date,Location,Home Team,Away Team,Group,Result
43,3,6.0,25.0,27/06/2018 21:00,Nizhny Novgorod Stadium,Switzerland,Costa Rica,Group E,
44,3,60.0,10.0,28/06/2018 17:00,Volgograd Stadium,Japan,Poland,Group H,
45,3,28.0,16.0,28/06/2018 17:00,Samara Stadium,Senegal,Colombia,Group H,
46,3,55.0,14.0,28/06/2018 21:00,Saransk Stadium,Panama,Tunisia,Group G,
47,3,13.0,3.0,28/06/2018 21:00,Kaliningrad Stadium,England,Belgium,Group G,


In [19]:
# Home team set as the one with the higher world position
for index, row in fixtures.iterrows():
    if row['first_position'] < row['second_position']:
        pred_set.append({'home_team': row['Home Team'], 'away_team': row['Away Team'], 'winning_team': None})
    else:
        pred_set.append({'home_team': row['Away Team'], 'away_team': row['Home Team'], 'winning_team': None})
        
pred_set = pd.DataFrame(pred_set)
backup_pred_set = pred_set

pred_set.head()

Unnamed: 0,home_team,away_team,winning_team
0,Russia,Saudi Arabia,
1,Uruguay,Egypt,
2,Iran,Morocco,
3,Portugal,Spain,
4,France,Australia,


In [20]:
# Get dummy variables and drop winning_team column
pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

# Fit group stage one-hot to previously trained all teams one-hot
missing_cols = set(final.columns) - set(pred_set.columns)
for c in missing_cols:
    pred_set[c] = 0
pred_set = pred_set[final.columns]

pred_set = pred_set.drop(['winning_team'], axis=1)

pred_set.head()

  pred_set[c] = 0


Unnamed: 0,home_team_Abkhazia,home_team_Afghanistan,home_team_Albania,home_team_Alderney,home_team_Algeria,home_team_American Samoa,home_team_Andalusia,home_team_Andorra,home_team_Angola,home_team_Anguilla,...,away_team_Western Sahara,away_team_Yemen,away_team_Yemen DPR,away_team_Ynys Môn,away_team_Yorkshire,away_team_Yugoslavia,away_team_Zambia,away_team_Zanzibar,away_team_Zimbabwe,away_team_Åland Islands
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
#group stage matches
predictions = logreg.predict(pred_set)

for i in range(fixtures.shape[0]):
    away_team = backup_pred_set.iloc[i,1]
    home_team = backup_pred_set.iloc[i, 0]
    outcome = predictions[i]
    
    print("%s vs %s" % (away_team, home_team))
    if outcome == 2:
        print("Winner: %s" % (away_team))
    elif outcome == 1:
        print("Draw")
    elif outcome == 0:
        print("Winner: %s" % (home_team))
        
    print("Probability of %s winning: %.3f" % (away_team, logreg.predict_proba(pred_set)[i][2]))
    print('Probability of Draw: %.3f' % (logreg.predict_proba(pred_set)[i][1]))
    print("Probability of %s winning: %.3f" % (home_team, logreg.predict_proba(pred_set)[i][0]))
    print("")

Saudi Arabia vs Russia
Winner: Saudi Arabia
Probability of Saudi Arabia winning: 0.734
Probability of Draw: 0.175
Probability of Russia winning: 0.091

Egypt vs Uruguay
Winner: Egypt
Probability of Egypt winning: 0.650
Probability of Draw: 0.275
Probability of Uruguay winning: 0.074

Morocco vs Iran
Winner: Morocco
Probability of Morocco winning: 0.392
Probability of Draw: 0.372
Probability of Iran winning: 0.236

Spain vs Portugal
Draw
Probability of Spain winning: 0.292
Probability of Draw: 0.381
Probability of Portugal winning: 0.327

Australia vs France
Winner: Australia
Probability of Australia winning: 0.638
Probability of Draw: 0.205
Probability of France winning: 0.157

Iceland vs Argentina
Winner: Iceland
Probability of Iceland winning: 0.844
Probability of Draw: 0.125
Probability of Argentina winning: 0.031

Denmark vs Peru
Winner: Peru
Probability of Denmark winning: 0.368
Probability of Draw: 0.204
Probability of Peru winning: 0.428

Nigeria vs Croatia
Winner: Nigeria
Proba

Setting the actual group 16 matches and predicting them

In [22]:
# List of tuples before 
group_16 = [('Uruguay', 'Portugal'),
            ('France', 'Croatia'),
            ('Brazil', 'Mexico'),
            ('England', 'Colombia'),
            ('Spain', 'Russia'),
            ('Argentina', 'Peru'),
            ('Germany', 'Switzerland'),
            ('Poland', 'Belgium')]

In [23]:
def aggregate_predict(matches, ranking, final, logreg):

    # Initialization of auxiliary list for data cleaning
    positions = []

    # Loop to retrieve each team's position according to FIFA ranking
    for match in matches:
        positions.append(ranking.loc[ranking['Team'] == match[0],'Position'].iloc[0])
        positions.append(ranking.loc[ranking['Team'] == match[1],'Position'].iloc[0])
    
    # Creating the DataFrame for prediction
    pred_set = []

    # Initializing iterators for while loop
    i = 0
    j = 0

    # 'i' will be the iterator for the 'positions' list, and 'j' for the list of matches (list of tuples)
    while i < len(positions):
        dict1 = {}

        # If position of first team is better, he will be the 'home' team, and vice-versa
        if positions[i] < positions[i + 1]:
            dict1.update({'home_team': matches[j][0], 'away_team': matches[j][1]})
        else:
            dict1.update({'home_team': matches[j][1], 'away_team': matches[j][0]})

        # Append updated dictionary to the list, that will later be converted into a DataFrame
        pred_set.append(dict1)
        i += 2
        j += 1

    # Convert list into DataFrame
    pred_set = pd.DataFrame(pred_set)
    backup_pred_set = pred_set

    # Get dummy variables and drop winning_team column
    pred_set = pd.get_dummies(pred_set, prefix=['home_team', 'away_team'], columns=['home_team', 'away_team'])

    # Add missing columns compared to the model's training dataset
    missing_cols2 = set(final.columns) - set(pred_set.columns)
    for c in missing_cols2:
        pred_set[c] = 0
    pred_set = pred_set[final.columns]

    # Remove winning team column
    pred_set = pred_set.drop(['winning_team'], axis=1)

    predictions = logreg.predict(pred_set)
    for i in range(len(pred_set)):
        away_team = backup_pred_set.iloc[i,1]
        home_team = backup_pred_set.iloc[i, 0]
        outcome = predictions[i]

        print("%s vs %s" % (away_team, home_team))
        if outcome == 2:
            print("Winner: %s" % (away_team))
        elif outcome == 1:
            print("Draw")
        elif outcome == 0:
            print("Winner: %s" % (home_team))

        print("Probability of %s winning: %.3f" % (away_team, logreg.predict_proba(pred_set)[i][2]))
        print('Probability of Draw: %.3f' % (logreg.predict_proba(pred_set)[i][1]))
        print("Probability of %s winning: %.3f" % (home_team, logreg.predict_proba(pred_set)[i][0]))
        print("")

In [24]:
aggregate_predict(group_16, ranking, final, logreg)

Uruguay vs Portugal
Winner: Uruguay
Probability of Uruguay winning: 0.456
Probability of Draw: 0.265
Probability of Portugal winning: 0.278

Croatia vs France
Winner: Croatia
Probability of Croatia winning: 0.463
Probability of Draw: 0.271
Probability of France winning: 0.265

Mexico vs Brazil
Winner: Mexico
Probability of Mexico winning: 0.697
Probability of Draw: 0.196
Probability of Brazil winning: 0.107

Colombia vs England
Winner: Colombia
Probability of Colombia winning: 0.575
Probability of Draw: 0.281
Probability of England winning: 0.144

Russia vs Spain
Winner: Russia
Probability of Russia winning: 0.544
Probability of Draw: 0.266
Probability of Spain winning: 0.190

Peru vs Argentina
Winner: Peru
Probability of Peru winning: 0.711
Probability of Draw: 0.223
Probability of Argentina winning: 0.066

Switzerland vs Germany
Winner: Switzerland
Probability of Switzerland winning: 0.708
Probability of Draw: 0.165
Probability of Germany winning: 0.127

Poland vs Belgium
Winner: Pol

  pred_set[c] = 0
