In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

rankings = pd.read_csv('fifa_ranking.csv')
rankings = rankings.loc[:,['rank', 'country_full', 'country_abrv', 'cur_year_avg_weighted', 'rank_date', 
                           'two_year_ago_weighted', 'three_year_ago_weighted']]
rankings = rankings.replace({"IR Iran": "Iran"})
rankings['weighted_points'] =  rankings['cur_year_avg_weighted'] + rankings['two_year_ago_weighted'] + rankings['three_year_ago_weighted']
rankings['rank_date'] = pd.to_datetime(rankings['rank_date'])

matches = pd.read_csv('results.csv')
matches =  matches.replace({'Germany DR': 'Germany', 'China': 'China PR'})
matches['date'] = pd.to_datetime(matches['date'])

In [2]:
# I want to have the ranks for every day 
rankings = rankings.set_index(['rank_date']).groupby(['country_full'], group_keys=False).resample('D').first().fillna(method='ffill').reset_index()

# join the ranks
matches = matches.merge(rankings, 
                        left_on=['date', 'home_team'], 
                        right_on=['rank_date', 'country_full'])
matches = matches.merge(rankings, 
                        left_on=['date', 'away_team'], 
                        right_on=['rank_date', 'country_full'], 
                        suffixes=('_home', '_away'))

In [3]:
# feature generation
matches['rank_difference'] = matches['rank_home'] - matches['rank_away']
matches['average_rank'] = (matches['rank_home'] + matches['rank_away'])/2
matches['point_difference'] = matches['weighted_points_home'] - matches['weighted_points_away']
matches['score_difference'] = matches['home_score'] - matches['away_score']
matches['is_won'] = matches['score_difference'] > 0 # take draw as lost
#matches.loc[matches['score_difference'] > 0, 'is_won'] = 'True' 
#matches.loc[matches['score_difference'] < 0, 'is_won'] = 'False'  
#matches.loc[matches['score_difference'] == 0, 'is_won'] = 'Draw'  
matches['is_stake'] = matches['tournament'] != 'Friendly'


In [4]:
#matches = matches.set_index(['date'])
match = matches[['home_team','away_team','tournament','average_rank', 'rank_difference', 'point_difference','is_stake','is_won']]

matches1 = matches[['home_team','away_team','tournament','average_rank', 'rank_difference', 'point_difference','is_stake','home_score']]
matches2 = matches[['home_team','away_team','tournament','average_rank', 'rank_difference', 'point_difference','is_stake','away_score']]

#matches1 = matches[['home_team','home_score','away_team','away_score','tournament','average_rank', 'rank_difference', 'point_difference','is_stake']]
#matches2 = matches[['home_team','home_score','away_team','away_score','tournament','average_rank', 'rank_difference', 'point_difference','is_stake']]

In [5]:
match.head()

Unnamed: 0,home_team,away_team,tournament,average_rank,rank_difference,point_difference,is_stake,is_won
0,Bolivia,Uruguay,FIFA World Cup qualification,40.5,37.0,0.0,True,True
1,Brazil,Mexico,Friendly,11.0,-6.0,0.0,False,False
2,Ecuador,Venezuela,FIFA World Cup qualification,64.5,-59.0,0.0,True,True
3,Guinea,Sierra Leone,Friendly,75.5,-21.0,0.0,False,True
4,Paraguay,Argentina,FIFA World Cup qualification,36.0,62.0,0.0,True,False


In [6]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
labelencoder = LabelEncoder()

X, y = match.loc[:,['home_team', 'away_team','tournament', 'average_rank', 'rank_difference','point_difference', 'is_stake']], matches['is_won']


X['home_team'] = labelencoder.fit_transform(X['home_team'])
X['away_team'] = labelencoder.fit_transform(X['away_team'])
X['tournament'] = labelencoder.fit_transform(X['tournament'])
X['is_stake'] = labelencoder.fit_transform(X['is_stake'])
y = labelencoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)





# Fitting Decision Tree Classification to the Training set
param_dist = {"max_depth": [3, 100],
              "max_features": randint(3,7),
              "min_samples_leaf": randint(1, 9),
              "criterion": ["gini", "entropy"]}




model = DecisionTreeClassifier()

model.fit(X_train,y_train)
pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred)

#accuracy
from sklearn.metrics import confusion_matrix,accuracy_score
accuracy = accuracy_score(y_test,pred)
print("confusion matrics=",cm)
print("  ")
print("accuracy=",accuracy*100)

param_grid = {'max_depth': np.arange(3, 10)}

tree_cv = GridSearchCV(model, param_grid)
tree_cv.fit(X_train,y_train)

tree_preds = tree_cv.predict_proba(X_test)[:, 1]
print("Best score is {}".format(tree_cv.best_score_))
print("Tuned GRID SEARCH Tree Parameters: {}".format(tree_cv.best_params_))





confusion matrics= [[1150  715]
 [ 707  978]]
  
accuracy= 59.943661971830984
Best score is 0.6770422535211267
Tuned GRID SEARCH Tree Parameters: {'max_depth': 4}


In [7]:
X1, y1 = matches1.loc[:,['home_team', 'away_team','tournament', 'average_rank', 'rank_difference','point_difference', 'is_stake']], matches['home_score']

X1['home_team'] = labelencoder.fit_transform(X1['home_team'])
X1['away_team'] = labelencoder.fit_transform(X1['away_team'])
X1['tournament'] = labelencoder.fit_transform(X1['tournament'])
X1['is_stake'] = labelencoder.fit_transform(X1['is_stake'])

y1 = labelencoder.fit_transform(y1)

X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=0.2, random_state=42)

model1 = DecisionTreeClassifier()

model1.fit(X_train1,y_train1)
pred1 = model1.predict(X_test1)
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_test1, pred1)

#accuracy
from sklearn.metrics import confusion_matrix,accuracy_score
accuracy1 = accuracy_score(y_test1,pred1)
print("  ")
print("accuracy=",accuracy1*100)

tree_cv1 = GridSearchCV(model1, param_grid)
tree_cv1.fit(X_train1,y_train1)

tree_preds1 = tree_cv1.predict_proba(X_test1)[:, 1]
print("Best score is {}".format(tree_cv1.best_score_))
print("Tuned GRID SEARCH Tree Parameters: {}".format(tree_cv1.best_params_))


  
accuracy= 27.267605633802816




Best score is 0.331830985915493
Tuned GRID SEARCH Tree Parameters: {'max_depth': 4}


In [8]:


X2, y2 = matches2.loc[:,['home_team', 'away_team','tournament', 'average_rank', 'rank_difference','point_difference', 'is_stake']], matches['away_score']



X2['home_team'] = labelencoder.fit_transform(X2['home_team'])
X2['away_team'] = labelencoder.fit_transform(X2['away_team'])
X2['tournament'] = labelencoder.fit_transform(X2['tournament'])
X2['is_stake'] = labelencoder.fit_transform(X2['is_stake'])
y2 = labelencoder.fit_transform(y2)



X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.2, random_state=42)
    


model2 = DecisionTreeClassifier()

model2.fit(X_train2,y_train2)
pred2 = model2.predict(X_test2)
from sklearn.metrics import confusion_matrix
cm2 = confusion_matrix(y_test2, pred2)

#accuracy
from sklearn.metrics import confusion_matrix,accuracy_score
accuracy2 = accuracy_score(y_test2,pred2)
print("  ")
print("accuracy=",accuracy2*100)

tree_cv2 = GridSearchCV(model2, param_grid)
tree_cv2.fit(X_train2,y_train2)
tree_preds2 = tree_cv2.predict_proba(X_test2)[:, 1]
print("Best score is {}".format(tree_cv2.best_score_))
print("Tuned GRID SEARCH Tree Parameters: {}".format(tree_cv2.best_params_))

  
accuracy= 32.0




Best score is 0.4235915492957746
Tuned GRID SEARCH Tree Parameters: {'max_depth': 3}


# Logisitic Regression

In [9]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

X3, y3 = matches.loc[:,['average_rank', 'rank_difference', 'point_difference', 'is_stake']], matches['is_won']

#labelencoder.fit(X3)


X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X3, y3, test_size=0.2, random_state=42)

logreg = linear_model.LogisticRegression(C=1e-5)
features = PolynomialFeatures(degree=2)
model3 = Pipeline([
    ('polynomial_features', features),
    ('logistic_regression', logreg)
])
model3 = model3.fit(X_train3, y_train3)

pred3 = model3.predict(X_test3)
home_win = model3.predict_proba([[74.0,10.0,0.00,True]])

print(X_test3.head())
print(home_win)
from sklearn.metrics import confusion_matrix
cm3 = confusion_matrix(y_test3, pred3)

#accuracy
from sklearn.metrics import confusion_matrix,accuracy_score
accuracy3 = accuracy_score(y_test3,pred3)
print("confusion matrics=",cm3)
print("  ")
print("accuracy=",accuracy3*100)


       average_rank  rank_difference  point_difference  is_stake
476            74.0             10.0              0.00      True
6959          165.0             32.0              0.00      True
13005         123.0             18.0           -138.76      True
8466           24.5            -45.0              0.00     False
15568          75.0             28.0           -172.41      True
[[0.5976807 0.4023193]]
confusion matrics= [[1277  588]
 [ 570 1115]]
  
accuracy= 67.38028169014085


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Predicting

In [10]:
# let's define the rankings at the time of the World Cup
last_rank = rankings.loc[(rankings['rank_date'] == rankings['rank_date'].max()) & 
                                    rankings['country_full']]
print(last_rank)
last_rank.to_csv('file1.csv') 
print(rankings.columns)

         rank_date   rank    country_full country_abrv  cur_year_avg_weighted  \
5622    2018-06-07  145.0     Afghanistan          AFG                 112.98   
14692   2018-06-07   58.0         Albania          ALB                 233.86   
23762   2018-06-07   66.0         Algeria          ALG                 163.41   
30932   2018-06-07  192.0  American Samoa          ASA                   0.00   
38802   2018-06-07  130.0         Andorra          AND                 189.89   
...            ...    ...             ...          ...                    ...   
1788692 2018-06-07  102.0         Vietnam          VIE                 222.70   
1797762 2018-06-07   18.0           Wales          WAL                 335.47   
1806832 2018-06-07  133.0           Yemen          YEM                 113.42   
1821587 2018-06-07   76.0          Zambia          ZAM                 236.14   
1830657 2018-06-07  118.0        Zimbabwe          ZIM                 133.45   

         two_year_ago_weigh

In [46]:
x="Afghanistan"
y="France"
home = last_rank.loc[last_rank['country_full'] == x]
away = last_rank.loc[last_rank['country_full'] == y]

frames = [home, away]
result = pd.concat(frames)

print(result)
row = pd.DataFrame(np.array([[np.nan, np.nan, np.nan, True]]), columns=X_test3.columns)
home_rank = result['rank'].iloc[0]
home_points = result['weighted_points'].iloc[0]
opp_rank = result['rank'].iloc[1]

opp_points = result['weighted_points'].iloc[1]
row['average_rank'] = (home_rank + opp_rank) / 2
row['rank_difference'] = home_rank - opp_rank
row['point_difference'] = home_points - opp_points

        rank_date   rank country_full country_abrv  cur_year_avg_weighted  \
5622   2018-06-07  145.0  Afghanistan          AFG                 112.98   
619586 2018-06-07    7.0       France          FRA                 520.12   

        two_year_ago_weighted  three_year_ago_weighted  weighted_points  
5622                    31.69                     6.80           151.47  
619586                 118.09                   131.54           769.75  


In [47]:
home_win = model3.predict_proba(row)
print(row)
print(home_win[0][0])

   average_rank  rank_difference  point_difference  is_stake
0          76.0            138.0           -618.28       1.0
0.9451444141329193


In [48]:
if home_win[0][0]>=0.45 and home_win[0][0]<=0.55:
    print("draw")
if home_win[0][0]>0.55:
    print(result['country_full'].iloc[1], "win with a probability of",home_win[0][0])
if home_win[0][1]>0.55:
    print(result['country_full'].iloc[0], " win with a probability of",home_win[0][1])

France win with a probability of 0.9451444141329193


In [49]:
import joblib

In [50]:
joblib.dump(model3,'model1.pkl')

['model1.pkl']

In [51]:
X_test3.columns

Index(['average_rank', 'rank_difference', 'point_difference', 'is_stake'], dtype='object')

In [52]:
rankings.columns

Index(['rank_date', 'rank', 'country_full', 'country_abrv',
       'cur_year_avg_weighted', 'two_year_ago_weighted',
       'three_year_ago_weighted', 'weighted_points'],
      dtype='object')

In [53]:
y in last_rank.country_full

False

In [54]:
x

'Afghanistan'

In [55]:
last_rank.country_full[5622]

'Afghanistan'

In [56]:
x == last_rank.country_full

5622        True
14692      False
23762      False
30932      False
38802      False
           ...  
1788692    False
1797762    False
1806832    False
1821587    False
1830657    False
Name: country_full, Length: 211, dtype: bool

In [57]:
y in last_rank.values

True

In [58]:
y = "afghanistan"

In [59]:
y == last_rank.country_full[5622]

False

In [60]:
z= x.upper()

In [64]:
z in last_rank['country_full'].values


False

In [63]:
last_rank['country_full'].str.upper()

5622          AFGHANISTAN
14692             ALBANIA
23762             ALGERIA
30932      AMERICAN SAMOA
38802             ANDORRA
                ...      
1788692           VIETNAM
1797762             WALES
1806832             YEMEN
1821587            ZAMBIA
1830657          ZIMBABWE
Name: country_full, Length: 211, dtype: object

In [73]:
z = 'Afghanistan'
z = z.upper()

In [74]:
z

'AFGHANISTAN'

In [69]:
z in last_rank.values

False

In [79]:
z in last_rank.country_full.str.upper().values

True