# Predicting finalist teams and winner of the tournament

## Setting up the dataset

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# To ignore all warnings
warnings.filterwarnings("ignore")

In [None]:
# !unzip /content/archive.zip

In [None]:
# Load the ODI match records from a hard-coded path under /content.
# NOTE(review): the first cell lists files under /kaggle/input, yet this reads
# from /content - confirm the path matches the environment the notebook runs in.
df = pd.read_csv('/content/ODI_Match_info.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2436 entries, 0 to 2435
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   city            2126 non-null   object
 1   date            2436 non-null   object
 2   team1           2436 non-null   object
 3   team2           2436 non-null   object
 4   venue           2436 non-null   object
 5   toss_winner     2436 non-null   object
 6   toss_decision   2436 non-null   object
 7   win_by_runs     2436 non-null   int64 
 8   win_by_wickets  2436 non-null   int64 
 9   winner          2316 non-null   object
dtypes: int64(2), object(8)
memory usage: 190.4+ KB


In [None]:
df.columns

Index(['city', 'date', 'team1', 'team2', 'venue', 'toss_winner',
       'toss_decision', 'win_by_runs', 'win_by_wickets', 'winner'],
      dtype='object')

In [None]:
# 'date' is read as plain strings (object dtype); parse it into datetimes
# because the model functions below split train/test rows on this column.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Keep only matches that have a recorded winner.
# .copy() makes the result an independent frame, so the column assignments done
# later in create_features do not write into a slice of the originally loaded data.
df = df.loc[df['winner'].notna()].copy()

## Creating function to create additional features

In [None]:
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a date-sorted copy of ``df`` with the engineered model columns.

    Columns added:
      * ``win_target`` - 1 when ``team1`` equals ``winner``, otherwise 0.
      * ``city_code`` / ``venue_code`` - integer category codes; pandas encodes
        a missing value as -1.
      * ``team1_code`` / ``team2_code`` / ``toss_winner_code`` - codes taken
        from one shared mapping built over every name that appears in
        ``team1`` or ``team2``. A ``toss_winner`` absent from that mapping
        becomes NaN.
      * dummy column(s) for ``toss_decision`` with the first category dropped;
        the original ``toss_decision`` column is removed.

    The input frame is left unmodified.
    """
    # Work on a copy: the caller's frame may itself be a filtered slice, and
    # assigning new columns to it would mutate (or warn about) the original.
    df = df.copy()

    # If team1 is not the winner of the match, consider it a loss
    df['win_target'] = (df['team1'] == df['winner']).astype(int)

    # Categorize and transform to numerical data for model
    df['city_code'] = df['city'].astype('category').cat.codes
    df['venue_code'] = df['venue'].astype('category').cat.codes

    # One name -> code mapping shared by team1, team2 and toss_winner, so the
    # same team always receives the same code whichever column it appears in.
    unique_teams = pd.Series(pd.concat([df['team1'], df['team2']]).unique())
    team_codes = unique_teams.astype('category').cat.codes.tolist()
    team_map = dict(zip(unique_teams, team_codes))

    for source_col in ('team1', 'team2', 'toss_winner'):
        df[f'{source_col}_code'] = df[source_col].map(team_map)

    # One-hot encode toss_decision, dropping the first category as redundant.
    df = pd.get_dummies(df, columns=['toss_decision'], drop_first=True)

    return df.sort_values('date')



In [None]:
df

Unnamed: 0,city,date,team1,team2,venue,toss_winner,toss_decision,win_by_runs,win_by_wickets,winner
0,Indore,2023-09-24,India,Australia,"Holkar Cricket Stadium, Indore",Australia,field,99,0,India
1,Nottingham,2023-09-23,England,Ireland,"Trent Bridge, Nottingham",Ireland,field,48,0,England
2,Dhaka,2023-09-23,New Zealand,Bangladesh,"Shere Bangla National Stadium, Mirpur",New Zealand,bat,86,0,New Zealand
3,Chandigarh,2023-09-22,Australia,India,"Punjab Cricket Association IS Bindra Stadium, ...",India,field,0,5,India
5,Colombo,2023-09-17,Sri Lanka,India,"R Premadasa Stadium, Colombo",Sri Lanka,bat,0,10,India
...,...,...,...,...,...,...,...,...,...,...
2431,Kolkata,2023-11-16,South Africa,Australia,Eden Gardens,South Africa,field,0,0,Australia
2432,Ahmedabad,2023-11-19,India,South Africa,Narendra Modi Stadium,South Africa,bat,0,0,India
2433,Ahmedabad,2023-11-19,India,South Africa,Narendra Modi Stadium,India,bat,0,0,India
2434,Ahmedabad,2023-11-19,India,South Africa,Narendra Modi Stadium,South Africa,field,0,0,India


In [None]:
# Build the engineered feature columns; the result replaces `df`.
# NOTE(review): create_features one-hot encodes and removes 'toss_decision', so
# re-running this cell on the already-transformed `df` is expected to fail -
# reload the data before running it again.
df = df.pipe(create_features)

In [None]:
# Columns passed to every model below as predictors.
# NOTE(review): 'win_by_runs' and 'win_by_wickets' appear to be post-match
# outcomes (the margin of victory), so using them to predict 'win_target'
# likely leaks the result into the inputs - confirm and consider dropping them.
features = ['win_by_runs','win_by_wickets','toss_winner_code','venue_code', 'city_code','team1_code','team2_code','toss_decision_field']

In [None]:
df[features].isna().sum()

win_by_runs            0
win_by_wickets         0
toss_winner_code       0
venue_code             0
city_code              0
team1_code             0
team2_code             0
toss_decision_field    0
dtype: int64

## Defining the ML models

In [None]:
def train_model(data, features, train_end='2022-01-01', test_start='2023-11-05'):
    """Fit a random forest on older matches and score it on recent ones.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``date`` column, a ``win_target`` column and every
        column named in ``features``.
    features : list of str
        Predictor column names.
    train_end : str or datetime-like, default '2022-01-01'
        Rows dated strictly before this value form the training set.
    test_start : str or datetime-like, default '2023-11-05'
        Rows dated strictly after this value form the test set. Rows that
        fall between ``train_end`` and ``test_start`` are used by neither.

    Returns
    -------
    tuple
        ``(precision, accuracy, combined)`` where ``combined`` is the test
        frame with ``actual`` and ``prediction`` columns placed in front.

    Raises
    ------
    ValueError
        If either split contains no rows.
    """
    # The splits are only read from, so no defensive copies are needed.
    train = data[data['date'] < train_end]
    test = data[data['date'] > test_start]
    if train.empty or test.empty:
        raise ValueError(
            f"Empty split: {len(train)} training rows before {train_end}, "
            f"{len(test)} test rows after {test_start}"
        )

    rf = RandomForestClassifier(n_estimators=75, min_samples_split=20, random_state=1)
    rf.fit(train[features], train['win_target'])
    preds = rf.predict(test[features])

    # Put the true label and the prediction side by side with the match details.
    combined = pd.DataFrame(
        dict(actual=test['win_target'], prediction=preds), index=test.index
    ).join(test)

    precision = precision_score(test['win_target'], preds)
    accuracy = accuracy_score(test['win_target'], preds)
    return precision, accuracy, combined



In [None]:
import xgboost as xgb

def trainxgbmodel(data, features, train_end='2022-01-01', test_start='2023-11-05'):
    """Fit an XGBoost classifier on older matches and score it on recent ones.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``date`` column, a ``win_target`` column and every
        column named in ``features``.
    features : list of str
        Predictor column names.
    train_end : str or datetime-like, default '2022-01-01'
        Rows dated strictly before this value form the training set.
    test_start : str or datetime-like, default '2023-11-05'
        Rows dated strictly after this value form the test set. Rows that
        fall between ``train_end`` and ``test_start`` are used by neither.

    Returns
    -------
    tuple
        ``(precision, accuracy, combined)`` where ``combined`` is the test
        frame with ``actual`` and ``prediction`` columns placed in front.

    Raises
    ------
    ValueError
        If either split contains no rows.
    """
    # The splits are only read from, so no defensive copies are needed.
    train = data[data['date'] < train_end]
    test = data[data['date'] > test_start]
    if train.empty or test.empty:
        raise ValueError(
            f"Empty split: {len(train)} training rows before {train_end}, "
            f"{len(test)} test rows after {test_start}"
        )

    xgb_model = xgb.XGBClassifier(n_estimators=500, max_depth=5, random_state=100)
    xgb_model.fit(train[features], train['win_target'])
    preds = xgb_model.predict(test[features])

    # Put the true label and the prediction side by side with the match details.
    combined = pd.DataFrame(
        dict(actual=test['win_target'], prediction=preds), index=test.index
    ).join(test)

    precision = precision_score(test['win_target'], preds)
    accuracy = accuracy_score(test['win_target'], preds)
    return precision, accuracy, combined

## Predicting the finalist teams and winner of the tournament with ML models

In [None]:
precision, accuracy, combined = train_model(df, features)

In [None]:
combined[['date','team1','team2','winner','toss_winner_code','toss_decision_field','win_target','prediction']]

Unnamed: 0,date,team1,team2,winner,toss_winner_code,toss_decision_field,win_target,prediction
33,2023-11-07,Afghanistan,Bangladesh,Bangladesh,0,0,0,0
14,2023-11-09,India,Pakistan,India,19,1,1,1
2387,2023-11-10,Afghanistan,India,India,0,0,0,0
2422,2023-11-11,England,Pakistan,England,7,0,1,1
2421,2023-11-11,Bangladesh,Australia,Australia,3,1,0,0
2426,2023-11-15,India,New Zealand,India,17,1,1,1
2425,2023-11-15,India,New Zealand,India,10,0,1,1
2424,2023-11-15,India,New Zealand,India,17,0,1,1
2427,2023-11-15,India,New Zealand,India,10,1,1,1
2429,2023-11-16,South Africa,Australia,Australia,22,0,0,1


In [None]:
# Report both scores as percentages rounded to two decimals. The built-in
# round() is used because it accepts plain Python floats as well as NumPy
# scalars, whereas the .round() method exists only on the latter.
print(f"Random Forest Model Accuracy: {round(accuracy * 100, 2)}")
print(f"Random Forest Model Precision: {round(precision * 100, 2)}")

Random Forest Model Accuracy: 83.33
Random Forest Model Precision: 78.95


In [None]:
precision, accuracy, combined = trainxgbmodel(df, features)

In [None]:
# Report both scores as percentages rounded to two decimals. The built-in
# round() is used because it accepts plain Python floats as well as NumPy
# scalars, whereas the .round() method exists only on the latter.
print(f"XGBoost model Accuracy: {round(accuracy * 100, 2)}")
print(f"XGboost Model Precision: {round(precision * 100, 2)}")

XGBoost model Accuracy: 75.0
XGboost Model Precision: 90.91


## Creating DNN model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Assuming 'data' is your DataFrame and 'features' contains the features for training
# Assuming 'target' contains the target variable 'win_target'

def train_dnn_classifier(data, features, train_end='2022-01-01', test_start='2023-11-05'):
    """Train a small dense network on older matches and print its test scores.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``date`` column, a ``win_target`` column and every
        column named in ``features``.
    features : list of str
        Predictor column names.
    train_end : str or datetime-like, default '2022-01-01'
        Rows dated strictly before this value form the training set.
    test_start : str or datetime-like, default '2023-11-05'
        Rows dated strictly after this value form the test set. Rows that
        fall between ``train_end`` and ``test_start`` are used by neither.

    Returns
    -------
    The fitted Keras ``Sequential`` model. Precision and accuracy on the test
    rows are printed as percentages, not returned.

    Raises
    ------
    ValueError
        If either split contains no rows.
    """
    # Split the data into training and testing sets based on the date
    train_data = data[data['date'] < train_end]
    test_data = data[data['date'] > test_start]
    if train_data.empty or test_data.empty:
        raise ValueError(
            f"Empty split: {len(train_data)} training rows before {train_end}, "
            f"{len(test_data)} test rows after {test_start}"
        )

    # Standardize the features; the scaler is fitted on the training rows only
    # and then reused unchanged on the test rows.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_data[features])
    test_features = scaler.transform(test_data[features])

    # Define the neural network model: 64 -> 32 -> 1 with a sigmoid output
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=len(features)))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dense(units=1, activation='sigmoid'))

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model, holding out 10% of the training rows for validation
    model.fit(train_features, train_data['win_target'], epochs=10, batch_size=32, validation_split=0.1)

    # Round the sigmoid outputs to 0/1 and flatten the (n, 1) array into a
    # 1-D vector of integer labels for the metric functions.
    predictions = model.predict(test_features)
    binary_predictions = np.round(predictions).astype(int).ravel()

    # Evaluate the model
    precision = precision_score(test_data['win_target'], binary_predictions)
    accuracy = accuracy_score(test_data['win_target'], binary_predictions)

    # Display evaluation metrics as percentages. Built-in round() works for
    # both NumPy scalars and plain Python floats.
    print(f"Precision: {round(precision * 100, 2)}")
    print(f"Accuracy: {round(accuracy * 100, 2)}")

    return model

# Usage
trained_dnn_classifier = train_dnn_classifier(df, features)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Precision: 87.5
Accuracy: 87.5


## Creating improved DNN model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers

def improved_dnn_classifier(data, features, train_end='2022-01-01', test_start='2023-11-05'):
    """Train a dense network with dropout and L2 regularisation; print test scores.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a ``date`` column, a ``win_target`` column and every
        column named in ``features``.
    features : list of str
        Predictor column names.
    train_end : str or datetime-like, default '2022-01-01'
        Rows dated strictly before this value form the training set.
    test_start : str or datetime-like, default '2023-11-05'
        Rows dated strictly after this value form the test set. Rows that
        fall between ``train_end`` and ``test_start`` are used by neither.

    Returns
    -------
    The fitted Keras ``Sequential`` model. Precision and accuracy on the test
    rows are printed as percentages, not returned.

    Raises
    ------
    ValueError
        If either split contains no rows.
    """
    train_data = data[data['date'] < train_end]
    test_data = data[data['date'] > test_start]
    if train_data.empty or test_data.empty:
        raise ValueError(
            f"Empty split: {len(train_data)} training rows before {train_end}, "
            f"{len(test_data)} test rows after {test_start}"
        )

    # The scaler is fitted on the training rows only and reused on the test rows.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_data[features])
    test_features = scaler.transform(test_data[features])

    # 128 -> dropout -> 64 (L2-penalised) -> 1 with a sigmoid output
    model = Sequential()
    model.add(Dense(units=128, activation='relu', input_dim=len(features)))
    model.add(Dropout(0.5))  # Adding dropout for regularization
    model.add(Dense(units=64, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Holds out 10% of the training rows for validation
    model.fit(train_features, train_data['win_target'], epochs=15, batch_size=32, validation_split=0.1)

    # Round the sigmoid outputs to 0/1 and flatten the (n, 1) array into a
    # 1-D vector of integer labels for the metric functions.
    predictions = model.predict(test_features)
    binary_predictions = np.round(predictions).astype(int).ravel()

    precision = precision_score(test_data['win_target'], binary_predictions)
    accuracy = accuracy_score(test_data['win_target'], binary_predictions)

    # Printed as percentages with two decimals, the same format that
    # train_dnn_classifier uses, so the two models' scores compare directly.
    print(f"Precision: {round(precision * 100, 2)}")
    print(f"Accuracy: {round(accuracy * 100, 2)}")

    return model


# Usage: fit the dropout/L2-regularised model defined in this cell (not the
# baseline train_dnn_classifier).
# NOTE(review): improved_dnn_classifier trains for 15 epochs, so any saved
# output that shows a 10-epoch log came from the baseline model - re-run this cell.
trained_dnn_classifier = improved_dnn_classifier(df, features)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Precision: 93.75
Accuracy: 95.83


## Predicting the finalist teams and winner of the tournament with DNN model

In [None]:
# India (team1) vs New Zealand (team2) rows of `combined`.
# NOTE(review): `combined` is last assigned from trainxgbmodel(...) earlier in the
# notebook and the DNN functions return only the fitted model, so these look like
# XGBoost predictions rather than DNN ones - confirm.
filtered_rows = combined.loc[combined['team1'].eq('India') & combined['team2'].eq('New Zealand')]

# Print the actual result next to the prediction for those rows
print(filtered_rows[["team1", "team2", "toss_winner_code", "toss_decision_field", "win_target", "prediction"]])


      team1        team2  toss_winner_code  toss_decision_field  win_target  \
2426  India  New Zealand                17                    1           1   
2425  India  New Zealand                10                    0           1   
2424  India  New Zealand                17                    0           1   
2427  India  New Zealand                10                    1           1   

      prediction  
2426           0  
2425           0  
2424           1  
2427           0  


In [None]:
# South Africa (team1) vs Australia (team2) rows of `combined`.
# NOTE(review): `combined` is last assigned from trainxgbmodel(...) earlier in the
# notebook and the DNN functions return only the fitted model, so these look like
# XGBoost predictions rather than DNN ones - confirm.
filtered_rows = combined.loc[combined['team1'].eq('South Africa') & combined['team2'].eq('Australia')]

# Print the actual result next to the prediction for those rows
print(filtered_rows[["team1", "team2", "toss_winner_code", "toss_decision_field", "win_target", "prediction"]])

             team1      team2  toss_winner_code  toss_decision_field  \
2429  South Africa  Australia                22                    0   
2428  South Africa  Australia                 3                    0   
2431  South Africa  Australia                22                    1   
2430  South Africa  Australia                 3                    1   
12    South Africa  Australia                 3                    1   
2388  South Africa  Australia                 3                    1   

      win_target  prediction  
2429           0           0  
2428           0           1  
2431           0           0  
2430           0           0  
12             1           1  
2388           1           1  


In [None]:
# India (team1) vs South Africa (team2) rows of `combined`.
# NOTE(review): `combined` is last assigned from trainxgbmodel(...) earlier in the
# notebook and the DNN functions return only the fitted model, so these look like
# XGBoost predictions rather than DNN ones - confirm.
filtered_rows = combined.loc[combined['team1'].eq('India') & combined['team2'].eq('South Africa')]

# Print the actual result next to the prediction for those rows
print(filtered_rows[["team1", "team2", "toss_winner_code", "toss_decision_field", "win_target", "prediction"]])

      team1         team2  toss_winner_code  toss_decision_field  win_target  \
2432  India  South Africa                22                    0           1   
2433  India  South Africa                10                    0           1   
2434  India  South Africa                22                    1           1   
2435  India  South Africa                10                    1           1   

      prediction  
2432           1  
2433           0  
2434           1  
2435           1  


## Observations

- After adding dropout layers and kernel regularizers, we can see that our model performed better than the previous model, with above 90% accuracy and precision.
- From our models, we can see that the teams that are most likely to be in the finals are India and South Africa.
- If India and South Africa are in the finals, then India is most likely to win the tournament.