In [86]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

In [87]:
# Load the match dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./ODI_Match_info.csv')

In [88]:
# Parse 'date' into datetimes so rows can later be sorted and split chronologically.
# NOTE(review): no explicit `format=` is passed, so pandas infers the layout —
# confirm the inferred day/month order matches the CSV.
df['date'] = pd.to_datetime(df['date'])

  df['date'] = pd.to_datetime(df['date'])


In [89]:
# Discard matches without a recorded winner (rows whose 'winner' is missing).
df = df.dropna(subset=['winner'])

In [90]:
# Categorical ('category' dtype) versions of the city and venue columns,
# used for label-encoding them as integer codes.
city_categories, venue_categories = (
    df[column].astype('category') for column in ('city', 'venue')
)

In [91]:
# Module-level slot for the encoding lookup tables; create_features() rebinds it.
encoding_info = None

In [92]:
def create_features(df: pd.DataFrame):
    """Build the model-ready feature frame from raw match records.

    Adds ``win_target`` (1 when ``team1`` equals ``winner``, otherwise 0),
    integer label codes for city, venue, both teams and the toss winner,
    one-hot encodes ``toss_decision`` (first level dropped) and sorts the
    rows by ``date``.

    The input frame is copied before any column is added, so the caller's
    frame is left untouched. City and venue codes are derived from ``df``
    itself, so the result does not rely on index alignment with objects
    created in other cells.

    Side effect: rebinds the module-level ``encoding_info`` to a dict with
    the keys ``team_map``, ``city_encoding_categories`` and
    ``venue_encoding_categories`` so the same encoding can be stored next
    to the model.

    Args:
        df: Frame containing the columns ``team1``, ``team2``, ``winner``,
            ``city``, ``venue``, ``toss_winner``, ``toss_decision`` and
            ``date``.

    Returns:
        A new DataFrame with the feature columns added, sorted by ``date``.
    """
    global encoding_info

    # Work on a private copy: the assignments below must not leak into the
    # caller's frame (and must not trigger SettingWithCopyWarning on slices).
    df = df.copy()

    # If team1 is not the winner of the match, consider it a loss
    df['win_target'] = (df['team1'] == df['winner']).astype(int)

    # Categorize and transform to numerical data for model
    city_cat = df['city'].astype('category')
    venue_cat = df['venue'].astype('category')
    df['city_code'] = city_cat.cat.codes
    df['venue_code'] = venue_cat.cat.codes

    # One shared mapping from team name to code, built from every team that
    # appears as team1 or team2, so both columns use the same numbering.
    all_teams = pd.Series(pd.concat([df['team1'], df['team2']]).unique(), name='teams').to_frame()
    all_teams['team_code'] = all_teams['teams'].astype('category').cat.codes
    team_map = pd.Series(all_teams['team_code'].values, index=all_teams['teams']).to_dict()

    df['team1_code'] = df['team1'].map(team_map)
    df['team2_code'] = df['team2'].map(team_map)

    # The toss winner is encoded with the same team mapping
    df['toss_winner_code'] = df['toss_winner'].map(team_map)

    # One-hot encode toss_decision; drop_first keeps a single indicator column
    # per extra level (this also removes the original 'toss_decision' column).
    df = pd.get_dummies(df, columns=['toss_decision'], drop_first=True)

    df = df.sort_values('date')

    # Save the encoding information along with the model
    encoding_info = {
        'team_map': team_map,
        'city_encoding_categories': city_cat.cat.categories,
        'venue_encoding_categories': venue_cat.cat.categories,
    }
    return df

In [93]:
# Replace the raw frame with its feature-engineered, date-sorted version.
df = create_features(df)

In [94]:
# Preview the first rows of the engineered frame to sanity-check the new columns.
df.head()

Unnamed: 0,city,date,team1,team2,venue,toss_winner,win_by_runs,win_by_wickets,winner,win_target,city_code,venue_code,team1_code,team2_code,toss_winner_code,toss_decision_field
2378,Napier,2002-12-29,New Zealand,India,"McLean Park, Napier",India,35,0,New Zealand,1,103,151,17,10,10,True
2377,Christchurch,2003-01-01,India,New Zealand,"Jade Stadium, Christchurch",India,0,5,New Zealand,0,34,113,10,17,10,False
2376,Queenstown,2003-01-04,India,New Zealand,"Davies Park, Queenstown",New Zealand,0,7,New Zealand,0,118,65,10,17,17,True
2375,Wellington,2003-01-08,New Zealand,India,"Westpac Stadium, Wellington",New Zealand,0,2,India,0,142,286,17,10,17,False
2374,Sydney,2003-01-09,Sri Lanka,Australia,Sydney Cricket Ground,Australia,79,0,Sri Lanka,1,131,249,23,3,3,True


In [95]:
# Columns passed to the classifier as model inputs.
# NOTE(review): 'win_by_runs' and 'win_by_wickets' describe the match result, which is
# not known before a match is played; they likely leak the target — confirm they belong here.
features = [
    'win_by_runs',
    'win_by_wickets',
    'toss_winner_code',
    'venue_code',
    'city_code',
    'team1_code',
    'team2_code',
    'toss_decision_field',
]

In [96]:
# Module-level slot for the classifier; trainxgbmodel() rebinds it.
xgb_model = None

In [97]:
import xgboost as xgb

def trainxgbmodel(data, features, train_end='2022-01-01', test_start='2023-11-05'):
    """Fit an XGBoost classifier on older matches and score it on recent ones.

    Rows with ``date`` strictly before ``train_end`` form the training set and
    rows with ``date`` strictly after ``test_start`` form the test set; rows in
    between are used for neither.

    Side effect: on a successful fit the module-level ``xgb_model`` is rebound
    to the fitted classifier. It is left unchanged if fitting fails.

    Args:
        data: Frame containing a ``date`` column, a ``win_target`` column and
            every column named in ``features``.
        features: List of column names used as model inputs.
        train_end: Exclusive upper bound for training dates.
        test_start: Exclusive lower bound for test dates.

    Returns:
        Tuple ``(precision, accuracy, combined)`` where ``combined`` holds the
        ``actual`` and ``prediction`` columns joined with the test rows.

    Raises:
        ValueError: If either the training or the test split has no rows.
    """
    global xgb_model

    train = data[data['date'] < train_end]
    test = data[data['date'] > test_start]
    if train.empty or test.empty:
        raise ValueError(
            f"Empty split: {len(train)} training rows before {train_end}, "
            f"{len(test)} test rows after {test_start}"
        )

    model = xgb.XGBClassifier(n_estimators=500, max_depth=5, random_state=100)
    model.fit(train[features], train['win_target'])
    # Publish the model only once it is fitted, so the global never holds an
    # unfitted estimator after a failed training run.
    xgb_model = model

    preds = model.predict(test[features])
    combined = pd.DataFrame(dict(actual=test['win_target'], prediction=preds), index=test.index)
    combined = combined.join(test)

    precision = precision_score(test['win_target'], preds)
    accuracy = accuracy_score(test['win_target'], preds)
    return precision, accuracy, combined

In [98]:
# Train the model and collect the hold-out precision, accuracy and per-row predictions.
precision, accuracy, combined = trainxgbmodel(df, features)

In [99]:
# Report the hold-out metrics as percentages with two decimals. The built-in round()
# is used instead of the `.round()` method because it works for plain Python floats
# as well as NumPy scalars, whichever the metric functions return.
print(f"XGBoost model Accuracy: {round(accuracy * 100, 2)}")
print(f"XGboost Model Precision: {round(precision * 100, 2)}")

XGBoost model Accuracy: 70.0
XGboost Model Precision: 77.78


In [100]:
# Bundle the classifier with the encoding tables so both are stored together.
model_and_encoding_info = dict(model=xgb_model, encoding_info=encoding_info)

In [101]:
import pickle

# Serialization: write the model bundle to disk. The context manager closes the
# file handle even if pickling raises part-way through.
with open("classifier.pkl", "wb") as pickle_out:
    pickle.dump(model_and_encoding_info, pickle_out)