<a href="https://colab.research.google.com/github/Zibraan/My_ML_DL_Codes/blob/main/Zibraan_T20_predicition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset
file_path = 'ICC Mens T20 Worldcup.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Match No.,Date,Venue,1st Team,2nd Team,Stage,Toss Winning,Toss Decision,First Innings Score,Fall of wickets First Innings,...,Winners,Method,Won by,Winning Margin,Top Scorer,Highest Score,Best Bowler,Best Bowler Figure(Wickets Taken),Best Bowler Figure(Runs Recieved),Player Of The Match
0,1st,"Saturday,June 1,2024","Grand Prairie Stadium, Dallas",Canada,United State Of America,Group A,United State Of America,Fielding,194,5,...,United State Of America,Normal Match,Wickets,7,Aaron Jones,94,Dilon Heyliger,1,19,Aaron Jones
1,2nd,"Sunday,June 2,2024","Providence Stadium, Guyana",Papua New Guinea,Afghanistan,Group C,Afghanistan,Fielding,95,10,...,Afghanistan,Normal Match,Wickets,7,Gulbadin Naib,49,Fazalhaq Farooqi,3,16,Fazalhaq Farooqi
2,3rd,"Sunday,June 2,2024","Kensington Oval, Bridgetown, Barbados",Oman,Namibia,Group B,Namibia,Fielding,109,10,...,Namibia,Normal Match,Wickets,4,Jan Frylinck,45,Mehran Khan,3,7,David Wiese
3,4th,"Monday,June 3,2024","Nassau County International Cricket Stadium, N...",Sri Lanka,South Africa,Group D,Sri Lanka,Batting,77,10,...,South Africa,Normal Match,Wickets,6,Quinton de Kock,20,Anrich Nortje,4,7,Anrich Nortje
4,5th,"Monday,June 3,2024","Providence Stadium, Guyana",Afghanistan,Uganda,Group C,Uganda,Fielding,183,5,...,Afghanistan,Normal Match,Runs,125,Rahmanullah Gurbaz,76,Fazalhaq Farooqi,5,9,Fazalhaq Farooqi


Data Preprocessing : Lets begin by inferring if null values are present in the dataset and inquiring their datatypes

In [None]:
# Check for missing values
df.isnull().sum()

Match No.                            0
Date                                 0
Venue                                0
1st Team                             0
2nd Team                             0
Stage                                0
Toss Winning                         0
Toss Decision                        0
First Innings Score                  0
Fall of wickets First Innings        0
Second Innings Score                 0
Fall of wickets Second Innings       0
Winners                              0
Method                               0
Won by                               0
Winning Margin                       0
Top Scorer                           0
Highest Score                        0
Best Bowler                          0
Best Bowler Figure(Wickets Taken)    0
Best Bowler Figure(Runs Recieved)    0
Player Of The Match                  0
dtype: int64

In [None]:
# Check data types
df.dtypes

Match No.                            object
Date                                 object
Venue                                object
1st Team                             object
2nd Team                             object
Stage                                object
Toss Winning                         object
Toss Decision                        object
First Innings Score                  object
Fall of wickets First Innings        object
Second Innings Score                 object
Fall of wickets Second Innings       object
Winners                              object
Method                               object
Won by                               object
Winning Margin                       object
Top Scorer                           object
Highest Score                        object
Best Bowler                          object
Best Bowler Figure(Wickets Taken)    object
Best Bowler Figure(Runs Recieved)    object
Player Of The Match                  object
dtype: object

In [None]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df.dtypes

Match No.                                    object
Date                                 datetime64[ns]
Venue                                        object
1st Team                                     object
2nd Team                                     object
Stage                                        object
Toss Winning                                 object
Toss Decision                                object
First Innings Score                          object
Fall of wickets First Innings                object
Second Innings Score                         object
Fall of wickets Second Innings               object
Winners                                      object
Method                                       object
Won by                                       object
Winning Margin                               object
Top Scorer                                   object
Highest Score                                object
Best Bowler                                  object
Best Bowler 

Let's dive into the statististical side of things in order to understand the data better

In [None]:
# Basic statistics
df.describe(include = 'all' )

Unnamed: 0,Match No.,Date,Venue,1st Team,2nd Team,Stage,Toss Winning,Toss Decision,First Innings Score,Fall of wickets First Innings,...,Winners,Method,Won by,Winning Margin,Top Scorer,Highest Score,Best Bowler,Best Bowler Figure(Wickets Taken),Best Bowler Figure(Runs Recieved),Player Of The Match
count,55,20,55,55,55,55,55,55,55.0,55.0,...,55,55,55,55.0,55,55,55,55.0,55.0,55
unique,55,,9,19,18,9,19,3,41.0,9.0,...,17,3,3,29.0,39,38,41,8.0,28.0,44
top,1st,,"Kensington Oval, Bridgetown, Barbados",India,England,Group A,England,Fielding,106.0,10.0,...,South Africa,Normal Match,Runs,7.0,Rain,Rain,Rain,3.0,16.0,Rain
freq,1,,9,6,7,11,6,42,4.0,16.0,...,8,48,26,9.0,4,4,4,25.0,6.0,4
mean,,2024-06-05 10:48:00,,,,,,,,,...,,,,,,,,,,
min,,2024-06-01 00:00:00,,,,,,,,,...,,,,,,,,,,
25%,,2024-06-03 18:00:00,,,,,,,,,...,,,,,,,,,,
50%,,2024-06-05 12:00:00,,,,,,,,,...,,,,,,,,,,
75%,,2024-06-07 06:00:00,,,,,,,,,...,,,,,,,,,,
max,,2024-06-09 00:00:00,,,,,,,,,...,,,,,,,,,,


Correlation Analysis¶
Let's see if there are any interesting correlations in the numeric data.


In [None]:
# Select only numeric columns
numeric_df = df.select_dtypes(include=[np.number])

# Check if numeric_df is empty and handle it
if numeric_df.empty:
    print("Warning: No numeric columns found in the DataFrame. Cannot create a heatmap.")
else:
    # Compute the correlation matrix
    corr_matrix = numeric_df.corr()

    # Plot the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Heatmap')
    plt.show()



Predicting Match Winners¶
Let's see if we can predict the match winners based on the available data.

In [None]:
# Prepare the data for prediction
df['Winner'] = df.apply(lambda row: 1 if row['Winners'] == row['1st Team'] else 0, axis=1)
features = ['Toss Winning', 'Toss Decision', 'First Innings Score', 'Second Innings Score']
X = pd.get_dummies(df[features], drop_first=True)
y = df['Winner']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

accuracy, conf_matrix, class_report

(0.35294117647058826,
 array([[4, 8],
        [3, 2]]),
 '              precision    recall  f1-score   support\n\n           0       0.57      0.33      0.42        12\n           1       0.20      0.40      0.27         5\n\n    accuracy                           0.35        17\n   macro avg       0.39      0.37      0.34        17\nweighted avg       0.46      0.35      0.38        17\n')