## Loading the standard libraries

In [1]:
import pandas as pd
import numpy as np

## Loading the data

In [2]:
# Load the Sachin ODI dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv('Sachin_ODI.csv')
# Preview the first five rows to check the columns and value formats.
data.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
0,13,0,30,15,3,0,86.66,1,New Zealand,Napier,1995-02-16,New Zealand,False,False
1,37,0,75,51,3,1,72.54,2,South Africa,Hamilton,1995-02-18,South Africa,False,False
2,47,0,65,40,7,0,117.5,2,Australia,Dunedin,1995-02-22,India,True,False
3,48,0,37,30,9,1,160.0,2,Bangladesh,Sharjah,1995-04-05,India,True,False
4,4,0,13,9,1,0,44.44,2,Pakistan,Sharjah,1995-04-07,Pakistan,False,False


In [3]:
# (number of rows, number of columns) of the loaded frame
data.shape

(360, 14)

## Observations:

- We are looking at 360 ODI matches played by Sachin
- sr means strike rate
- bf means balls faced

In [4]:
# Drop the 'Ground' and 'Date' columns from the working frame.
# `errors='ignore'` keeps this cell re-runnable: because it reassigns `data`,
# a second execution would otherwise raise KeyError for the already-removed columns.
data = data.drop(columns=['Ground', 'Date'], errors='ignore')
data.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Winner,Won,century
0,13,0,30,15,3,0,86.66,1,New Zealand,New Zealand,False,False
1,37,0,75,51,3,1,72.54,2,South Africa,South Africa,False,False
2,47,0,65,40,7,0,117.5,2,Australia,India,True,False
3,48,0,37,30,9,1,160.0,2,Bangladesh,India,True,False
4,4,0,13,9,1,0,44.44,2,Pakistan,Pakistan,False,False


1. The dataset contains 360 matches, so the sample space has 360 outcomes (one per match)

## How to get the Sample Space of the data:

In [5]:
# Row count of the frame = number of matches = size of the sample space
data.shape[0]

360

In [6]:
# Equivalent to data.shape[0]: len() of a DataFrame is its row count
len(data)

360

## Event A : India winning the match

In [7]:
## Counting the outcomes for event A

# value_counts() tallies the matches per value of `Won`;
# the True count is the number of matches India won.
data['Won'].value_counts()

True     184
False    176
Name: Won, dtype: int64

1. Event A (India winning the match) contains 184 of the 360 outcomes in the sample space

## Probability of Event A :

In [8]:
## Basic Approach or manual approach

# 184 = matches with Won == True (from the value_counts output above), 360 = total matches.
184 / 360

0.5111111111111111

In [9]:
## Pandas Approach

# `Won` already holds booleans, so the column itself serves as the row mask.
df_won = data.loc[data['Won']]
df_won.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Winner,Won,century
2,47,0,65,40,7,0,117.5,2,Australia,India,True,False
3,48,0,37,30,9,1,160.0,2,Bangladesh,India,True,False
5,112,1,137,107,15,1,104.67,2,Sri Lanka,India,True,True
6,41,0,51,41,5,0,100.0,2,Sri Lanka,India,True,False
8,39,0,79,51,4,0,76.47,2,New Zealand,India,True,False


In [10]:
# (rows, columns) of the won-matches subset; the row count is the size of event A
df_won.shape

(184, 12)

In [11]:
# P(A) = matches India won / all matches
p_a = df_won.shape[0] / data.shape[0]
p_a

0.5111111111111111

## Observation:

- A probability of 0.51 means that India won about 51% of these matches

## Probability of losing

In [12]:
# Complement rule: P(A') = 1 - P(A)
1 - p_a

0.48888888888888893

## Observation:

- Probability of losing is the complement of the probability of winning
- P(A') = 1 - P(A)                [A' means A complement]
- 0.49 (probability) indicates that India did not win about 49% of the matches (this counts every match where `Won` is False, including ties)

In [15]:
### Probability of losing using Pandas Approach

# Keep every match where `Won` is False. This is "did not win" rather than
# strictly "lost": the displayed rows include one whose Winner is 'tied'.
df_lose = data.loc[~data['Won']]
df_lose

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Winner,Won,century
0,13,0,30,15,3,0,86.66,1,New Zealand,New Zealand,False,False
1,37,0,75,51,3,1,72.54,2,South Africa,South Africa,False,False
4,4,0,13,9,1,0,44.44,2,Pakistan,Pakistan,False,False
7,30,0,30,20,5,0,150.00,1,New Zealand,New Zealand,False,False
10,65,0,96,59,9,1,110.16,2,New Zealand,New Zealand,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
352,15,0,25,24,2,0,62.50,2,Sri Lanka,tied,False,False
353,3,0,21,12,0,0,25.00,2,Australia,Australia,False,False
354,22,0,33,23,3,0,95.65,2,Sri Lanka,Sri Lanka,False,False
355,14,0,34,15,2,0,93.33,2,Australia,Australia,False,False


In [16]:
# Number of matches where `Won` is False
len(df_lose)

176

In [18]:
# P(A') computed directly: matches not won / all matches
p_lose = df_lose.shape[0] / data.shape[0]
p_lose

0.4888888888888889

## Event B : Sachin Scoring a Century

In [19]:
### Counting the outcomes for event B (Sachin scoring a century)

# The True count is the number of matches in which `century` is True.
data['century'].value_counts()

False    314
True      46
Name: century, dtype: int64

In [20]:
# Manual approach: 46 century matches (from the value_counts output above) out of 360 matches.
p_cen = 46 / 360
p_cen

0.12777777777777777

## Observations:

- Sachin scored a century in about 12.78% ≈ 13% of these matches.

## Pandas Approach for Event B

In [21]:
# Subset of matches in which Sachin scored a century (event B);
# the boolean `century` column is used directly as the row mask.
df_century = data.loc[data['century']]
df_century

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Winner,Won,century
5,112,1,137,107,15,1,104.67,2,Sri Lanka,India,True,True
12,127,1,175,138,15,1,92.02,2,Kenya,India,True,True
15,137,0,198,137,8,5,100.0,1,Sri Lanka,Sri Lanka,False,True
20,100,0,-,111,9,1,90.09,1,Pakistan,Pakistan,False,True
23,118,0,-,140,8,2,84.28,1,Pakistan,India,True,True
27,110,0,-,138,5,1,79.71,1,Sri Lanka,Sri Lanka,False,True
41,114,0,157,126,14,0,90.47,1,South Africa,India,True,True
47,104,0,121,97,8,1,107.21,2,Zimbabwe,India,True,True
55,117,0,188,137,13,2,85.4,2,New Zealand,India,True,True
85,100,0,132,89,5,7,112.35,2,Australia,India,True,True


In [22]:
# Number of matches in event B (century scored)
len(df_century)

46

In [24]:
# P(B) = matches with a century / all matches
p_century = df_century.shape[0] / data.shape[0]
p_century

0.12777777777777777

## Probability of Sachin not scoring a century

In [25]:
# Complement rule: P(B') = 1 - P(B)
1 - p_century

0.8722222222222222

## Probability of India Winning and Sachin Scoring a century

In [26]:
# Re-display the first rows to recall the `Won` and `century` columns combined in the next cell
data.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Winner,Won,century
0,13,0,30,15,3,0,86.66,1,New Zealand,New Zealand,False,False
1,37,0,75,51,3,1,72.54,2,South Africa,South Africa,False,False
2,47,0,65,40,7,0,117.5,2,Australia,India,True,False
3,48,0,37,30,9,1,160.0,2,Bangladesh,India,True,False
4,4,0,13,9,1,0,44.44,2,Pakistan,Pakistan,False,False


In [28]:
# Rows where both events happen: India won AND Sachin scored a century.
df_iwsc = data.loc[data['Won'] & data['century']]
df_iwsc

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Winner,Won,century
5,112,1,137,107,15,1,104.67,2,Sri Lanka,India,True,True
12,127,1,175,138,15,1,92.02,2,Kenya,India,True,True
23,118,0,-,140,8,2,84.28,1,Pakistan,India,True,True
41,114,0,157,126,14,0,90.47,1,South Africa,India,True,True
47,104,0,121,97,8,1,107.21,2,Zimbabwe,India,True,True
55,117,0,188,137,13,2,85.4,2,New Zealand,India,True,True
85,100,0,132,89,5,7,112.35,2,Australia,India,True,True
92,134,0,195,131,12,3,102.29,2,Australia,India,True,True
95,100,1,156,103,13,0,97.08,2,Kenya,India,True,True
99,128,0,182,131,8,2,97.7,1,Sri Lanka,India,True,True


In [30]:
## Number of matches in which India won and Sachin scored a century

# This is the size of the joint event (A and B), not a new sample space.
len(df_iwsc)

30

In [32]:
## Joint probability P(India wins and Sachin scores a century)

# matches where both events occur / all matches
p_iwsc = df_iwsc.shape[0] / data.shape[0]
p_iwsc

0.08333333333333333

## Observations:

- In 8.3% of all matches India won and Sachin scored a century (this is the joint probability P(A and B), not the chance of winning given a century)

## Probability of India Winning and Sachin Scoring a century using a simple approach

In [34]:
# Contingency table: `century` on the rows, `Won` on the columns;
# margins=True appends the 'All' row and column totals.
pd.crosstab(data['century'], data['Won'], margins=True)

Won,False,True,All
century,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,160,154,314
True,16,30,46
All,176,184,360


In [35]:
# 30 = cell (century=True, Won=True) of the table above, 360 = grand total ('All', 'All').
30 / 360

0.08333333333333333

## Find the probability of India Winning given that Sachin has already scored a century in that match

A : Sachin Scoring Century   
B : India winning the match

P(B | A) ==> P(India winning the match | Sachin has already scored century)

In [36]:
# Same contingency table as before; the century=True row is the reduced
# sample space used for the conditional probability.
pd.crosstab(data['century'], data['Won'], margins=True)

Won,False,True,All
century,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,160,154,314
True,16,30,46
All,176,184,360


Reduced sample space (matches in which Sachin scored a century) = 46  
Favourable outcomes (India won among those 46 matches) = 30


P(B | A) = P(A and B) / P(A) = (30 / 360) / (46 / 360) = 30 / 46

In [37]:
# P(B | A) = (century matches that India won) / (century matches) = 30 / 46.
# The counts are taken from the frames built earlier (`df_iwsc`, `df_century`)
# instead of being typed in, so the value stays in sync with the dataset.
p_b_a = len(df_iwsc) / len(df_century)
p_b_a

0.6521739130434783

## Observations:

- India won about 65% of the matches in which Sachin scored a century

## Probability of Sachin Scoring a century given that India has already lost that match

Event A : India Losing the match  
Event B : Sachin scoring a century

P(B | A) ==> P(Sachin Scoring a century | India has lost the match)