In [1]:
import pandas as pd
import numpy as np

#### About the dataset:
- The dataset contains details of all ODI matches played by the Indian cricket team in which Sachin took part.
- The data records the runs scored by Sachin, no. of fours, no. of sixes, strike rate (sr), innings (Inns), opposition team (Opp), etc.
- Each row of the data represents a single match played by India in which Sachin took part.

In [2]:
# Read the match records from the CSV (the path is relative to the working directory)
data = pd.read_csv('Sachin_ODI.csv')
# Preview the first rows
data.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
0,13,0,30,15,3,0,86.66,1,New Zealand,Napier,1995-02-16,New Zealand,False,False
1,37,0,75,51,3,1,72.54,2,South Africa,Hamilton,1995-02-18,South Africa,False,False
2,47,0,65,40,7,0,117.5,2,Australia,Dunedin,1995-02-22,India,True,False
3,48,0,37,30,9,1,160.0,2,Bangladesh,Sharjah,1995-04-05,India,True,False
4,4,0,13,9,1,0,44.44,2,Pakistan,Sharjah,1995-04-07,Pakistan,False,False


In [3]:
# (number of rows, number of columns)
data.shape

(360, 14)

#### India played a total of 360 ODI matches in which Sachin took part.

## How to get the Sample Space for the data?

In [4]:
# Row count, taken from the first element of .shape
data.shape[0]

360

In [5]:
# Row count via len() - an alternative to data.shape[0]
len(data)

360

- The size of the sample space (the total number of matches) is 360.

## Event A : India winning the match

In [6]:
## For Event A, count how many matches fall under each outcome of 'Won'
## (True = India won, False = India lost)

data['Won'].value_counts()

True     184
False    176
Name: Won, dtype: int64

- The number of outcomes favourable to Event A (India winning the match) is 184

### Probability of Event A:

In [9]:
## Static approach: plug the counts read off value_counts() into P(A) = wins / total

wins_count = 184       # matches India won
total_matches = 360    # all matches in the dataset
p_a = wins_count / total_matches
p_a

0.5111111111111111

In [11]:
### Pandas approach for Event A: keep only the rows where India won

won_mask = data['Won'] == True
data_won = data.loc[won_mask]
data_won.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
2,47,0,65,40,7,0,117.5,2,Australia,Dunedin,1995-02-22,India,True,False
3,48,0,37,30,9,1,160.0,2,Bangladesh,Sharjah,1995-04-05,India,True,False
5,112,1,137,107,15,1,104.67,2,Sri Lanka,Sharjah,1995-04-09,India,True,True
6,41,0,51,41,5,0,100.0,2,Sri Lanka,Sharjah,1995-04-14,India,True,False
8,39,0,79,51,4,0,76.47,2,New Zealand,Amritsar,1995-11-18,India,True,False


In [12]:
# Number of matches India won (numerator of P(A))
data_won.shape[0]

184

In [13]:
# Total number of matches (denominator of P(A))
data.shape[0]

360

In [15]:
# P(A) = (matches won) / (all matches), computed from the frames instead of typed-in counts
p_a = len(data_won) / len(data)
p_a

0.5111111111111111

## Event B : India losing the match

In [16]:
## Static approach: losses / total matches, using the counts from value_counts()
176 / 360

0.4888888888888889

In [17]:
## Pandas approach for Event B: keep only the rows where India lost

lost_mask = data['Won'] == False
data_lose = data.loc[lost_mask]
data_lose.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
0,13,0,30,15,3,0,86.66,1,New Zealand,Napier,1995-02-16,New Zealand,False,False
1,37,0,75,51,3,1,72.54,2,South Africa,Hamilton,1995-02-18,South Africa,False,False
4,4,0,13,9,1,0,44.44,2,Pakistan,Sharjah,1995-04-07,Pakistan,False,False
7,30,0,30,20,5,0,150.0,1,New Zealand,Jamshedpur,1995-11-15,New Zealand,False,False
10,65,0,96,59,9,1,110.16,2,New Zealand,Nagpur,1995-11-26,New Zealand,False,False


In [18]:
# Number of matches India lost (numerator of P(B))
data_lose.shape[0]

176

In [19]:
# Total number of matches (denominator of P(B))
data.shape[0]

360

In [21]:
# P(B) = (matches lost) / (all matches)
p_b = len(data_lose) / len(data)
p_b

0.4888888888888889

In [23]:
## Probability Rules : Event B is just the complement of Event A,
## so P(B) = P(A') = 1 - P(A).
## The last digits can differ slightly from p_b because of floating-point rounding.

p_a_c = 1 - p_a
p_a_c

0.48888888888888893

## Observations :

- India wins about 51% of the matches (P(A) ≈ 0.511)
- India loses about 49% of the matches (P(B) ≈ 0.489)

## Event C : Sachin scoring a Century

In [25]:
### For Event C, count how many matches fall under each outcome of 'century'

data['century'].value_counts()

False    314
True      46
Name: century, dtype: int64

## Probability of Sachin scoring a Century

In [26]:
## Static approach: century matches / total matches, using the counts from value_counts()

46 / 360

0.12777777777777777

### Sachin has a chance of scoring a century of only 12.8%

In [28]:
## Pandas approach: select the matches in which Sachin scored a century

data_cen = data[data['century'] == True]
# Preview the first rows only; rendering the whole filtered frame clutters the notebook
data_cen.head(10)

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
5,112,1,137,107,15,1,104.67,2,Sri Lanka,Sharjah,1995-04-09,India,True,True
12,127,1,175,138,15,1,92.02,2,Kenya,Cuttack,1996-02-18,India,True,True
15,137,0,198,137,8,5,100.0,1,Sri Lanka,Delhi,1996-03-02,Sri Lanka,False,True
20,100,0,-,111,9,1,90.09,1,Pakistan,Singapore,1996-04-05,Pakistan,False,True
23,118,0,-,140,8,2,84.28,1,Pakistan,Sharjah,1996-04-15,India,True,True
27,110,0,-,138,5,1,79.71,1,Sri Lanka,Colombo (RPS),1996-08-28,Sri Lanka,False,True
41,114,0,157,126,14,0,90.47,1,South Africa,Mumbai,1996-12-14,India,True,True
47,104,0,121,97,8,1,107.21,2,Zimbabwe,Benoni,1997-02-09,India,True,True
55,117,0,188,137,13,2,85.4,2,New Zealand,Bengaluru,1997-05-14,India,True,True
85,100,0,132,89,5,7,112.35,2,Australia,Kanpur,1998-04-07,India,True,True


In [29]:
# Number of matches with a century (numerator of P(C))
data_cen.shape[0]

46

In [30]:
# Total number of matches (denominator of P(C))
data.shape[0]

360

In [32]:
# P(C) = (century matches) / (all matches)
p_c = len(data_cen) / len(data)
p_c

0.12777777777777777

## Probability of Sachin not scoring a century

In [33]:
# Complement rule: P(C') = 1 - P(C)
p_c_c = 1 - p_c
p_c_c

0.8722222222222222

## Event D : Probability of India Winning and Sachin Scoring Century

In [37]:
## Event D: select the matches where India won AND Sachin scored a century

data_iwsc = data[(data['Won'] == True) & (data['century'] == True)]
# Preview the first rows only; rendering the whole filtered frame clutters the notebook
data_iwsc.head(10)

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
5,112,1,137,107,15,1,104.67,2,Sri Lanka,Sharjah,1995-04-09,India,True,True
12,127,1,175,138,15,1,92.02,2,Kenya,Cuttack,1996-02-18,India,True,True
23,118,0,-,140,8,2,84.28,1,Pakistan,Sharjah,1996-04-15,India,True,True
41,114,0,157,126,14,0,90.47,1,South Africa,Mumbai,1996-12-14,India,True,True
47,104,0,121,97,8,1,107.21,2,Zimbabwe,Benoni,1997-02-09,India,True,True
55,117,0,188,137,13,2,85.4,2,New Zealand,Bengaluru,1997-05-14,India,True,True
85,100,0,132,89,5,7,112.35,2,Australia,Kanpur,1998-04-07,India,True,True
92,134,0,195,131,12,3,102.29,2,Australia,Sharjah,1998-04-24,India,True,True
95,100,1,156,103,13,0,97.08,2,Kenya,Kolkata,1998-05-31,India,True,True
99,128,0,182,131,8,2,97.7,1,Sri Lanka,Colombo (RPS),1998-07-07,India,True,True


In [38]:
# Number of matches where India won and Sachin scored a century
data_iwsc.shape[0]

30

In [39]:
# Total number of matches (denominator of the joint probability)
data.shape[0]

360

In [41]:
# Joint probability: (matches won with a century) / (all matches)
p_iwsc = len(data_iwsc) / len(data)
p_iwsc

0.08333333333333333

## Observations:

- There is only an 8.3% chance that, in a given match, India wins **and** Sachin scores a century
- This is the joint probability P(Won ∩ Century) = 30/360; it is not the conditional probability of India winning given a century, nor of a century given a win

## Simpler way to calculate probabilities : Probability of India Winning and Sachin scoring a century

In [43]:
# Contingency table: 'century' on the rows, 'Won' on the columns;
# margins=True appends the 'All' row/column totals
pd.crosstab(data['century'], data['Won'], margins=True)

Won,False,True,All
century,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,160,154,314
True,16,30,46
All,176,184,360


In [44]:
# Joint probability read off the table: 30 matches (century and won) out of 360
30 / 360

0.08333333333333333

## Find the probability of India Winning given that Sachin has already scored a century

A : Sachin scoring a century  
B : India winning the match  

P(B|A) i.e. P(India winning the match | Sachin has already scored a century)


In [45]:
# Same contingency table again: the century == True row gives the counts
# needed for P(India wins | century)
pd.crosstab(data['century'], data['Won'], margins=True)

Won,False,True,All
century,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,160,154,314
True,16,30,46
All,176,184,360


- Reduced sample space: matches in which Sachin scored a century = 46
- Favourable outcomes within it: matches India won given that Sachin scored a century = 30

In [46]:
# Conditional probability P(B|A): 30 wins among the 46 century matches
30 / 46

0.6521739130434783

## Observations:

- India has a 65% chance of winning the game if Sachin has already scored a century

In [51]:
### Pandas approach: first restrict the data to the matches with a century

century_mask = data['century'] == True
data_cen = data.loc[century_mask]
data_cen.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
5,112,1,137,107,15,1,104.67,2,Sri Lanka,Sharjah,1995-04-09,India,True,True
12,127,1,175,138,15,1,92.02,2,Kenya,Cuttack,1996-02-18,India,True,True
15,137,0,198,137,8,5,100.0,1,Sri Lanka,Delhi,1996-03-02,Sri Lanka,False,True
20,100,0,-,111,9,1,90.09,1,Pakistan,Singapore,1996-04-05,Pakistan,False,True
23,118,0,-,140,8,2,84.28,1,Pakistan,Sharjah,1996-04-15,India,True,True


In [53]:
# Size of the reduced sample space: matches with a century
data_cen.shape[0]

46

In [56]:
# Within the century matches, keep those that India won
cen_won_mask = data_cen['Won'] == True
data_cen_won = data_cen.loc[cen_won_mask]
data_cen_won.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
5,112,1,137,107,15,1,104.67,2,Sri Lanka,Sharjah,1995-04-09,India,True,True
12,127,1,175,138,15,1,92.02,2,Kenya,Cuttack,1996-02-18,India,True,True
23,118,0,-,140,8,2,84.28,1,Pakistan,Sharjah,1996-04-15,India,True,True
41,114,0,157,126,14,0,90.47,1,South Africa,Mumbai,1996-12-14,India,True,True
47,104,0,121,97,8,1,107.21,2,Zimbabwe,Benoni,1997-02-09,India,True,True


In [58]:
# Number of century matches that India won
data_cen_won.shape[0]

30

In [59]:
# P(India wins | century) = (century matches won) / (century matches)
len(data_cen_won) / len(data_cen)

0.6521739130434783

## Naive Bayes Algorithm:

- The foundational principle on which Naive Bayes depends is conditional probability


#### What is Naive Bayes?
- Naive Bayes is a probabilistic algorithm for classification (regression variants exist but are rarely used)
- Widely used for classification
- Widely used when the data is very big
- Three types in Naive Bayes - Gaussian NB, Multinomial NB, Bernoulli NB