In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import warnings

In [2]:
warnings.filterwarnings(action="ignore")

In [3]:
data=pd.read_csv("Sachin_ODI.csv")

In [4]:
data.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
0,13,0,30,15,3,0,86.66,1,New Zealand,Napier,1995-02-16,New Zealand,False,False
1,37,0,75,51,3,1,72.54,2,South Africa,Hamilton,1995-02-18,South Africa,False,False
2,47,0,65,40,7,0,117.5,2,Australia,Dunedin,1995-02-22,India,True,False
3,48,0,37,30,9,1,160.0,2,Bangladesh,Sharjah,1995-04-05,India,True,False
4,4,0,13,9,1,0,44.44,2,Pakistan,Sharjah,1995-04-07,Pakistan,False,False


In [5]:
data.shape

(360, 14)

In [6]:
data.isna().sum()

runs       0
NotOut     0
mins       0
bf         0
fours      0
sixes      0
sr         0
Inns       0
Opp        0
Ground     0
Date       0
Winner     0
Won        0
century    0
dtype: int64

In [7]:
np.any(data.duplicated())

False

In [8]:
data.describe()

Unnamed: 0,runs,NotOut,bf,fours,sixes,sr,Inns
count,360.0,360.0,360.0,360.0,360.0,360.0,360.0
mean,43.241667,0.083333,49.394444,4.858333,0.466667,75.295056,1.527778
std,42.182452,0.27677,41.348576,4.7229,1.057669,34.980677,0.499923
min,0.0,0.0,2.0,0.0,0.0,0.0,1.0
25%,8.0,0.0,15.0,1.0,0.0,57.03,1.0
50%,30.0,0.0,35.0,3.0,0.0,76.33,2.0
75%,67.0,0.0,73.5,8.0,1.0,97.8225,2.0
max,200.0,1.0,153.0,25.0,7.0,196.55,2.0


In [9]:
data.describe(include="object")

Unnamed: 0,mins,Opp,Ground,Date,Winner
count,360,360,360,360,360
unique,154,15,85,360,13
top,-,Sri Lanka,Sharjah,1995-02-16,India
freq,22,63,31,1,184


In [10]:
data.dtypes

runs         int64
NotOut       int64
mins        object
bf           int64
fours        int64
sixes        int64
sr         float64
Inns         int64
Opp         object
Ground      object
Date        object
Winner      object
Won           bool
century       bool
dtype: object

In [11]:
data["Date"]=pd.to_datetime(data["Date"],format="%Y-%m-%d")

In [12]:
data["Inns"]=data["Inns"].astype(object)

In [13]:
data["month"]=data["Date"].dt.month
data["year"]=data["Date"].dt.year

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

### All calculations are done at a 5 percent signifance level.

In [14]:
alpha=0.05

In [15]:
def crosstab(x,y):
    """
    This function returns a crosstab for two categorical columns from the dataset
    """
    return pd.crosstab(index=data[x],columns=data[y])

In [16]:
def chi2test(x,y):
    """
    This function returns p-value for chi-square test between two categorical columns
    """
    _,p,_,_=stats.chi2_contingency(pd.crosstab(index=data[x],columns=data[y]))
    return p

In [17]:
def accept_reject(p):
    """
    This function prints if the null hypothesis should be accepted or rejected at global alpha level
    """
    global alpha
    if p<alpha:
        x="Reject H0"
    else:
        x="Failed to Reject H0"
    print(f"p-value= {p}")
    print(f"{x} at a {alpha*100}% significance level")    

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [18]:
crosstab("Won","century")

century,False,True
Won,Unnamed: 1_level_1,Unnamed: 2_level_1
False,160,16
True,154,30


H0: There is no statistically significant relationship between Sachin hitting a century and team winning.

HA: There is a statistically significant relationship between Sachin hitting a century and team winning 

Significance level= 5%

In [19]:
p=chi2test("Won","century")
accept_reject(p)

p-value= 0.05856653223833202
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that There is no statistically significant relationship between Sachin hitting a century and team winning.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [20]:
crosstab("NotOut","Won")

Won,False,True
NotOut,Unnamed: 1_level_1,Unnamed: 2_level_1
0,172,158
1,4,26


H0: There is no statistically significant relationship between Sachin remaining not out and team winning.

HA: There is a statistically significant relationship between Sachin remaining not out and team winning.

Significance level= 5%

In [21]:
p=chi2test("NotOut","Won")
accept_reject(p)

p-value= 0.0001051577146484091
Reject H0 at a 5.0% significance level


Hence, There is a statistically significant relationship between Sachin remaining not out and team winning.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [22]:
crosstab("Ground","century")

century,False,True
Ground,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelaide,5,0
Ahmedabad,3,1
Amritsar,1,0
Auckland,2,0
Belfast,4,0
...,...,...
The Oval,4,0
Toronto,11,0
Vadodara,4,2
Visakhapatnam,2,0


H0: There is no statistically significant relationship between Sachin's century and ground.

HA: There is a statistically significant relationship between Sachin's century and ground.

Significance level= 5%

In [23]:
p=chi2test("Ground","century")
accept_reject(p)

p-value= 0.15959923768897666
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that There is no statistically significant relationship between Sachin's century and the ground.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [24]:
crosstab("Opp","century")

century,False,True
Opp,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia,53,8
Bangladesh,9,1
Bermuda,1,0
England,23,2
Ireland,2,0
Kenya,5,4
Namibia,0,1
Netherlands,2,0
New Zealand,28,4
Pakistan,54,5


H0: There is no significant relationship relationship between Sachin's century and Opposition.

HA: There is a statistically significant relationship between Sachin's century and Opposition.

Significance level= 5%

In [25]:
p=chi2test("Opp","century")
accept_reject(p)

p-value= 0.20159960746150968
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that There is no statistically significant relationship between Sachin's century and Opposition.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [26]:
crosstab("month","century")

century,False,True
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,37,1
2,43,6
3,36,8
4,26,7
5,9,3
6,16,0
7,14,4
8,11,2
9,34,3
10,34,5


H0: There is no significant relationship relationship between Sachin hitting a century and Month of the year.

HA: There is a statistically significant relationship between Sachin hitting a century and Month of the year.

Significance level= 5%

In [27]:
p=chi2test("month","century")
accept_reject(p)

p-value= 0.28852323779846606
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that There is no significant relationship relationship between Sachin hitting a century and Month of the year.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [28]:
data.groupby("Won").agg(avg_runs=("runs","mean"))

Unnamed: 0_level_0,avg_runs
Won,Unnamed: 1_level_1
False,35.130682
True,51.0


H0: There is no difference in average runs scored by Sachin in Won/Lost matches.
    
HA: Average runs scored by Sachin in Won matches is greater than those scored in lost matches.

Significance Level= 5%

In [29]:
won_runs=data.loc[data["Won"]==True,"runs"]
loss_runs=data.loc[data["Won"]==False,"runs"]

In [30]:
_,p=stats.ttest_ind(won_runs,loss_runs,alternative="greater")
accept_reject(p)

p-value= 0.00016353077486826558
Reject H0 at a 5.0% significance level


Therefore, Average runs scored by Sachin in Won matches is greater than those scored in lost matches.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [76]:
data.groupby("Opp").agg(avg_runs=("runs","mean"))

Unnamed: 0_level_0,avg_runs
Opp,Unnamed: 1_level_1
Australia,44.786885
Bangladesh,46.0
Bermuda,57.0
England,46.72
Ireland,21.0
Kenya,71.888889
Namibia,152.0
Netherlands,39.5
New Zealand,39.15625
Pakistan,38.20339


In [63]:
opp=data["Opp"].unique()
opp_var_name=list(map(lambda i: i.replace(" ","_"),opp))
opp_var_name=list(map(lambda i: i.replace(".","_"),opp_var_name))

In [70]:
for val,var in zip(opp,opp_var_name):
    globals()[var]=data.loc[data["Opp"]==val,"runs"]

In [71]:
#generating parameter for kruskal test
for index,val in enumerate(opp_var_name):
    if index!=len(opp_var_name)-1:
        print(val,end=",")
    else:
        print(val)

New_Zealand,South_Africa,Australia,Bangladesh,Pakistan,Sri_Lanka,Kenya,West_Indies,Zimbabwe,England,Netherlands,Namibia,U_A_E_,Bermuda,Ireland


H0: There is no statistically significant difference in average runs scored by Sachin Against different opponents.
    
HA: There is a statistically significant difference in average runs scored by Sachin Against different opponents..

Significance Level= 5%

In [73]:
_,p=stats.kruskal(New_Zealand,South_Africa,Australia,Bangladesh,Pakistan,Sri_Lanka,Kenya,
              West_Indies,Zimbabwe,England,Netherlands,Namibia,U_A_E_,Bermuda,Ireland)
accept_reject(p)

p-value= 0.7033827783014067
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that there is no statistically significant difference in average runs scored by Sachin Against different opponents.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [31]:
data.groupby("Inns").agg(avg_runs=("runs","mean"))

Unnamed: 0_level_0,avg_runs
Inns,Unnamed: 1_level_1
1,46.670588
2,40.173684


H0: There is no statistically significant difference between the Average runs scored by Sachin in First and Second Innings.
    
HA: There is a statistically significant difference between the Average runs scored by Sachin in First and Second Innings.

Significance Level= 5%

In [32]:
first_inns_runs=data.loc[data["Inns"]=="1","runs"]
second_inns_runs=data.loc[data["Inns"]=="2","runs"]

In [33]:
_,p=stats.ttest_ind(first_inns_runs,second_inns_runs)
accept_reject(p)

p-value= nan
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that there is no statistically significant difference between the Average runs scored by Sachin in First and Second Innings.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [34]:
data.groupby("Inns").agg(avg_strikerate=("sr","mean"))

Unnamed: 0_level_0,avg_strikerate
Inns,Unnamed: 1_level_1
1,73.133235
2,77.229316


H0: There is no statistically significant difference between the Average Strike Rate of Sachin across First and Second Innings.
    
HA: There is a statistically significant difference between the Average Strike Rate of Sachin across First and Second Innings.

Significance Level= 5%

In [35]:
first_inns_sr=data.loc[data["Inns"]=="1","sr"]
second_inns_sr=data.loc[data["Inns"]=="2","sr"]

In [36]:
_,p=stats.ttest_ind(first_inns_sr,second_inns_sr)
accept_reject(p)

p-value= nan
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that there is no statistically significant difference between the Average Strike Rate of Sachin across First and Second Innings.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [37]:
data.groupby("Inns").agg(avg_balls_faced=("bf","mean"))

Unnamed: 0_level_0,avg_balls_faced
Inns,Unnamed: 1_level_1
1,54.405882
2,44.910526


H0: There is no statistically significant difference between the Average Balls Faced by Sachin during First and Second Innings.
    
HA: There is a statistically significant difference between the Average Balls Faced by Sachin during First and Second Innings.

Significance Level= 5%

In [38]:
first_inns_bf=data.loc[data["Inns"]=="1","bf"]
second_inns_bf=data.loc[data["Inns"]=="2","bf"]

In [39]:
_,p=stats.ttest_ind(first_inns_bf,second_inns_bf,alternative="greater")
accept_reject(p)

p-value= nan
Failed to Reject H0 at a 5.0% significance level


Hence, There is a statistically significant difference between the Average Balls Faced by Sachin during First and Second Innings.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [40]:
data.groupby("month").agg(avg_runs=("runs","mean"))

Unnamed: 0_level_0,avg_runs
month,Unnamed: 1_level_1
1,32.026316
2,41.734694
3,52.272727
4,43.848485
5,40.416667
6,48.0
7,48.388889
8,48.692308
9,36.243243
10,44.897436


H0: Runs scored is Normally Distributed

HA: Runs scored is not Normally Distributed

Significance Level: 5%

In [41]:
_,p=stats.normaltest(data["runs"])
accept_reject(p)

p-value= 3.7059399471249535e-12
Reject H0 at a 5.0% significance level


Hence, Runs scored is not Normally Distributed

In [42]:
for i in list(data["Date"].dt.month_name().unique()):
    globals()[i]=data.loc[data["Date"].dt.month_name()==i,"runs"]

In [43]:
#generating parameter for kruskal test
for index,mon in enumerate(list(data["Date"].dt.month_name().unique())):
    if index!=len(data["Date"].dt.month_name().unique())-1:
        print(mon,end=",")
    else:
        print(mon)

February,April,November,March,May,August,September,October,December,January,July,June


H0: There is no statistically significant difference in the average runs scored by Sachin during different months of the year.

H0: There is a statistically significant difference in the average runs scored by Sachin during different months of the year.

Significance Level: 5%

In [44]:
_,p=stats.kruskal(February,April,November,March,May,August,September,October,December,January,July,June)
accept_reject(p)

p-value= 0.5396563116726798
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that there is no statistically significant difference in the average runs scored by Sachin during different months of the year.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------

In [45]:
mod_data=data[data["mins"]!="-"]

In [46]:
mod_data["mins"]=mod_data["mins"].astype("float")

In [47]:
mod_data.groupby("Inns").agg(avg_mins=("mins","mean"))

Unnamed: 0_level_0,avg_mins
Inns,Unnamed: 1_level_1
1,79.751592
2,70.751381


H0: There is no statistically significant difference between the Average Minutes batted by Sachin during First and Second Innings.
    
HA: There is a statistically significant difference between the Average Minutes batted by Sachin during First and Second Innings.

Significance Level= 5%

In [48]:
first_inns_mins=mod_data.loc[mod_data["Inns"]=="1","mins"]
second_inns_mins=mod_data.loc[mod_data["Inns"]=="2","mins"]

In [49]:
_,p=stats.ttest_ind(first_inns_mins,second_inns_mins)
accept_reject(p)

p-value= nan
Failed to Reject H0 at a 5.0% significance level


Hence, We accept that there is no statistically significant difference between the Average Minutes batted by Sachin during First and Second Innings.

--------------------------------------------------------------------------------------------------------------------------------

--------------------------------------------------------------------------------------------------------------------------------