In [1]:
# Importing Required Packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import shapiro
%matplotlib inline

In [2]:
# Reading the Data from the directory

data = pd.read_csv('StatisticalTest_Data.csv')
data.head()

Unnamed: 0,Name,Maths,Physics,Chemistry,Biology,ComputerScience,Bengali,English,History,Geography,Statistics,Gender
0,Aniket,37,34,24,15,48,45,15,45,45,45,M
1,Aarav,32,34,48,25,14,45,14,12,12,12,M
2,Aashi,32,34,45,40,15,23,45,46,13,46,F
3,Aayush,35,31,25,41,25,12,12,48,46,12,M
4,Aditi,26,33,45,39,48,47,14,23,48,48,F


In [3]:
# Extracting the Subject Data and storing them In variables

math = data['Maths']
phy = data['Physics']
chem = data['Chemistry']
bio = data['Biology']
cs = data['ComputerScience']

### Statistical Test for the Data Set
* Check Summary Statistics
* Check whether the Data is Normal or not

#### Hypothesis for Shapiro Test:
* Null Hypothesis: The Data is Normally Distributed
* Alternative Hypothesis: The Data is Not Normally Distributed

## Check The Statistical Importance of the Maths Data

In [7]:
# Check Summary Statistics
math.describe()

count    60.000000
mean     33.066667
std       2.927755
min      26.000000
25%      31.000000
50%      33.000000
75%      35.000000
max      41.000000
Name: Maths, dtype: float64

In [10]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(math)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('We Reject the Null Hypothesis. The Data is Not Normal')
else:
    print('We Fail to Reject the Null Hypothesis. The Data is Normal')

Shapiro-Wilk statistic: 0.9657249450683594
p-value: 0.08992432057857513
We Fail to Reject the Null Hypothesis. The Data is Normal


## Check The Statistical Importance of the Physics Data

In [11]:
# Summary Statistics for The Physics Data
phy.describe()

count    60.000000
mean     32.816667
std       3.301215
min      26.000000
25%      31.000000
50%      32.000000
75%      35.000000
max      41.000000
Name: Physics, dtype: float64

In [12]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(phy)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.9747346043586731
p-value: 0.2470366358757019
The Data is Normal


## Check The Statistical Importance of the Chemistry Data

In [13]:
# Summary Statistics for the Chemistry Data
chem.describe()

count    60.000000
mean     31.083333
std      12.072228
min       5.000000
25%      24.000000
50%      28.500000
75%      43.250000
max      50.000000
Name: Chemistry, dtype: float64

In [14]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(chem)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.9432072639465332
p-value: 0.007555411197245121
The Data is Not Normal


## Check The Statistical Importance of the Biology Data

In [16]:
# Summary Statistics for the Biology Data
bio.describe()

count    60.000000
mean     29.566667
std      12.812026
min       5.000000
25%      19.750000
50%      26.500000
75%      40.250000
max      50.000000
Name: Biology, dtype: float64

In [15]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(bio)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.940201997756958
p-value: 0.005529089365154505
The Data is Not Normal


## Check The Statistical Importance of the ComputerScience Data

In [17]:
# Summary Statistics for ComputerScience Data
cs.describe()

count    60.000000
mean     30.833333
std      11.835263
min       5.000000
25%      22.750000
50%      31.000000
75%      41.000000
max      49.000000
Name: ComputerScience, dtype: float64

In [18]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(cs)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.9487504363059998
p-value: 0.013610176742076874
The Data is Not Normal


## Tabulating the Results that We get in This Notebook

In [30]:
# Report for the Maths Data
# Stote it in a Variable Called reportmath

reportmath = {'Mean':[np.mean(data['Maths'])], 'STD':[np.std(data['Maths'])], 
          'P-Value':[0.08], 'SignificanceLevel':[0.05], 'Status':['Normal']}
reportmath = pd.DataFrame(reportmath)
reportmath.index = ['Mathematics']
reportmath

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Mathematics,33.066667,2.903255,0.08,0.05,Normal


In [31]:
# Report for the Physics Data
# Stote it in a Variable Called reportphy

reportphy = {'Mean':[np.mean(data['Physics'])], 'STD':[np.std(data['Physics'])], 
          'P-Value':[0.24], 'SignificanceLevel':[0.05], 'Status':['Normal']}
reportphy = pd.DataFrame(reportphy)
reportphy.index = ['Physics']
reportphy

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Physics,32.816667,3.27359,0.24,0.05,Normal


In [32]:
# Report for the Chemistry Data
# Stote it in a Variable Called reportphy

reportchem = {'Mean':[np.mean(data['Chemistry'])], 'STD':[np.std(data['Chemistry'])], 
          'P-Value':[0.007], 'SignificanceLevel':[0.05], 'Status':['Not-Normal']}
reportchem = pd.DataFrame(reportchem)
reportchem.index = ['Chemistry']
reportchem

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Chemistry,31.083333,11.971204,0.007,0.05,Not-Normal


In [33]:
# Report for the Biology Data
# Stote it in a Variable Called reportphy

reportbio = {'Mean':[np.mean(data['Biology'])], 'STD':[np.std(data['Biology'])], 
          'P-Value':[0.005], 'SignificanceLevel':[0.05], 'Status':['Not-Normal']}
reportbio = pd.DataFrame(reportbio)
reportbio.index = ['Biology']
reportbio

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Biology,29.566667,12.704811,0.005,0.05,Not-Normal


In [34]:
# Report for the ComputerScience Data
# Stote it in a Variable Called reportphy

reportcs = {'Mean':[np.mean(data['ComputerScience'])], 'STD':[np.std(data['ComputerScience'])], 
          'P-Value':[0.01], 'SignificanceLevel':[0.05], 'Status':['Not-Normal']}
reportcs = pd.DataFrame(reportcs)
reportcs.index = ['Computer Science']
reportcs

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Computer Science,30.833333,11.736221,0.01,0.05,Not-Normal


## Complete Report of this Notebook

In [35]:
report1 = pd.concat([reportmath, reportphy, reportchem, reportbio, reportcs], axis = 0)
report1

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Mathematics,33.066667,2.903255,0.08,0.05,Normal
Physics,32.816667,3.27359,0.24,0.05,Normal
Chemistry,31.083333,11.971204,0.007,0.05,Not-Normal
Biology,29.566667,12.704811,0.005,0.05,Not-Normal
Computer Science,30.833333,11.736221,0.01,0.05,Not-Normal


### Exporting the Report for Further Useage

In [38]:
file_path = 'Report1.csv'
report1.to_csv(file_path, index = True)