In [1]:
# Importing Required Packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import shapiro
%matplotlib inline

In [2]:
# Reading the Data from the directory

data = pd.read_csv('StatisticalTest_Data.csv')
data.head()

Unnamed: 0,Name,Maths,Physics,Chemistry,Biology,ComputerScience,Bengali,English,History,Geography,Statistics,Gender
0,Aniket,37,34,24,15,48,45,15,45,45,45,M
1,Aarav,32,34,48,25,14,45,14,12,12,12,M
2,Aashi,32,34,45,40,15,23,45,46,13,46,F
3,Aayush,35,31,25,41,25,12,12,48,46,12,M
4,Aditi,26,33,45,39,48,47,14,23,48,48,F


In [3]:
# Extracting the Data for last 5 subjects

ben = data['Bengali']
eng = data['English']
his = data['History']
geo = data['Geography']
stat = data['Statistics']

### Statistical Test for the Data Set
* Check Summary Statistics
* Check whether the Data is Normal or not

#### Hypothesis for Shapiro Test:
* Null Hypothesis: The Data is Normally Distributed
* Alternative Hypothesis: The Data is Not Normally Distributed

## Check The Statistical Importance of the Bengali Data

In [4]:
# Summary Statistics for Bengali Data
ben.describe()

count    60.000000
mean     29.133333
std      12.288160
min       5.000000
25%      21.000000
50%      28.000000
75%      42.000000
max      48.000000
Name: Bengali, dtype: float64

In [5]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(ben)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.9345240592956543
p-value: 0.0031059749890118837
The Data is Not Normal


## Check The Statistical Importance of the English Data

In [6]:
# Summary Statistics
eng.describe()

count    60.000000
mean     30.250000
std      13.607855
min       5.000000
25%      16.500000
50%      30.000000
75%      45.000000
max      49.000000
Name: English, dtype: float64

In [7]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(eng)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.900098443031311
p-value: 0.00013262411812320352
The Data is Not Normal


## Check The Statistical Importance of the History Data

In [8]:
# Summary Statistics
his.describe()

count    60.000000
mean     28.950000
std      12.482428
min       5.000000
25%      19.000000
50%      27.500000
75%      45.000000
max      48.000000
Name: History, dtype: float64

In [9]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(his)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.9257447719573975
p-value: 0.0013166321441531181
The Data is Not Normal


## Check The Statistical Importance of the Geography Data

In [10]:
# Summary Statistics
geo.describe()

count    60.000000
mean     28.316667
std      12.213485
min       5.000000
25%      21.000000
50%      28.000000
75%      38.250000
max      48.000000
Name: Geography, dtype: float64

In [11]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

stat, p = shapiro(geo)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.9394094944000244
p-value: 0.005096178501844406
The Data is Not Normal


## Check The Statistical Importance of the Geography Data

In [13]:
data['Statistics'].describe()

count    60.000000
mean     31.000000
std      11.748783
min       6.000000
25%      24.000000
50%      34.000000
75%      39.000000
max      48.000000
Name: Statistics, dtype: float64

In [15]:
# Shapiro Test to Check Normality of the data
# Consider the Significance Level is 0.05

statdata = data['Statistics']  
stat, p = shapiro(statdata)
print('Shapiro-Wilk statistic:', stat)
print('p-value:', p)

# Status of the Test
if p <= 0.05:
    print('The Data is Not Normal')
else:
    print('The Data is Normal')

Shapiro-Wilk statistic: 0.9237614870071411
p-value: 0.0010905155213549733
The Data is Not Normal


## Tabulating the Results We get in this Notebook

In [16]:
# Report for the Bengali Data
# Stote it in a Variable Called reportmath

reportben = {'Mean':[np.mean(data['Bengali'])], 'STD':[np.std(data['Bengali'])], 
          'P-Value':[0.003], 'SignificanceLevel':[0.05], 'Status':['Not-Normal']}
reportben = pd.DataFrame(reportben)
reportben.index = ['Bengali']
reportben

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Bengali,29.133333,12.185328,0.003,0.05,Not-Normal


In [17]:
# Report for the English Data
# Stote it in a Variable Called reportmath

reporteng = {'Mean':[np.mean(data['English'])], 'STD':[np.std(data['English'])], 
          'P-Value':[0.0001],'SignificanceLevel':[0.05], 'Status':['Not-Normal']}
reporteng = pd.DataFrame(reporteng)
reporteng.index = ['English']
reporteng

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
English,30.25,13.49398,0.0001,0.05,Not-Normal


In [18]:
# Report for the History Data
# Stote it in a Variable Called reportmath

reporthis = {'Mean':[np.mean(data['History'])], 'STD':[np.std(data['History'])], 
          'P-Value':[0.001],'SignificanceLevel':[0.05], 'Status':['Not-Normal']}
reporthis = pd.DataFrame(reporthis)
reporthis.index = ['History']
reporthis

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
History,28.95,12.377971,0.001,0.05,Not-Normal


In [19]:
# Report for the Geography Data
# Stote it in a Variable Called reportmath

reportgeo = {'Mean':[np.mean(data['Geography'])], 'STD':[np.std(data['Geography'])], 
          'P-Value':[0.005],'SignificanceLevel':[0.05], 'Status':['Not-Normal']}
reportgeo = pd.DataFrame(reportgeo)
reportgeo.index = ['Geography']
reportgeo

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Geography,28.316667,12.111278,0.005,0.05,Not-Normal


In [20]:
    # Report for the Statistics Data
# Stote it in a Variable Called reportmath

reportstat = {'Mean':[np.mean(data['Statistics'])], 'STD':[np.std(data['Statistics'])], 
          'P-Value':[0.001],'SignificanceLevel':[0.05], 'Status':['Not-Normal']}
reportstat = pd.DataFrame(reportstat)
reportstat.index = ['Statistics']
reportstat

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Statistics,31.0,11.650465,0.001,0.05,Not-Normal


## Complete Report of this Notebook

In [21]:
report2 = pd.concat([reportben, reporteng, reporthis, reportgeo, reportstat], axis = 0)
report2

Unnamed: 0,Mean,STD,P-Value,SignificanceLevel,Status
Bengali,29.133333,12.185328,0.003,0.05,Not-Normal
English,30.25,13.49398,0.0001,0.05,Not-Normal
History,28.95,12.377971,0.001,0.05,Not-Normal
Geography,28.316667,12.111278,0.005,0.05,Not-Normal
Statistics,31.0,11.650465,0.001,0.05,Not-Normal


### Exporting the Report as a csv file

In [22]:
file_path = 'Report2.csv'
report2.to_csv(file_path , index = True)