# Statistics
This notebook is an introduction to statistics in Python.
## Hypothesis testing

In [None]:
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
# load the grades dataset from disk and preview its first rows
grades_path = "utility_files/grades.csv"
df = pd.read_csv(grades_path)
print(df.head())

In [None]:
# Goal: divide the dataset into two categories based on the first assignment:
#   - early finishers: submitted it before the end of December
#   - late finishers: submitted it some time after that

# First, rename the columns to shorter names for easier manipulation later on.
# rename_dic maps each original column name to its short form:
#   assignment<i>_submission -> ass<i>s, assignment<i>_grade -> ass<i>g
rename_dic = {}

for i in range(1, 7):
    rename_dic.update({
        f"assignment{i}_submission": f"ass{i}s",
        f"assignment{i}_grade": f"ass{i}g",
    })

rename_dic["student_id"] = "id"

# rename returns a new frame; rebind df to the renamed version and preview it
df = df.rename(columns=rename_dic)
print(df.head())

In [12]:
# Split the students on the submission date of the first assignment:
# anything before '2016' counts as early, everything else as late.
ass1_submitted = pd.to_datetime(df["ass1s"])  # parse the date strings once
is_early = ass1_submitted < '2016'
is_late = ass1_submitted >= '2016'

early_finishers = df[is_early]
late_finishers = df[is_late]
print(early_finishers.shape)
print(late_finishers.shape)

# sanity check: taking every row that is NOT an early finisher should give
# exactly the same frame as the explicit ">= '2016'" selection
not_early = ~ df.index.isin(early_finishers.index)
late_finishers_2 = df[not_early]
print(late_finishers_2.equals(late_finishers))

(1259, 13)
(1056, 13)
True


In [14]:
# compare the mean first-assignment grade of the two groups
# (early finishers first, then late finishers)
for group in (early_finishers, late_finishers):
    print(group['ass1g'].mean())

74.94728457024304
74.0450648477065


In [15]:
# Question: do early and late finishers really differ in their mean grade?
# We can approach it with a Student's t-test for two independent samples.
# Null hypothesis: the two groups (early_finishers and late_finishers) have the same mean grade.
# Before looking at the result we need to choose a threshold (the significance level, alpha)
# that reflects how much chance of wrongly rejecting the null hypothesis we are willing to accept.

from scipy.stats import ttest_ind

# run the test on the 1st assignment's grades
early_ass1_grades = early_finishers['ass1g']
late_ass1_grades = late_finishers['ass1g']
ttest_ind(early_ass1_grades, late_ass1_grades)


Ttest_indResult(statistic=1.3223540853721596, pvalue=0.18618101101713855)

In [None]:
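# the p-value is the probability of observing a difference at least as extreme as the one we measured,
# assuming the null hypothesis is true (it is NOT the probability that the null hypothesis is wrong).
# if the p-value is less than alpha, there is enough statistical evidence to reject the null hypothesis.
# That is not our case: the p-value above is about 0.19, which is larger than common choices of alpha such as 0.05,
# so we cannot reject the null hypothesis: the test gives no evidence that the two groups' mean grades differ.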
