In [116]:
import pandas as pd

dfpopulation = pd.read_csv("2008.csv")

# Converting On-Time columns to binary (1 = True, i.e. yes on time)
dfpopulation["DepartOnTime"] = (dfpopulation.DepDelay < 0)*1
dfpopulation["ArrivalOnTime"] = (dfpopulation.ArrDelay < 0)*1

df1 = dfpopulation.sample(frac = 0.00001) 
df2 = dfpopulation.sample(frac = 0.00001) 

Using one (or more) subsamples of the dataframe of flights, generate a meaningful example for each of the following tests and procedures:

1. Chi-squared test
2. T-test for independent samples
3. T-test for paired data
4. ANOVA (compare multiple means)
5. Confidence interval for the mean
6. Test if the data is normally distributed (one of the main assumptions, most of the times is not satisfied...)
7. (EXTRA) What is bootstrap?!

In [63]:
df1.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'DepartOnTime', 'ArrivalOnTime'],
      dtype='object')

# Chi-squared test

In [68]:
import scipy.stats as stats
import numpy as np

In [130]:
# Example: On-time arrival rates
# Suppose expected value for on-time arrival rates = population mean of on-time arrival rates
# Is there a significant difference in on-time arrival rates between carriers when compared
# to population mean on-time arrival?
# (Assuming a=0.05 signficance)

sample = df1.ArrivalOnTime.groupby(df1.UniqueCarrier).mean()
expected = np.ones(len(sample))*dfpopulation.ArrivalOnTime.mean()

alpha = 0.05
dof = 2-1 # 2 outcomes: on time or not on time arrival

critical_value = stats.chi2.ppf(1-alpha, df = dof)
chisq, p = stats.chisquare(sample, f_exp = expected, ddof = dof)

print("H0: There is no significant difference between carriers' on-time arrival rates.")
print("H1: There is a significant difference between carriers' on-time arrival rates.")
print("Alpha:", alpha)
print("DOF:", dof)
print("Critical value:", critical_value)
print("Test statistic:", chisq)

if chisq < critical_value:
    print("With 95% confidence, there is not significant evidence to reject H0. Therefore, accept H0.")
elif chisq > critical_value:
    print("With 95% confidence, there is significant evidence to reject H0. Therefore, accept H1.")


H0: There is no significant difference between carriers' on-time arrival rates.
H1: There is a significant difference between carriers' on-time arrival rates.
Alpha: 0.05
DOF: 1
Critical value: 3.841458820694124
Test statistic: 4.012760572005583
With 95% confidence, there is significant evidence to reject H0. Therefore, accept H1.


# T-test for independent samples

In [None]:
# Is there a significant difference between 

# Paired T-test

In [136]:
# Is there a sig. diff. between on-time departure and on-time arrival rates for each carrier?

sample = pd.DataFrame(df1.DepartOnTime.groupby(df1.UniqueCarrier).mean())
sample["ArrivalOnTime"] = df1.ArrivalOnTime.groupby(df1.UniqueCarrier).mean()
sample["Difference"] = sample.DepartOnTime - sample.ArrivalOnTime

sample.head(50)

Unnamed: 0_level_0,DepartOnTime,ArrivalOnTime,Difference
UniqueCarrier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA,0.0,0.333333,-0.333333
AS,0.0,0.0,0.0
B6,1.0,1.0,0.0
CO,0.666667,0.444444,0.222222
DL,0.5,0.5,0.0
EV,0.0,0.0,0.0
F9,0.0,0.333333,-0.333333
FL,1.0,1.0,0.0
HA,1.0,1.0,0.0
MQ,0.333333,0.333333,0.0
