In [1]:
import os
import pandas as pd
import numpy as np
from IPython.display import display

cwd = os.getcwd()

In [2]:
# read in preaggregated csv file
df_agg = pd.read_csv(os.path.join(cwd,
                          "Citibike_Aggregate",
                          "GroupBy_Year_Month_Day_Hour_User_Gender_Seconds_Distance.csv"))

df_agg.head()

Unnamed: 0,Start Year,Start Month,Start Day,Start Hour,User Type,Gender,Duration_Seconds_SUM,Duration_Seconds_COUNT,Station_Distance_SUM,Station_Distance_COUNT
0,2017,1,1,0,Customer,0,188340,114,221436,114
1,2017,1,1,0,Customer,1,1080,2,2056,2
2,2017,1,1,0,Customer,2,0,0,1,0
3,2017,1,1,0,Subscriber,0,1416,6,3246,6
4,2017,1,1,0,Subscriber,1,346888,450,836554,450


In [3]:
df_agg_gender = df_agg.drop(['Start Year','Start Month','Start Day','Start Hour', 'User Type','Duration_Seconds_COUNT','Station_Distance_SUM','Station_Distance_COUNT'], axis = 1)

df_agg_gender

Unnamed: 0,Gender,Duration_Seconds_SUM
0,0,188340
1,1,1080
2,2,0
3,0,1416
4,1,346888
...,...,...
160699,1,212516
160700,2,19283
160701,0,7550
160702,1,182871


In [4]:
## Import the packages
from scipy import stats
import random

#Sampling
#Women's duration seconds list

df_agg_women = df_agg_gender[(df_agg_gender.Gender == 1)]

women_bike_duration_list_all = df_agg_women['Duration_Seconds_SUM'].to_list()

women_bike_duration_list = random.sample(women_bike_duration_list_all, 1000)

#print(women_bike_duration_list)

#Men's duration seconds list

df_agg_men = df_agg_gender[(df_agg_gender.Gender == 2)]

men_bike_duration_list_all = df_agg_men['Duration_Seconds_SUM'].to_list()

men_bike_duration_list = random.sample(men_bike_duration_list_all, 1000)

#print(men_bike_duration_list)

In [5]:
# calculate means and standard deviations

mean_women = np.mean(women_bike_duration_list)
mean_men = np.mean(men_bike_duration_list)

#print(mean_women, mean_men)

print({"women mean":mean_women,"men mean":mean_men})

{'women mean': 577285.368, 'men mean': 241169.241}


In [6]:
#calculate standard deviations
std_women = np.std(women_bike_duration_list)
std_men = np.std(men_bike_duration_list)

print({"women standard deviation":std_women, "men standard deviation":std_men})

{'women standard deviation': 908245.6403714948, 'men standard deviation': 406969.1055755767}


In [7]:
# calculate standard errors

n1 = len(women_bike_duration_list)
n2 = len(men_bike_duration_list)

se_women = std_women/np.sqrt(n1)
se_men = std_men/np.sqrt(n2)

print({"women standard error":se_women, "men standard error":se_men})

{'women standard error': 28721.24898492102, 'men standard error': 12869.49310940353}


In [8]:
# standard error on the difference between the samples
sed = np.sqrt(se_women**2.0 + se_men**2.0)

sed

31472.750056943096

In [9]:
# calculate the t statistic
t_stat = (mean_women - mean_men) / sed

print({"t value for duration seconds":t_stat})

{'t value for duration seconds': 10.679591913381289}


In [10]:
#station distance:

df_agg_gender_dis = df_agg.drop(['Start Year','Start Month','Start Day','Start Hour', 'User Type','Duration_Seconds_COUNT','Duration_Seconds_SUM','Station_Distance_COUNT'], axis = 1)

df_agg_gender_dis.head()

Unnamed: 0,Gender,Station_Distance_SUM
0,0,221436
1,1,2056
2,2,1
3,0,3246
4,1,836554


In [11]:
## Import the packages
from scipy import stats

#Sampling
#Women's station distance list

df_agg_women_dis = df_agg_gender_dis[(df_agg_gender.Gender == 1)]

women_station_dis_list_all = df_agg_women_dis['Station_Distance_SUM'].to_list()

women_station_dis_list = random.sample(women_station_dis_list_all,1000)

#print(women_station_dis_list)

#Women's station distance list

df_agg_men_dis = df_agg_gender_dis[(df_agg_gender.Gender == 2)]

men_station_dis_list_all = df_agg_men_dis['Station_Distance_SUM'].to_list()

men_station_dis_list = random.sample(men_station_dis_list_all,1000)

#print(men_station_dis_list)

In [12]:
# calculate means and standard deviations

mean_women_dis = np.mean(women_station_dis_list)
mean_men_dis = np.mean(men_station_dis_list)


print({"women mean dis":mean_women_dis,"men mean dis":mean_men_dis})

{'women mean dis': 1181226.148, 'men mean dis': 438983.049}


In [13]:
#calculate standard deviations
std_women_dis = np.std(women_station_dis_list)
std_men_dis = np.std(men_station_dis_list)

print({"women standard deviation dis":std_women_dis, "men standard deviation":std_men_dis})

{'women standard deviation dis': 1865327.2285212688, 'men standard deviation': 746323.0893064709}


In [14]:
# calculate standard errors

n3 = len(women_station_dis_list)
n4 = len(men_station_dis_list)

se_women_dis = std_women_dis/np.sqrt(n3)
se_men_dis = std_men_dis/np.sqrt(n4)

print({"women standard error dis":se_women_dis, "men standard error dis":se_men_dis})

{'women standard error dis': 58986.826236566056, 'men standard error dis': 23600.808325817034}


In [15]:
# standard error on the difference between the samples
sed_dis = np.sqrt(se_women_dis**2.0 + se_men_dis**2.0)

sed_dis

63533.01364719599

In [16]:
# calculate the t statistic
t_stat_dis = (mean_women_dis - mean_men_dis) / sed_dis

t_stat_dis

print({"t value for station distance":t_stat_dis})

{'t value for station distance': 11.682793816797933}


In [17]:
#Conclusion

#Null Hypothesis: The data for duration time periods and distances between stations have the same distribution regardless of the gender. 
#Alternative Hypothesis: The data for duration time periods and distances between stations have different distributions for women and men.

#The null hypothesis is rejcted for both datasets because both T values are much larger than 0.
