In [2]:
import pandas as pd 

In [3]:
df = pd.read_csv('SeoulBikeData.csv')

In [4]:
df[df["Snowfall (cm)"] == 0.].count()

Date                        8317
Rented Bike Count           8317
Hour                        8317
Temperature(C)              8317
Humidity(%)                 8317
Wind speed (m/s)            8317
Visibility (10m)            8317
Dew point temperature(C)    8317
Solar Radiation (MJ/m2)     8317
Rainfall(mm)                8317
Snowfall (cm)               8317
Seasons                     8317
Holiday                     8317
Functioning Day             8317
dtype: int64

In [5]:
print("Number of datapoints with Winter Season: ", len(df[df["Seasons"] == "Winter"]))
print("Number of datapoints with Spring Season: ", len(df[df["Seasons"] == "Spring"]))
print("Number of datapoints with Summer Season: ", len(df[df["Seasons"] == "Summer"]))
print("Number of datapoints with Autumn Season: ", len(df[df["Seasons"] == "Autumn"]))

Number of datapoints with Winter Season:  2160
Number of datapoints with Spring Season:  2208
Number of datapoints with Summer Season:  2208
Number of datapoints with Autumn Season:  2184


In [6]:
print("Number of columns with non zero Snowfall: ", len(df)-len(df[df["Snowfall (cm)"] == 0.]))
print("Number of columns with zero Snowfall: ", len(df[df["Snowfall (cm)"] == 0.]))


Number of columns with non zero Snowfall:  443
Number of columns with zero Snowfall:  8317


In [7]:
df_winter=df[df["Seasons"] == "Winter"]

In [8]:
print("Number of datapoints with non zero snowfall in Winter Season: ", len(df_winter[df_winter["Snowfall (cm)"] != 0.]))
print("Number of datapoints with zero snowfall in Winter Season: ", len(df_winter[df_winter["Snowfall (cm)"] == 0.]))

Number of datapoints with non zero snowfall in Winter Season:  392
Number of datapoints with zero snowfall in Winter Season:  1768


In [9]:
# Calculate mean of Rented Bike Count for Winter Season with non zero snowfall
df_nonzero= df_winter[df_winter["Snowfall (cm)"] != 0.]
df_zero= df_winter[df_winter["Snowfall (cm)"] == 0.]

mean_nonzero = df_nonzero["Rented Bike Count"].mean()
mean_zero = df_zero["Rented Bike Count"].mean()

print("Mean of Rented Bike Count for Winter Season with non zero snowfall: ", mean_nonzero)
print("Mean of Rented Bike Count for Winter Season with zero snowfall: ", mean_zero)

Mean of Rented Bike Count for Winter Season with non zero snowfall:  157.30357142857142
Mean of Rented Bike Count for Winter Season with zero snowfall:  240.670814479638


In [10]:
# Calculate Estimate of variance of Rented Bike Count for Winter Season with non zero snowfall
var_nonzero = df_nonzero["Rented Bike Count"].var()
var_zero = df_zero["Rented Bike Count"].var()

print("Estimate of variance of Rented Bike Count for Winter Season with non zero snowfall: ", var_nonzero)
print("Estimate of variance of Rented Bike Count for Winter Season with zero snowfall: ", var_zero)


Estimate of variance of Rented Bike Count for Winter Season with non zero snowfall:  11934.508631713556
Estimate of variance of Rented Bike Count for Winter Season with zero snowfall:  23725.24075496726


In [11]:
# Do previous calculation using formula
n_nonzero = len(df_nonzero)
n_zero = len(df_zero)

mean_nonzero_formula = df_nonzero["Rented Bike Count"].sum()/n_nonzero
mean_zero_formula = df_zero["Rented Bike Count"].sum()/n_zero

print("Mean of Rented Bike Count for Winter Season with non zero snowfall using formula: ", mean_nonzero_formula)
print("Mean of Rented Bike Count for Winter Season with zero snowfall using formula: ", mean_zero_formula)

Mean of Rented Bike Count for Winter Season with non zero snowfall using formula:  157.30357142857142
Mean of Rented Bike Count for Winter Season with zero snowfall using formula:  240.670814479638


In [12]:
var_nonzero_formula = (1/(n_nonzero-1))*((df_nonzero["Rented Bike Count"]-mean_nonzero)**2).sum()
var_zero_formula = (1/(n_zero-1))*((df_zero["Rented Bike Count"]-mean_zero)**2).sum()

print("Estimate of variance of Rented Bike Count for Winter Season with non zero snowfall using formula: ", var_nonzero_formula)
print("Estimate of variance of Rented Bike Count for Winter Season with zero snowfall using formula: ", var_zero_formula)

Estimate of variance of Rented Bike Count for Winter Season with non zero snowfall using formula:  11934.508631713556
Estimate of variance of Rented Bike Count for Winter Season with zero snowfall using formula:  23725.24075496726


In [13]:
# Calculate the Value of t-statistic
import math
t_statistic = (mean_nonzero - mean_zero)/math.sqrt(var_nonzero/n_nonzero + var_zero/n_zero)
print("Value of t-statistic: ", t_statistic)

Value of t-statistic:  -12.587491780039079


In [14]:
# Print the value of Z_alpha where alpha = 0.05 and Z is standard normal random variable

from scipy.stats import norm
Z_alpha = norm.ppf(1-0.05)
print("Value of Z_alpha: ", Z_alpha)


Value of Z_alpha:  1.6448536269514722


In [15]:
from scipy.stats import norm

# Define the value of T
T = -12.58  # Example value for alpha

# Calculate P(Z < alpha)
probability = norm.cdf(T)

# Print the result of p-value
print("P(Z < {}):".format(T), probability)


P(Z < -12.58): 1.3601493975098363e-36


# Results using Library

In [16]:
# Verify the above results using inbuilt python library
from scipy.stats import ttest_ind
t_statistic, p_value = ttest_ind(df_nonzero["Rented Bike Count"], df_zero["Rented Bike Count"])
print("Value of t-statistic: ", t_statistic)
print("p-value: ", p_value)


Value of t-statistic:  -10.163358641541603
p-value:  9.799394129697895e-24


In [17]:
# Print the p-value
print("p-value: ", p_value)


p-value:  9.799394129697895e-24


In [18]:
# Results are close but same