In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import pingouin as pg

In [None]:
# Load the preprocessed ride table from the working directory and preview it.
# NOTE(review): the preview shows 'Unnamed: 0.1' / 'Unnamed: 0' columns, which
# look like row indexes written out by earlier saves — confirm and drop upstream.
preprocessed_data = pd.read_csv('preprocessed_data.csv')
preprocessed_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,0,2011-01-01,winter,2011,1,0,0,Saturday,0,clear,0.24,0.2879,81.0,0.0,3,13,16
1,1,2011-01-01,winter,2011,1,1,0,Saturday,0,clear,0.22,0.2727,80.0,0.0,8,32,40
2,2,2011-01-01,winter,2011,1,2,0,Saturday,0,clear,0.22,0.2727,80.0,0.0,5,27,32
3,3,2011-01-01,winter,2011,1,3,0,Saturday,0,clear,0.24,0.2879,75.0,0.0,3,10,13
4,4,2011-01-01,winter,2011,1,4,0,Saturday,0,clear,0.24,0.2879,75.0,0.0,0,1,1


#### Now It's time for hypothesis testing.
1. Registered rides are more than Casual rides.  (reject the null hypothesis)
2. Clear weather tend to have more rides than any other weather type.  (fail to reject the null hypothesis)
3. Summer season tend to have more rides than any other season.  (fail to reject the null hypothesis)
4. Working days tend to have more registered rides than Weekend days.  (reject the null hypothesis)
5. Weekend days tend to have more Casual rides than Working days.  (reject the null hypothesis)
6. There is a large peak in Registered rides at hours (7, 8, 9) A.M. and (4, 5, 6, 7) P.M. (reject the null hypothesis)

### Registered rides are more than Casual rides.
- H_0 : Casual >= Registered
- H_A : Registered > Casual

In [None]:
# Ride counts by rider type, one value per (hourly) row of the table.
registered = preprocessed_data['registered']
casual = preprocessed_data['casual']

# One-sided independent-samples t-test; H_A: mean(registered) > mean(casual).
statistic, p_value = stats.ttest_ind(
    a=registered,
    b=casual,
    alternative='greater',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : 97.813, p-value: 0.000


### Clear weather tend to have more rides than any other weather type.
- H_0 : other_weather_sit >= clear_weather_sit
- H_A : clear_weather_sit > other_weather_sit

In [None]:
# One-sided (A > B) pairwise comparisons of ride counts between weather types,
# with Bonferroni-adjusted p-values. The rows whose group A is 'clear' are the
# ones that bear on H_A.
pg.pairwise_tests(
    data=preprocessed_data,
    dv="cnt",
    between="weathersit",
    padjust="bonf",
    alternative='greater',
)

Unnamed: 0,Contrast,A,B,Paired,Parametric,T,dof,alternative,p-unc,p-corr,p-adjust,BF10,hedges
0,weathersit,clear,cloudy,False,True,9.809678,9497.095005,greater,6.548711e-23,3.929226e-22,bonf,2.553e+19,0.162343
1,weathersit,clear,heavy_rain_snow,False,True,2.899183,2.006222,greater,0.05043296,0.3025978,bonf,17.067,0.688893
2,weathersit,clear,light_rain_snow,False,True,23.500386,2196.637449,greater,1.994713e-109,1.196828e-108,bonf,8.986e+115,0.506538
3,weathersit,cloudy,heavy_rain_snow,False,True,2.23788,2.01192,greater,0.07694951,0.461697,bonf,4.801,0.609513
4,weathersit,cloudy,light_rain_snow,False,True,14.729664,2889.903723,greater,1.042842e-47,6.257055e-47,bonf,9.411e+44,0.401184
5,weathersit,heavy_rain_snow,light_rain_snow,False,True,-0.825302,2.025002,greater,0.7524774,1.0,bonf,0.872,-0.278392


### Summer season tend to have more rides than any other season.
- H_0 : other_seasons >= summer
- H_A : summer > other_seasons

In [None]:
# H_A is "summer > every other season". A generic pairwise table with
# alternative='greater' tests A > B for each pair, and summer ends up as group B
# against fall and spring, so those rows would test the opposite direction.
# Compare summer (always the first sample) against each other season explicitly.
reference_season = 'summer'
is_reference = preprocessed_data['season'] == reference_season
reference_cnt = preprocessed_data.loc[is_reference, 'cnt']
other_seasons = sorted(preprocessed_data.loc[~is_reference, 'season'].dropna().unique())

season_rows = []
for other_season in other_seasons:
    other_cnt = preprocessed_data.loc[preprocessed_data['season'] == other_season, 'cnt']
    # Welch's t-test (equal_var=False): equal group variances are not assumed.
    welch = stats.ttest_ind(reference_cnt, other_cnt, equal_var=False, alternative='greater')
    season_rows.append({
        'Contrast': 'season',
        'A': reference_season,
        'B': other_season,
        'T': welch.statistic,
        'alternative': 'greater',
        'p-unc': welch.pvalue,
        # Bonferroni: multiply by the number of comparisons in this family, cap at 1.
        'p-corr': min(welch.pvalue * len(other_seasons), 1.0),
        'p-adjust': 'bonf',
    })

pd.DataFrame(season_rows)

Unnamed: 0,Contrast,A,B,Paired,Parametric,T,dof,alternative,p-unc,p-corr,p-adjust,BF10,hedges
0,season,fall,spring,False,True,-2.371935,8637.772532,greater,0.9911415,1.0,bonf,0.804,-0.051009
1,season,fall,summer,False,True,-9.116126,8723.490771,greater,1.0,1.0,bonf,2.429e-17,-0.194773
2,season,fall,winter,False,True,26.150058,7272.755861,greater,1.9211510000000002e-144,1.152691e-143,bonf,1.965e+141,0.568365
3,season,spring,summer,False,True,-6.763074,8895.579871,greater,1.0,1.0,bonf,2.693e-09,-0.143264
4,season,spring,winter,False,True,28.799248,7492.917888,greater,2.243429e-173,1.346057e-172,bonf,2.753e+170,0.614233
5,season,summer,winter,False,True,35.988241,7453.97039,greater,5.849364e-262,3.509619e-261,bonf,4.183e+260,0.759869


### Working days tend to have more registered rides than Weekend days.
- H_0 : weekend_days_registered >= working_days_registered
- H_A : working_days_registered > weekend_days_registered

In [None]:
# Monday-Friday count as working days; every other weekday value is treated as weekend.
working_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
condition = preprocessed_data['weekday'].isin(working_days)

# Registered ride counts split by the working-day mask.
working_days_data = preprocessed_data.loc[condition, 'registered']
weekend_days_data = preprocessed_data.loc[~condition, 'registered']

# One-sided independent-samples t-test; H_A: working days > weekend days.
statistic, p_value = stats.ttest_ind(
    a=working_days_data,
    b=weekend_days_data,
    alternative='greater',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : 16.004, p-value: 0.000


### Weekend days tend to have more Casual Rides than Working Days.
- H_0 : working_days_casual >= weekend_days_casual
- H_A : weekend_days_casual > working_days_casual

In [None]:
# Saturday and Sunday count as weekend; every other weekday value is a working day.
weekend_days = ['Saturday', 'Sunday']
condition = preprocessed_data['weekday'].isin(weekend_days)

# Casual ride counts split by the weekend mask.
weekend_days_data = preprocessed_data.loc[condition, 'casual']
working_days_data = preprocessed_data.loc[~condition, 'casual']

# One-sided independent-samples t-test; H_A: weekend days > working days.
statistic, p_value = stats.ttest_ind(
    a=weekend_days_data,
    b=working_days_data,
    alternative='greater',
)

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : 41.077, p-value: 0.000


- There is a large peak in the number of Registered rides at hours (7, 8, 9) AM and (4, 5, 6, 7) PM on working days
    - H_0 : other_hours >= peek_hours
    - H_A : peek_hours > other_hours

In [None]:
# Commute hours expected to show the registered-ride peak (24-hour clock).
peek_hours = [7, 8, 9, 16, 17, 18, 19]

# NOTE(review): the hypothesis text mentions working days, but no weekday filter
# is applied here — confirm whether weekend rows should be excluded.
condition = preprocessed_data['hr'].isin(peek_hours)
peek_hours_rides = preprocessed_data[condition].registered
# Draw an equally sized sample of off-peak rows; the fixed seed keeps the
# reported statistic reproducible across kernel restarts.
other_hours_rides = preprocessed_data[~condition].registered.sample(
    n=peek_hours_rides.shape[0],
    random_state=42,
)

# Compare the ride counts themselves (not the list of hour numbers), in the
# direction stated by H_A: peek_hours > other_hours.
statistic, p_value = stats.ttest_ind(peek_hours_rides, other_hours_rides, alternative='greater')

print(f'statistic is : {statistic:.003f}, p-value: {p_value:.003f}')

statistic is : -2.492, p-value: 0.006


### Testing Correlation Strength and Direction, Using the **scipy.stats.linregress** function.
    - It turned out that all of the correlations truly exist.
1. temp has a moderate Positive Correlation with Both (Registered, Casual) rides.
2. atemp has a moderate Positive Correlation with Both (Registered, Casual) rides.
3. hum has a moderate Negative Correlation with Both (Registered, Casual) rides.
4. windspeed has a weak Positive Correlation with Both (Registered, Casual) rides.

In [None]:
from scipy.stats import linregress


def correlation_test(cols):
    """Print linear-regression summaries of each column against both ride counts.

    For every name in ``cols`` the column is regressed (via ``linregress``)
    first against ``registered`` and then against ``casual`` in the
    module-level ``preprocessed_data`` frame. A header line and one line per
    target (slope, Pearson r, p-value, all to three decimals) are printed.

    Parameters
    ----------
    cols : iterable of str
        Column names of ``preprocessed_data`` to use as the predictor.

    Returns
    -------
    None
        Results are only printed.
    """
    frame = preprocessed_data
    for feature in cols:
        predictor = frame[feature]

        # Registered fit is computed before the header is printed, so a bad
        # column name fails without emitting partial output.
        registered_fit = linregress(x=predictor, y=frame['registered'])
        print(f'Testing Correlation between {feature} and Registered and Casual Columns respectively')
        print(f'Slope: {registered_fit.slope:.003f}, Pearson correlation-value: {registered_fit.rvalue:.003f}, P-value: {registered_fit.pvalue:.003f}')

        casual_fit = linregress(x=predictor, y=frame['casual'])
        print(f'Slope: {casual_fit.slope:.003f}, Pearson correlation-value: {casual_fit.rvalue:.003f}, P-value: {casual_fit.pvalue:.003f} \n\n')

- H_0 : there is no correlation between each column and the targets (registered, casual)
- H_A : there is a correlation between each column and the targets (registered, casual)

In [None]:
# Numeric weather features to test against the registered and casual ride counts.
cols = [
    'temp',
    'atemp',
    'hum',
    'windspeed',
]

correlation_test(cols)

Testing Correlation between temp and Registered and Casual Columns respectively
Slope: 263.608, Pearson correlation-value: 0.335, P-value: 0.000
Slope: 117.687, Pearson correlation-value: 0.460, P-value: 0.000 


Testing Correlation between atemp and Registered and Casual Columns respectively
Slope: 292.901, Pearson correlation-value: 0.333, P-value: 0.000
Slope: 130.279, Pearson correlation-value: 0.454, P-value: 0.000 


Testing Correlation between hum and Registered and Casual Columns respectively
Slope: -2.149, Pearson correlation-value: -0.274, P-value: 0.000
Slope: -0.887, Pearson correlation-value: -0.347, P-value: 0.000 


Testing Correlation between windspeed and Registered and Casual Columns respectively
Slope: 1.520, Pearson correlation-value: 0.082, P-value: 0.000
Slope: 0.543, Pearson correlation-value: 0.090, P-value: 0.000 


