# Ford's GoBike: Data Exploration

TODO Introduction

## Table of Contents

TODO Table of Contents

## Environment

In [None]:
from datetime import date
import folium
from folium import Marker
from folium.plugins import MarkerCluster
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import requests
import seaborn as sns

In [None]:
default_viz_palette = 'YlOrBr'
default_viz_colour = sns.color_palette(default_viz_palette)[-3]
sns.set_context('notebook', font_scale=1.25)
sns.despine()
%matplotlib inline

## Preliminary Wrangling

### Gathering

In [None]:
data_url = "https://video.udacity-data.com/topher/2020/October/5f91cf38_201902-fordgobike-tripdata/201902-fordgobike-tripdata.csv"

# A timeout keeps the cell from hanging forever on a dead connection, and
# raise_for_status() turns an HTTP error response into an exception here
# instead of letting an error page be saved as if it were the CSV.
data_http_response = requests.get(data_url, timeout=60)
data_http_response.raise_for_status()
data_http_response

In [None]:
data_path = "..//data//"

In [None]:
# Create the target folder on first run; open() fails if the directory is missing.
os.makedirs(data_path, exist_ok=True)

# Write the raw bytes exactly as served. Going through `.text` and text mode
# re-encodes the payload with the platform's default encoding and applies
# newline translation, which can alter (or fail to write) the CSV.
with open(os.path.join(data_path, "201902-fordgobike-tripdata.csv"), "wb") as data_file:
    data_file.write(data_http_response.content)

In [None]:
df = pd.read_csv(os.path.join(data_path, "201902-fordgobike-tripdata.csv"))

### Assessing

<div class="alert alert-info">
    <ul>
        <li><code>member_birth_year</code> should be <b>integer</b>.</li>
        <li><code>member_birth_year</code> also has some "extreme" values (e.g. 1878).</li>
        <li><code>user_type</code> and <code>member_gender</code> should be <b>categorical</b>.</li>
        <li><code>bike_share_for_all_trip</code> could be <b>boolean</b>.</li>
        <li><b>Missing values</b> in <code>member_birth_year</code>, <code>member_gender</code>, <code>start_station_name</code>, and <code>end_station_name</code>.</li>
        <li><code>start_station_id</code>, <code>end_station_id</code>, and <code>bike_id</code> will not be used.</li>
    </ul>
</div>

#### General

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

#### Specific Columns

##### Start and End Times

In [None]:
df.start_time.min(), df.start_time.max()

In [None]:
df.end_time.min(), df.end_time.max()

##### Station Names

In [None]:
df.start_station_name.isnull().sum()

In [None]:
df.end_station_name.isnull().sum()

##### User Type

In [None]:
df.user_type.value_counts()

In [None]:
df.user_type.isnull().sum()

##### Birth Years

In [None]:
df.member_birth_year.sort_values().unique()

In [None]:
df.member_birth_year.isnull().sum()

##### Genders

In [None]:
df.member_gender.value_counts()

In [None]:
df.member_gender.isnull().sum()

##### Bike for entire trip

In [None]:
df.bike_share_for_all_trip.value_counts()

### Cleaning

In [None]:
df_clean = df.copy()

#### Missing data

##### Define
Since our analysis will focus on understanding GoBike members' profiles and behaviour, we'll drop the rows without information on their birth year and gender.

As for the station names: since the dataset already contains the start and end coordinates, the missing names don't have a significant impact on our work. Therefore, we're not treating these cases here.

##### Code

In [None]:
df_clean = df_clean.dropna(subset=['member_birth_year', 'member_gender']).reset_index(drop=True)

##### Test

In [None]:
df_clean.info()

#### Dropping unused columns

##### Define

As mentioned in the assessing step, ID columns won't be used. We'll just drop them from the DataFrame.

##### Code

In [None]:
df_clean.drop(columns=['start_station_id', 'end_station_id', 'bike_id'], inplace=True)

##### Test

In [None]:
df_clean.info()

#### Wrong data types

##### Define

- Start and end dates are to be converted into `datetime`;
- Birth years will become `int`;
- User type and gender will be converted into unordered `CategoricalDtype`;
- Bike used for entire trip must be `bool`.

##### Code

###### `datetime` 

In [None]:
# start and end times must be datetime
datetime_vars = ['start_time', 'end_time']

In [None]:
# parse every timestamp column and put the converted versions back in place
parsed_times = {column: pd.to_datetime(df_clean[column]) for column in datetime_vars}
df_clean = df_clean.assign(**parsed_times)

###### `int` 

In [None]:
# member's birth years should be integer, instead of float
df_clean['member_birth_year'] = df_clean['member_birth_year'].astype(int)

###### `CategoricalDtype`

In [None]:
# user type and member gender should be unordered categorical variables;
# each column maps to the distinct values currently present in the cleaned data
unordered_cat_vars = {
    'user_type': list(df_clean.user_type.unique()),
    'member_gender': list(df_clean.member_gender.unique())
}

In [None]:
# cast each column to an unordered categorical restricted to its listed values
for column, categories in unordered_cat_vars.items():
    column_dtype = pd.api.types.CategoricalDtype(categories=categories, ordered=False)
    df_clean[column] = df_clean[column].astype(column_dtype)

###### `bool`

In [None]:
# Vectorised comparison: "Yes" becomes True, every other value (missing ones included) becomes False
df_clean['bike_share_for_all_trip'] = df_clean['bike_share_for_all_trip'] == "Yes"

##### Test

In [None]:
df_clean.info()

#### Filtering birth years

##### Define
We're defining members' generations according to [these criteria](https://www.beresfordresearch.com/age-range-by-generation/), so only members born in 1922 or later will remain.

##### Code

In [None]:
# Keep members born in 1922 or later. Take an explicit copy so the column
# assignments and in-place drops that follow operate on an independent frame
# (no SettingWithCopyWarning), and renumber the rows so the index has no gaps.
born_1922_or_later = df_clean.member_birth_year >= 1922
df_clean = df_clean[born_1922_or_later].copy().reset_index(drop=True)

##### Test

In [None]:
df_clean.member_birth_year.min()

### Feature Engineering

#### Converting duration to minutes

##### Define
We'll calculate the number of whole minutes every trip lasted, so the trip duration is more readable and interpretable.

##### Code

In [None]:
def convert_sec_to_min(duration_sec):
    """Return the number of whole minutes contained in `duration_sec`.

    The leftover seconds are discarded (floor division by 60). The argument
    can be anything that supports `// 60`: a plain number or a whole pandas
    Series / NumPy array, in which case the result is element-wise.
    """
    return duration_sec // 60

In [None]:
# the helper works on a whole Series at once, so there is no need to call it row by row
df_clean['duration_min'] = convert_sec_to_min(df_clean['duration_sec'])

In [None]:
df_clean.drop(columns=['duration_sec'], inplace=True)

##### Test

In [None]:
df_clean.head()

#### Extracting more info on the dates

##### Define

We're extracting time of the day and weekday for each start and end time.

##### Code

###### Start Time

In [None]:
start_parts = {
    'start_time': df_clean.start_time,
    'start_date': pd.to_datetime(df_clean.start_time.dt.date),
    'start_hour': df_clean.start_time.dt.hour,
    'start_weekday': df_clean.start_time.dt.day_name()
}

In [None]:
start_df = pd.DataFrame(start_parts)

In [None]:
start_df.head()

In [None]:
# start_df was built from df_clean's own columns, so both frames share the same
# row index: attach the new columns by index. Merging on `start_time` is a
# many-to-many join whenever two trips share a timestamp and would duplicate
# those trips. The index is renumbered afterwards, as the merge used to do.
df_clean = df_clean.join(start_df.drop(columns='start_time')).reset_index(drop=True)

###### End time

In [None]:
end_parts = {
    'end_time': df_clean.end_time,
    'end_date': pd.to_datetime(df_clean.end_time.dt.date),
    'end_hour': df_clean.end_time.dt.hour,
    'end_weekday': df_clean.end_time.dt.day_name()
}

In [None]:
end_df = pd.DataFrame(end_parts)

In [None]:
end_df.head()

In [None]:
# end_df was built from df_clean's own columns, so both frames share the same
# row index: attach the new columns by index. Merging on `end_time` is a
# many-to-many join whenever two trips share a timestamp and would duplicate
# those trips. The index is renumbered afterwards, as the merge used to do.
df_clean = df_clean.join(end_df.drop(columns='end_time')).reset_index(drop=True)

###### Weekdays

In [None]:
# calendar order with the week starting on Sunday
weekdays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
weekday_var = pd.api.types.CategoricalDtype(categories=weekdays, ordered=True)

# both weekday columns share the same ordered dtype
for weekday_column in ('start_weekday', 'end_weekday'):
    df_clean[weekday_column] = df_clean[weekday_column].astype(weekday_var)

##### Test

In [None]:
df_clean.head()

In [None]:
df_clean.info()

#### Getting people's generations

##### Define
First, we create bins for each generation based on birth years. After that, we'll create a categorical variable and assign each member their generation according to the bin they belong to.

##### Code

In [None]:
# generation labels, oldest first (these strings end up as axis and legend labels)
gen_labels = ['WW II', 'Post War', 'Boomers I', 'Boomers II', 'Gen X', 'Millennials', 'Gen Z']
# birth-year bin edges: 8 edges delimit the 7 generations listed above
gen_birth_years = [1922, 1927, 1945, 1954, 1964, 1980, 1996, 2012]

In [None]:
df_clean['member_generation'] = pd.cut(df_clean['member_birth_year'], gen_birth_years, labels=gen_labels, include_lowest=True, right=True)

In [None]:
member_gen_var = pd.api.types.CategoricalDtype(ordered=True, categories=gen_labels)
df_clean['member_generation'] = df_clean['member_generation'].astype(member_gen_var)

##### Test

In [None]:
df_clean.head()

In [None]:
for gen in df_clean.member_generation.cat.categories:
    years = df_clean.query("member_generation == @gen")['member_birth_year'].sort_values().unique()
    print(gen, years)

#### Getting users' ages

##### Define

Members' estimated age will be calculated in relation to the current year.

##### Code

In [None]:
# NOTE(review): the age is measured against the year in which this cell is run, not the
# year of the trip, so every age (and any statistic built on it) grows by one each
# calendar year — confirm this is the intended definition.
df_clean['member_age'] = date.today().year - df_clean['member_birth_year']

##### Test

In [None]:
df_clean.head()

### More on the Dataset

In [None]:
df_clean.info()

#### Structure

The clean version of our dataset has **175,164 trips with 19 features**, as seen above.

We've got a pretty fair balance in the amount of **categorical** and **numerical** variables.

#### Main Features of Interest

This exploration aims to find the best variables for predicting the **trip duration**.

For now, we may expect weekdays, members' ages, and time of day to have considerable effect on trip duration.

## Univariate Exploration

### Trip Duration

In [None]:
df_clean.duration_min.describe(percentiles=[.25, .5, .75, .99])

<div class="alert alert-success">
    <b>About 99%</b> of the trips <b>lasted less than an hour</b>. We'll focus our analyses on these.
</div>

In [None]:
# define our figure
plt.figure(figsize=(15, 10))

# bins: one per whole minute from 0 to 59, with the edges on the half-minutes
# so that every integer duration gets its own bar. With edges at 0..59 the last
# bin is [58, 59] (closed on both sides), which lumps 58- and 59-minute trips together.
dur_bins = np.arange(0, 61, 1) - 0.5

# plotting
sns.histplot(df_clean, x='duration_min', bins=dur_bins, color=default_viz_colour)

# final touches
plt.xlabel('Trip Duration (min)')
plt.ylabel('# of Trips')
plt.show()

Since this variable's distribution is positively skewed, let's try a logarithmic scale, so we're able to better visualise any trends.

In [None]:
np.log10(df_clean.duration_min).describe()

In [None]:
dur_binsize_log = 0.1
dur_bins_log = 10 ** np.arange(0, np.log10(df_clean.duration_min.max()) + dur_binsize_log, dur_binsize_log)

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
p = sns.histplot(df_clean, x='duration_min', bins=dur_bins_log, color=default_viz_colour)

# scaling
p.set_xscale('log')

# x-labels
p_xticks = [1, 3, 5, 10, 15, 20, 30, 45, 60]
p.set_xticks(p_xticks)
p.set_xticklabels(p_xticks)

# finishing touches
p.set_xlabel('Trip Duration (min)')
p.set_ylabel('# of Trips')
plt.show()

<div class="alert alert-success">
    <ul>
        <li>An <b>average trip</b> lasts <b>11 minutes</b>.</li>
        <li>Most of the trips last <b>between 4 and 15 minutes</b>, hinting at the public's preference for trips that are shorter in time, and possibly in distance. Unfortunately, we have no information on the distance travelled to carry out such an investigation.</li>
    </ul>
</div>

#### Longer Trips

Let's take a deeper look into trips that last more than an hour.

In [None]:
df_longer = df_clean.query("duration_min > 60").reset_index(drop=True)

In [None]:
df_longer.shape[0]

In [None]:
df_longer.describe(percentiles=[.25, .5, .75, .90, .95, .99])

In [None]:
# longer trips whose start and end fall on different calendar dates
ends_on_other_day = df_longer.start_date != df_longer.end_date
df_longer.loc[ends_on_other_day, 'start_date'].count()

<div class="alert alert-success">
    <ul>
        <li><b>Longer trips</b> last 193 minutes (or <b>3 hours and 13 minutes</b>) on <b>average</b>. However, it's important to notice the <i>huge standard deviation and value range</i>, going from 61 minutes to about 1400 minutes.</li>
        <li><b>75%</b> of these trips last <b>between 1 and 3 hours</b>.</li>
        <li><b>Most</b> of them <b>start and end in the same day</b>.</li>
    </ul>
</div>

### Weekday and Time of Day

In [None]:
# 2x2 grid: start on the top row, end on the bottom row; weekday left, hour right
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(30, 20))

# (column to count, panel title), listed row by row to match ax.flat
panels = [
    ('start_weekday', 'Start Weekday'),
    ('start_hour', 'Start Hour'),
    ('end_weekday', 'End Weekday'),
    ('end_hour', 'End Hour'),
]

# plotting and titling each panel
for panel_ax, (column, panel_title) in zip(ax.flat, panels):
    sns.countplot(data=df_clean, x=column, ax=panel_ax, color=default_viz_colour)
    panel_ax.set_title(panel_title)

# overall title, and no axis labels on any panel
fig.suptitle("Trips per Weekday and Time of Day", fontsize=20)
plt.setp(ax, xlabel=None, ylabel=None)

# finishing touches
plt.show()

<div class="alert alert-success">
    <ul>
        <li>The public seems to prefer bike trips on <b>Tuesdays</b> and <b>Thursdays</b>.</li>
        <li><b>Most trips</b> happen on <b>workdays, between 8 am and 6 pm</b>, with the highest numbers of bikers starting and ending their trips during rush hours, indicating the clients prefer to use GoBike services for their daily commutes.</li>
    </ul>
</div>

#### Longer Trips

In [None]:
# 2x2 grid: start on the top row, end on the bottom row; weekday left, hour right
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(30, 20))

# (column to count, panel title), listed row by row to match ax.flat
panels = [
    ('start_weekday', 'Start Weekday'),
    ('start_hour', 'Start Hour'),
    ('end_weekday', 'End Weekday'),
    ('end_hour', 'End Hour'),
]

# plotting and titling each panel
for panel_ax, (column, panel_title) in zip(ax.flat, panels):
    sns.countplot(data=df_longer, x=column, ax=panel_ax, color=default_viz_colour)
    panel_ax.set_title(panel_title)

# overall title, and no axis labels on any panel
fig.suptitle("Longer Trips per Weekday and Time of Day", fontsize=20)
plt.setp(ax, xlabel=None, ylabel=None)

# finishing touches
plt.show()

<div class="alert alert-success">
    <b>Most longer trips</b> happen on <b>weekends</b>, during the <b>afternoon</b>.
</div>

### Generations

In [None]:
# defining our figure
fig, ax = plt.subplots(ncols=2, figsize=(30, 10))

# plotting
sns.countplot(data=df_clean, x='member_generation', ax=ax[0], color=default_viz_colour)
sns.countplot(data=df_longer, x='member_generation', ax=ax[1], color=default_viz_colour)

# titles
fig.suptitle("Trips by Generation", fontsize=20)
ax[0].set_title('All Trips')
ax[1].set_title('Longer Trips (1+ Hour)')

# labels
plt.setp(ax, xlabel=None, ylabel=None)

# finishing touches
plt.show()

<div class="alert alert-success">
    The vast majority of trips were made by <b>Millennials</b>, born between 1981 and 1996.
</div>

### Gender

In [None]:
# defining our figure
fig, ax = plt.subplots(ncols=2, figsize=(30, 10))

# plotting
sns.countplot(data=df_clean, x='member_gender', ax=ax[0], color=default_viz_colour, order=df_clean.member_gender.value_counts().index)
sns.countplot(data=df_longer, x='member_gender', ax=ax[1], color=default_viz_colour, order=df_longer.member_gender.value_counts().index)

# titles
fig.suptitle("Trips by Gender", fontsize=20)
ax[0].set_title('All Trips')
ax[1].set_title('Longer Trips (1+ Hour)')

# labels
plt.setp(ax, xlabel=None, ylabel=None)

# finishing touches
plt.show()

In [None]:
df_clean.member_gender.value_counts(normalize=True)

<div class="alert alert-success">
    <b>74%</b> of the trips were made by clients who identify as <b>male</b>, <b>23%</b> by clients who identify as female, and only <b>2%</b> by clients of other genders.
</div>

### User Type

In [None]:
# defining our figure
fig, ax = plt.subplots(ncols=2, figsize=(30, 10))

# plotting
sns.countplot(data=df_clean, x='user_type', ax=ax[0], color=default_viz_colour, order=df_clean.user_type.value_counts().index)
sns.countplot(data=df_longer, x='user_type', ax=ax[1], color=default_viz_colour, order=df_longer.user_type.value_counts().index)

# titles
fig.suptitle("Trips by User Type", fontsize=20)
ax[0].set_title('All Trips')
ax[1].set_title('Longer Trips (1+ Hour)')

# labels
plt.setp(ax, xlabel=None, ylabel=None)

# finishing touches
plt.show()

<div class="alert alert-success">
    <ul>
        <li>We can see there are <b>way more subscribers</b> than customers.</li>
        <li>For <b>longer trips</b>, there's a <b>balance between customers and subscribers</b>.</li>
    </ul>
</div>

### Age

In [None]:
# defining our figure
fig, ax = plt.subplots(ncols=2, figsize=(30, 10))

# bins: one per year of age. The upper edge follows the data (never below the
# previous fixed limit of 100) so that no rider falls outside the histogram
# range and gets silently dropped from the plot.
age_bins = np.arange(0, max(100, df_clean.member_age.max() + 2), 1)

# plotting
sns.histplot(data=df_clean, x='member_age', ax=ax[0], bins=age_bins, color=default_viz_colour)
sns.histplot(data=df_longer, x='member_age', ax=ax[1], bins=age_bins, color=default_viz_colour)

# titles
fig.suptitle("Trips by Users' Age", fontsize=20)
ax[0].set_title('All Trips')
ax[1].set_title('Longer Trips (1+ Hour)')

# labels
plt.setp(ax, xlabel=None, ylabel=None)

# finishing touches
plt.show()

In [None]:
df_clean.member_age.describe()

In [None]:
df_longer.member_age.describe()

Once again, we have a variable with positively skewed distribution. As we've done in previous analyses, we'll change to a logarithmic scale.

In [None]:
np.log10(df_clean.member_age).describe()

In [None]:
age_binsize_log = 0.03

In [None]:
age_overall_bins_log = 10 ** np.arange(1.32, np.log10(df_clean.member_age.max()) + age_binsize_log, age_binsize_log)

In [None]:
np.log10(df_longer.member_age).describe()

In [None]:
age_longer_bins_log = 10 ** np.arange(1.32, np.log10(df_longer.member_age.max()) + age_binsize_log, age_binsize_log)

In [None]:
# defining our figure
fig, ax = plt.subplots(ncols=2, figsize=(30, 10))

# plotting
sns.histplot(data=df_clean, x='member_age', ax=ax[0], bins=age_overall_bins_log, color=default_viz_colour)
sns.histplot(data=df_longer, x='member_age', ax=ax[1], bins=age_longer_bins_log, color=default_viz_colour)

# scaling
ax[0].set_xscale('log')
ax[1].set_xscale('log')

# x-labels
ax_xticks = [20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80]

ax[0].set_xticks(ax_xticks)
ax[0].set_xticklabels(ax_xticks)
ax[1].set_xticks(ax_xticks)
ax[1].set_xticklabels(ax_xticks)

# titles
fig.suptitle("Trips by Users' Age (Logarithmic Scale)", fontsize=20)
ax[0].set_title('All Trips')
ax[1].set_title('Longer Trips (1+ Hour)')

# labels
plt.setp(ax, xlabel=None, ylabel=None)

# finishing touches
plt.show()

<div class="alert alert-success">
    <ul>
        <li>The <b>average</b> age is <b>37 years old</b>.</li>
        <li>Also in both scenarios, <b>75%</b> of the trips belong to people <b>under 45 years old</b>.</li>
    </ul>
</div>

### Stations

In [None]:
# value_counts() sorts by frequency (most frequent first), so the first five rows are the top five
start_station_counts = df_clean.start_station_name.value_counts()
end_station_counts = df_clean.end_station_name.value_counts()

top5_start_stations = start_station_counts.head(5)
top5_end_stations = end_station_counts.head(5)

In [None]:
# defining our figure
fig, ax = plt.subplots(nrows=2, figsize=(15, 20))

# plotting
sns.barplot(x=top5_start_stations.values, y=top5_start_stations.index, ax=ax[0], color=default_viz_colour)
sns.barplot(x=top5_end_stations.values, y=top5_end_stations.index, ax=ax[1], color=default_viz_colour)

# titles
ax[0].set_title('Start Stations')
ax[1].set_title('End Stations')

# labels
plt.setp(ax, xlabel=None, ylabel=None)

# finishing touches
plt.show()

<div class="alert alert-success">
    <ul>
        <li>Out of the 5 most popular start stations, <b>three of them are public transport stations</b>.</li>
        <li>Two of the top end stations, <b>Market St and Ferry Building</b>, are <b>famous San Francisco tourist attractions</b>.</li>
    </ul>
</div>

### Start and End Coordinates
This map is interactive! If you're seeing the HTML version of this Notebook, you can drag this map and click on each bubble to expand the starting points.

In [None]:
# creating a basic map
sf_map = folium.Map(location=[37.595917, -122.193127], tiles='cartodbpositron', zoom_start=10)

# adding start station points to the map (one marker per trip)
sf_markers = MarkerCluster()

# walk the two coordinate columns directly: iterrows() builds a full Series
# for every row only to read two values out of it
start_coords = zip(df_clean['start_station_latitude'], df_clean['start_station_longitude'])
for latitude, longitude in start_coords:
    sf_markers.add_child(Marker([latitude, longitude]))

sf_map.add_child(sf_markers)

# showing the map
sf_map

<div class="alert alert-success">
    <ul>
        <li>All of our trips spread across <b>San Francisco</b>, <b>Berkeley</b>, <b>Oakland</b>, <b>San Jose</b>, and other areas nearby.</li>
        <li>The <b>vast majority</b> of the trips happened in <b>San Francisco</b>, followed by <b>Oakland</b> and <b>Berkeley</b>.</li>
    </ul>
</div>

## Bivariate Exploration

In [None]:
numeric_vars = ['duration_min', 'start_hour', 'end_hour', 'member_age']
categorical_vars = ['user_type', 'member_generation', 'member_gender', 'bike_share_for_all_trip', 'start_weekday', 'end_weekday']

### Numeric vs Numeric

We can use dates to find correlations with other numeric variables as well. To do that, we must convert dates to `numeric` data type, as below:

In [None]:
df_clean['start_date_num'] = pd.to_numeric(df_clean['start_date'])
df_clean['end_date_num'] = pd.to_numeric(df_clean['end_date'])

In [None]:
numeric_vars.append('start_date_num')
numeric_vars.append('end_date_num')

Checking if it worked:

In [None]:
df_clean.head()

In [None]:
numeric_vars

#### Plot Matrix

We'll use a 500-row sample to simplify our plot.

In [None]:
# draw 500 distinct row positions; they are positions rather than index labels,
# so select with .iloc (.loc only works while the index is a gap-free 0..n-1 range)
num_sample_idx = np.random.choice(a=df_clean.shape[0], size=500, replace=False)
num_sample = df_clean.iloc[num_sample_idx, :]

In [None]:
# plotting
g = sns.PairGrid(data=num_sample, vars=numeric_vars)
g = g.map_diag(sns.histplot, bins=30, color=default_viz_colour)
g.map_offdiag(sns.scatterplot, color=default_viz_colour)

# finishing touches
plt.show()

#### Correlation Plot

In [None]:
# defining our figure
plt.figure(figsize=(10, 8))

# plotting
sns.heatmap(data=df_clean[numeric_vars].corr(), annot=True, center=0, cmap=default_viz_palette)

# finishing touches
plt.show()

<div class="alert alert-success">
    Judging by the two visualisations above, the only strong correlation found is between <b>start and end hours</b>, which is completely expected. The other combinations didn't show any significant correlation.
</div>

### Categorical vs Numeric
Over the next few plots, we're working with bigger samples (3k rows), since they're visually simpler.

We'll also separate out the longer trips and investigate them on their own, as done in our univariate exploration.

In [None]:
df_one_hour = df_clean.query("duration_min <= 60").reset_index(drop=True)

In [None]:
# draw 3000 distinct row positions; they are positions rather than index labels,
# so select with .iloc instead of relying on the index being a 0..n-1 range
cat_general_sample_idx = np.random.choice(a=df_one_hour.shape[0], size=3000, replace=False)
cat_general_sample = df_one_hour.iloc[cat_general_sample_idx, :]

In [None]:
# sample up to 500 longer trips: capping the size at the number of available rows
# avoids the ValueError np.random.choice raises when asked for more unique draws than exist
cat_long_sample_size = min(500, df_longer.shape[0])
cat_long_sample_idx = np.random.choice(a=df_longer.shape[0], size=cat_long_sample_size, replace=False)
# the drawn values are row positions, hence .iloc
cat_long_sample = df_longer.iloc[cat_long_sample_idx, :]

#### **Trip Duration and Start Hour** vs **Gender, Generation, and User Type**

##### Up to 1 Hour

In [None]:
# plotting: PairGrid creates its own figure (sized through `height`), so no
# plt.figure() call is needed here; one would only produce an extra, empty figure
g = sns.PairGrid(data=cat_general_sample, y_vars=['duration_min', 'start_hour'], x_vars=['member_gender', 'member_generation', 'user_type'], height=10)
g.map(sns.violinplot, color=default_viz_colour, cut=0)

# finishing touches
plt.show()

##### More than 1 Hour

In [None]:
# plotting: PairGrid creates its own figure (sized through `height`), so no
# plt.figure() call is needed here; one would only produce an extra, empty figure
g = sns.PairGrid(data=cat_long_sample, y_vars=['duration_min', 'start_hour'], x_vars=['member_gender', 'member_generation', 'user_type'], height=10)
g.map(sns.violinplot, color=default_viz_colour, cut=0)

# finishing touches
plt.show()

<div class="alert alert-success">
    blablabla
</div>

#### **Weekday** vs **Start Hour and Trip Duration**

##### Up to 1 Hour

In [None]:
# plotting: PairGrid creates its own figure (sized through `height`), so no
# plt.figure() call is needed here; one would only produce an extra, empty figure
g = sns.PairGrid(data=cat_general_sample, y_vars=['duration_min', 'start_hour'], x_vars=['start_weekday'], height=10)
g.map(sns.violinplot, color=default_viz_colour, cut=0)

# finishing touches
plt.show()

##### More than 1 Hour

In [None]:
# plotting: PairGrid creates its own figure (sized through `height`), so no
# plt.figure() call is needed here; one would only produce an extra, empty figure
g = sns.PairGrid(data=cat_long_sample, y_vars=['duration_min', 'start_hour'], x_vars=['start_weekday'], height=10)
g.map(sns.violinplot, color=default_viz_colour, cut=0)

# finishing touches
plt.show()

<div class="alert alert-success">
    blablabla
</div>

### Categorical vs Categorical

#### **User Type** vs **Weekdays**

##### Up to 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
fig = sns.countplot(data=df_clean, x='start_weekday', hue='user_type', palette=default_viz_palette)

# finishing touches
plt.title("Trips per Starting Weekday")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="User Type")
plt.show()

##### More than 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
fig = sns.countplot(data=df_longer, x='start_weekday', hue='user_type', palette=default_viz_palette)

# finishing touches
plt.title("Longer Trips per Starting Weekday")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="User Type")
plt.show()

<div class="alert alert-success">
    blablabla
</div>

#### **Generations** vs **Weekdays**

##### Up to 1 Hour

In [None]:
# plotting w faceting
grid = sns.FacetGrid(data=df_clean, col='member_generation', sharex=False, sharey=False, col_wrap=3, height=8)
# pass the weekday order explicitly so every facet lays its bars out identically
# (seaborn warns when a categorical plot is mapped onto a FacetGrid without `order`)
grid.map(sns.countplot, 'start_weekday', order=weekdays, palette=default_viz_palette)

# finishing touches
grid.set_axis_labels("", "")
grid.fig.subplots_adjust(top=0.95)
grid.fig.suptitle("Generations vs Weekdays")
plt.show()

##### More than 1 Hour

In [None]:
# plotting w faceting
grid = sns.FacetGrid(data=df_longer, col='member_generation', sharex=False, sharey=False, col_wrap=3, height=8)
# pass the weekday order explicitly so every facet lays its bars out identically
# (seaborn warns when a categorical plot is mapped onto a FacetGrid without `order`)
grid.map(sns.countplot, 'start_weekday', order=weekdays, palette=default_viz_palette)

# finishing touches
grid.set_axis_labels("", "")
grid.fig.subplots_adjust(top=0.95)
grid.fig.suptitle("Generations vs Weekdays (Longer Trips)")
plt.show()

<div class="alert alert-success">
    blablabla
</div>

#### **User Type** vs **Generation**

##### Up to 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
fig = sns.countplot(data=df_clean, x='member_generation', hue='user_type', palette=default_viz_palette)

# finishing touches
plt.title("Trips per Generation and User Type")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="User Type")
plt.show()

##### More than 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting: this is the "More than 1 Hour" view, so it draws from df_longer
# (it previously repeated the all-trips plot by reading df_clean)
fig = sns.countplot(data=df_longer, x='member_generation', hue='user_type', palette=default_viz_palette)

# finishing touches
plt.title("Trips per Generation and User Type (Longer Trips)")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="User Type")
plt.show()

<div class="alert alert-success">
    blablabla
</div>

#### **User Type** vs **Gender**

##### Up to 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
fig = sns.countplot(data=df_clean, x='member_gender', hue='user_type', palette=default_viz_palette)

# finishing touches
plt.title("Trips per Gender and User Type")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="User Type")
plt.show()

##### More than 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting: this is the "More than 1 Hour" view, so it draws from df_longer
# (it previously repeated the all-trips plot by reading df_clean)
fig = sns.countplot(data=df_longer, x='member_gender', hue='user_type', palette=default_viz_palette)

# finishing touches
plt.title("Trips per Gender and User Type (Longer Trips)")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="User Type")
plt.show()

<div class="alert alert-success">
    blablabla
</div>

#### **Generation** vs **Gender**

##### Up to 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
fig = sns.countplot(data=df_clean, x='member_generation', hue='member_gender', palette=default_viz_palette)

# finishing touches
plt.title("Trips per Gender and Generation")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="Gender")
plt.show()

##### More than 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting: this is the "More than 1 Hour" view, so it draws from df_longer
# (it previously repeated the all-trips plot by reading df_clean)
fig = sns.countplot(data=df_longer, x='member_generation', hue='member_gender', palette=default_viz_palette)

# finishing touches
plt.title("Trips per Gender and Generation (Longer Trips)")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="Gender")
plt.show()

<div class="alert alert-success">
    blablabla
</div>

#### **Generation** vs **Bike for Entire Trip**

##### Up to 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
fig = sns.countplot(data=df_clean, x='member_generation', hue='bike_share_for_all_trip', palette=default_viz_palette)

# finishing touches
plt.title("Trips according to Generation and Bike Usage")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="Bike for entire trip")
plt.show()

##### More than 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting: the title and section are about longer trips, so the data comes
# from df_longer (it previously read df_clean, i.e. all trips)
fig = sns.countplot(data=df_longer, x='member_generation', hue='bike_share_for_all_trip', palette=default_viz_palette)

# finishing touches
plt.title("Trips according to Generation and Bike Usage (Longer Trips)")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="Bike for entire trip")
plt.show()

<div class="alert alert-success">
    blablabla
</div>

#### **User Type** vs **Bike for Entire Trip**

##### Up to 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
fig = sns.countplot(data=df_clean, x='user_type', hue='bike_share_for_all_trip', palette=default_viz_palette)

# annotations: write the trip count, as a whole number, centred just above each bar.
# The gap is given in points, so it looks the same whatever the scale of the counts.
for p in fig.patches:
    height = p.get_height()
    if not height or np.isnan(height):
        # empty bar: nothing to label
        continue
    fig.annotate(f"{height:.0f}", (p.get_x() + p.get_width() / 2, height),
                 ha='center', va='bottom', xytext=(0, 3), textcoords='offset points')

# finishing touches
plt.title("Trips according to User Type and Bike Usage")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="Bike for entire trip")
plt.show()

##### More than 1 Hour

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
fig = sns.countplot(data=df_longer, x='user_type', hue='bike_share_for_all_trip', palette=default_viz_palette)

# annotations: write the trip count, as a whole number, centred just above each bar.
# The gap is given in points rather than as a fixed +150 in data units, which
# pushed the labels far away from the bars at the smaller counts of this subset.
for p in fig.patches:
    height = p.get_height()
    if not height or np.isnan(height):
        # empty bar: nothing to label
        continue
    fig.annotate(f"{height:.0f}", (p.get_x() + p.get_width() / 2, height),
                 ha='center', va='bottom', xytext=(0, 3), textcoords='offset points')

# finishing touches
plt.title("Trips according to User Type and Bike Usage (Longer Trips)")
plt.setp(fig, xlabel=None, ylabel=None)
plt.legend(title="Bike for entire trip")
plt.show()

<div class="alert alert-success">
    blablabla
</div>

## Multivariate Exploration

### **Generation** vs **Weekday** vs **Trip Duration**

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
ax = sns.pointplot(data=df_clean, x='start_weekday', y='duration_min', hue='member_generation', dodge=0.5, linestyles='', palette='RdPu')

# finishing touches
plt.title("Trip Duration across Generations and Weekdays")
plt.xlabel(None)
plt.yticks([5, 10, 15, 20])
plt.ylabel("Trip Duration (min)")
plt.legend(title="Generation")

plt.show()

<div class="alert alert-success">
    blablabla
</div>

### **Age** vs **Gender** vs **Trip Duration** (TODO)

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
ax = sns.pointplot(data=df_clean, x='member_generation', y='duration_min', hue='member_gender', dodge=0.5, linestyles='', palette='RdPu')

# finishing touches
plt.title("Trip Duration across Ages and Genders")
plt.xlabel(None)
# plt.yticks([5, 10, 15, 20])
plt.ylabel("Trip Duration (min)")
plt.legend(title="Gender")

plt.show()

<div class="alert alert-success">
    blablabla
</div>

### **Gender** vs **Weekday** vs **Trip Duration**

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
ax = sns.pointplot(data=df_clean, x='start_weekday', y='duration_min', hue='member_gender', dodge=0.5, linestyles='', palette='RdPu')

# finishing touches
plt.title("Trip Duration across Weekdays and Genders")
plt.xlabel(None)
# plt.yticks([5, 10, 15, 20])
plt.ylabel("Trip Duration (min)")
plt.legend(title="Gender")

plt.show()

<div class="alert alert-success">
    blablabla
</div>

### **Generation** vs **Weekday** vs **Start Hour**

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
ax = sns.pointplot(data=df_clean, x='start_weekday', y='start_hour', hue='member_generation', dodge=0.5, linestyles='', palette='RdPu')

# finishing touches
plt.title("Start Hour across Generations and Weekdays")
plt.xlabel(None)
plt.ylabel("Start Hour")
plt.legend(title="Generation")

plt.show()

<div class="alert alert-success">
    blablabla
</div>

### **Age** vs **Gender** vs **Start Hour** (TODO)

In [None]:
# defining our figure
plt.figure(figsize=(15, 10))

# plotting
ax = sns.pointplot(data=df_clean, x='member_generation', y='start_hour', hue='member_gender', dodge=0.5, linestyles='', palette=default_viz_palette)

# finishing touches
plt.title("Start Hour across Ages and Genders")
plt.xlabel(None)
plt.ylabel("Start Hour")
plt.legend(title="Gender")

plt.show()

<div class="alert alert-success">
    blablabla
</div>

## References

https://stackoverflow.com/questions/48242555/correllation-pandas-between-date-and-integer-timeseries