# Exploratory Analysis of Boris Bike trip data

**Import Libraries and Data and do quick inspection**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

sns.set_style("darkgrid")
%pylab inline

In [None]:
# Load the ride records stored under key 'df' of the HDF5 file.
data = pd.read_hdf('ride_data.hdf', key='df', encoding='utf-8')

# Preview the first few rows.
data.head()

In [None]:
# Column dtypes and (rows, columns) of the loaded frame.
# print(...) with a single argument works on both Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(data.dtypes)
print(data.shape)

**Filter out some records with obviously incorrect dates and set index on start_time**

In [None]:
# Keep only rides whose start AND end timestamps are on/after the cutoff,
# then index the frame by start_time in chronological order.
# NOTE: this cell rebinds `data`; after it runs, start_time is the index and
# no longer a column, so re-running the cell requires reloading `data` first.
cutoff = '2010-01-01'
has_valid_dates = (data.start_time >= cutoff) & (data.end_time >= cutoff)
data = data[has_valid_dates].set_index('start_time').sort_index()

**Function to help plotting**

In [None]:
def plot_rides(type = 'd', start = '2012-01-01', end = '2017-01-01', title = None):
    """Plot ride counts from the notebook-level ``data`` frame.

    Rows of ``data`` whose index falls in ``start``..``end`` are grouped by
    the chosen time granularity and the number of non-null ``rental_id``
    values per group is drawn as a line plot on a new 15x5 figure.

    Parameters
    ----------
    type : str
        Grouping code: 'd' (date), 'h' (hour of day), 'hm' (hour and minute),
        'dh' (date and hour) or 'dhm' (date, hour and minute).
    start, end : str
        Bounds used to slice ``data`` by its index (``data.loc[start:end]``).
    title : str or None
        Figure title. When None, a title is built from the grouping label
        and the start/end strings.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported grouping codes.

    Notes
    -----
    Relies on the global ``data`` having a datetime-like index (the code
    reads ``.date``, ``.hour`` and ``.minute`` from it) and a ``rental_id``
    column.
    """
    grouping = {'d':'Day', 'h':'Hour', 'hm':'Hour / Minute', 'dh':'Day / Hour', 'dhm':'Day / Hour / Minute'}

    # Validate before touching the data or creating a figure, so a bad code
    # fails with a clear error instead of an empty plot or a later KeyError.
    if type not in grouping:
        raise ValueError(
            'incorrect grouping passed! Expected one of %s, got %r'
            % (sorted(grouping), type)
        )

    # `.loc` replaces the removed `.ix` indexer; label slicing on the index
    # is what the original call relied on.
    plot_data = data.loc[start:end]

    # Lazily build the group keys so only the needed index attributes
    # are computed for the selected granularity.
    key_builders = {
        'd':   lambda idx: idx.date,
        'h':   lambda idx: idx.hour,
        'hm':  lambda idx: [idx.hour, idx.minute],
        'dh':  lambda idx: [idx.date, idx.hour],
        'dhm': lambda idx: [idx.date, idx.hour, idx.minute],
    }
    keys = key_builders[type](plot_data.index)

    fig, ax = plt.subplots(figsize=(15, 5))
    plot_data.groupby(keys)['rental_id'].count().plot(ax=ax)

    if title is None:
        title = 'Rides by ' + grouping[type] + ' ' + start + ' to ' + end
    fig.suptitle(title, fontsize=20)

In [None]:
# Daily ride counts over the function's default date range.
plot_rides(type='d', title='Rides by Day')

Find the biggest spike. Turns out that it's not bad data but the day of a [Tube Strike](http://www.standard.co.uk/news/london/boris-bike-use-at-twice-normal-level-as-londoners-seek-to-avoid-the-tube-strike-10378279.html)

In [None]:
# Number of rides (non-null rental_id values) per calendar date.
by_date = data.groupby(data.index.date)['rental_id'].count()

# Show the date(s) on which the daily count reaches its maximum.
by_date[by_date.eq(by_date.max())]

**Look at some of the low values, below are the 20 lowest recorded**

In [None]:
# The 20 dates with the fewest rides, smallest first.
by_date.sort_values(ascending=True).head(20)

The 25th of August 2014 sticks out as a bit of an obvious day to investigate further given that most of the others seem to be around Christmas whereas over the Summer period, you would expect usage to be more consistent.

In [None]:
# Rides per hour/minute on 25 August 2014.
plot_rides(type='hm', start='2014-08-25', end='2014-08-25')

This pattern is clearly different from that of the same weekday one week earlier (plotted in the next cell), both in shape and magnitude. A quick check of the calendar reveals the reason: 25 August 2014 was a [Bank Holiday Monday](https://en.wikipedia.org/wiki/Bank_holiday).

In [None]:
# Rides per hour/minute on 18 August 2014, for comparison with the 25th.
plot_rides(type='hm', start='2014-08-18', end='2014-08-18')

**Add some station data**

In [None]:
# Ride count for every (start_id, end_id) pair, flattened back to a frame
# with columns start_id, end_id and rental_id (the count).
q = (
    data.reset_index()
        .groupby(['start_id', 'end_id'])['rental_id']
        .count()
        .reset_index()
)

In [None]:
# Rank station pairs by number of rides, busiest first.
# `q` only has the columns start_id, end_id and rental_id, so the grouping
# uses those (the previous 'first'/'last' keys do not exist in `q`).
# NOTE(review): if 'first'/'last' were meant to be direction-independent
# station pairs, those columns must be derived from start_id/end_id first.
# `sort_values` replaces the removed `DataFrame.sort`.
(
    q.groupby(['start_id', 'end_id'])
     .sum()
     .sort_values('rental_id', ascending=False)
)

In [None]:
# Read the station table from stations.csv.
# NOTE(review): `stations` is not referenced by any later cell visible here.
stations = pd.read_csv('stations.csv')

In [None]:
# Per-start-station non-null counts of every column, ordered by number of
# rentals (busiest station first). `sort_values` replaces the removed
# `DataFrame.sort`, matching the `sort_values` call used earlier on by_date.
data.groupby('start_id').count().sort_values('rental_id', ascending=False)

In [None]:
# Export the cleaned rides to CSV. start_time was made the index earlier in
# the notebook, so the index must be written out; with index=False the
# start_time of every ride would be silently dropped from the file.
data.to_csv('ride_data.csv', sep=',', index=True)