## Imports

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import functools
from IPython.display import Image
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import datetime
import networkx as nx
from os.path import join

%matplotlib inline
pd.set_option("display.precision", 2)

## Count all unique bicycles in each month 

In [24]:
# Directories holding the monthly trip-history CSV exports.
paths = [
    r'./data',
]
# NOTE(review): months are assigned purely by position in the sorted file
# list, starting at March. This presumes one CSV per consecutive month --
# confirm against the actual contents of ./data.
first_month = 3
all_bikes = []
bikes_per_month = {}  # month number -> count of distinct bike numbers seen that month
for path in paths:
    all_files = sorted(glob.glob(path + "/*.csv"))
    for month, filename in enumerate(all_files, start=first_month):
        data = pd.read_csv(filename)
        unique_bikes = data.bike_number.unique()
        bikes_per_month[month] = unique_bikes.size
        # Extend in place rather than rebuilding the whole list for every file.
        all_bikes.extend(unique_bikes)
        print("Unique bicycles in month", month,":", unique_bikes.size)

# Distinct bikes across all files (a bike seen in several months counts once).
all_unique_bikes = set(all_bikes)
print("Unique bicycles in 2019 :",len(all_unique_bikes))
bikes_per_month

Unique bicycles in month 3 : 1027
Unique bicycles in month 4 : 1027
Unique bicycles in month 5 : 1399
Unique bicycles in month 6 : 1965
Unique bicycles in month 7 : 1909
Unique bicycles in month 8 : 1914
Unique bicycles in month 9 : 1940
Unique bicycles in month 10 : 1909
Unique bicycles in month 11 : 1886
Unique bicycles in month 12 : 839
Unique bicycles in 2019 : 2322


{3: 1027,
 4: 1027,
 5: 1399,
 6: 1965,
 7: 1909,
 8: 1914,
 9: 1940,
 10: 1909,
 11: 1886,
 12: 839}

## Read files and replace station names with numeric IDs

In [25]:
# Station lookup table, station coordinates and the per-interval trip counts.
df_names = pd.read_csv(
    './networks/nodes.csv',
    usecols=['value', 'name'],
)
df = pd.read_csv(
    './networks/nodes_locations.csv',
    usecols=['name', 'lng', 'lat'],
)
df_edges = pd.read_csv(
    './plik.csv',
    usecols=['interval_start','interval_end','number_of_trips','rental_place','return_place'],
)
# Maps each row label of df_names to the station name stored in that row.
dict_names_temp = dict(df_names['name'].items())

In [26]:
# Invert the lookup: dict_names_temp maps key -> station name, the edges need
# station name -> key.
dict_names = {name: value for value, name in dict_names_temp.items()}

# Assign the result back to the frame. Calling replace(..., inplace=True) on a
# column selection is a chained operation that newer pandas versions do not
# guarantee to propagate to df_edges.
for column in ("rental_place", "return_place"):
    df_edges[column] = df_edges[column].replace(dict_names)

df_edges

Unnamed: 0,interval_end,interval_start,number_of_trips,rental_place,return_place
0,2019-06-30 15:45:00,2019-06-30 15:30:00,1,98,155
1,2019-06-30 16:00:00,2019-06-30 15:45:00,1,98,155
2,2019-06-30 16:15:00,2019-06-30 16:00:00,1,98,155
3,2019-06-30 16:30:00,2019-06-30 16:15:00,1,98,155
4,2019-06-30 16:45:00,2019-06-30 16:30:00,1,98,155
...,...,...,...,...,...
483328,2019-08-01 00:00:00,2019-07-31 23:45:00,1,162,62
483329,2019-08-01 00:00:00,2019-07-31 23:45:00,1,173,61
483330,2019-08-01 00:00:00,2019-07-31 23:45:00,1,191,177
483331,2019-08-01 00:00:00,2019-07-31 23:45:00,1,197,197


## Get start and end intervals

In [27]:
# Turn both interval boundary columns into datetime values.
for column in ('interval_start', 'interval_end'):
    df_edges[column] = pd.to_datetime(df_edges[column])

In [28]:
# Midnight of the day containing the earliest interval start. floor('D') also
# clears any sub-second component, which replace(hour=0, minute=0, second=0)
# would leave behind.
start = df_edges.interval_start.min().floor('D')
start

Timestamp('2019-06-30 00:00:00')

In [29]:
# Round the latest interval end UP to midnight. Truncating it down would drop
# every interval of the final day whenever the data does not end exactly at
# 00:00; a value already at midnight is left unchanged.
end = df_edges.interval_end.max().ceil('D')
end

Timestamp('2019-08-01 00:00:00')

In [30]:
# Every 15-minute boundary between start and end (both inclusive).
# '15min' is used instead of the deprecated 'T' minute alias.
ranges = pd.date_range(start, end, freq='15min')
ranges

DatetimeIndex(['2019-06-30 00:00:00', '2019-06-30 00:15:00',
               '2019-06-30 00:30:00', '2019-06-30 00:45:00',
               '2019-06-30 01:00:00', '2019-06-30 01:15:00',
               '2019-06-30 01:30:00', '2019-06-30 01:45:00',
               '2019-06-30 02:00:00', '2019-06-30 02:15:00',
               ...
               '2019-07-31 21:45:00', '2019-07-31 22:00:00',
               '2019-07-31 22:15:00', '2019-07-31 22:30:00',
               '2019-07-31 22:45:00', '2019-07-31 23:00:00',
               '2019-07-31 23:15:00', '2019-07-31 23:30:00',
               '2019-07-31 23:45:00', '2019-08-01 00:00:00'],
              dtype='datetime64[ns]', length=3073, freq='15T')

## Testing for interval_start 2019-03-31 23:45:00

In [296]:
# Manual check of the per-interval computation for one hand-picked interval.
test_interval_start = '2019-03-31 23:45:00'
test_interval_end = '2019-04-01 00:00:00'

interval_edges = pd.DataFrame(df_edges[df_edges["interval_start"] == test_interval_start])

# Directed trip graph: every station is a node, every (rental, return) pair
# of this interval is an edge weighted by its number of trips.
G = nx.DiGraph()
for index, node in df.iterrows():
    # On an iterrows() row, `node.name` is the row label rather than the
    # 'name' column, so the column has to be read by key.
    G.add_node(index, name=node['name'], lat=node['lat'], lng=node['lng'])
for index, edge in interval_edges.iterrows():
    G.add_edge(edge.rental_place, edge.return_place, weight=edge.number_of_trips)
nodes_degrees = G.degree(weight='weight')
nodes_in_degrees = G.in_degree(weight='weight')
nodes_out_degrees = G.out_degree(weight='weight')

nodes_pageranks = nx.pagerank(G, weight='weight')
metrics_interval_df = pd.DataFrame({'node': list(G.nodes),
                      'degree': list(dict(nodes_degrees).values()),
                      'in_degree': list(dict(nodes_in_degrees).values()),
                      'out_degree': list(dict(nodes_out_degrees).values()),
                      'pagerank': list(nodes_pageranks.values())},
                    )
metrics_interval_df['interval_start'] = test_interval_start
metrics_interval_df['interval_end'] = test_interval_end

# Let the following cells print this frame without truncation.
pd.set_option('display.max_rows', metrics_interval_df.shape[0]+1)

# Total weighted in-degree = number of trips ending in this interval.
bikes_in_use =  sum(metrics_interval_df.in_degree)
# Look up the fleet size for the month of the interval under test (not for
# the first month of `ranges`, which may belong to a different data set).
bikes_total = bikes_per_month[pd.Timestamp(test_interval_start).month]
bikes_percentage = bikes_in_use/bikes_total
bikes_usage_interval_df = pd.DataFrame({'interval_start': test_interval_start,
                               'interval_end': test_interval_end,
                               'bikes_in_use': bikes_in_use,
                               'bikes_total': bikes_total,
                               'bikes_percentage': bikes_percentage}, index = [1])
# Keep only stations that took part in at least one trip.
metrics_interval_df = metrics_interval_df[metrics_interval_df['degree'] > 0]

In [297]:
# Show the test interval's node metrics (rows with degree > 0 only).
metrics_interval_df

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
1,1,1,0,1,0.00407,2019-03-31 23:45:00,2019-04-01 00:00:00
5,5,1,0,1,0.00407,2019-03-31 23:45:00,2019-04-01 00:00:00
16,16,1,0,1,0.00407,2019-03-31 23:45:00,2019-04-01 00:00:00
26,26,1,1,0,0.00753,2019-03-31 23:45:00,2019-04-01 00:00:00
61,61,1,1,0,0.00753,2019-03-31 23:45:00,2019-04-01 00:00:00
67,67,1,0,1,0.00407,2019-03-31 23:45:00,2019-04-01 00:00:00
68,68,1,1,0,0.00753,2019-03-31 23:45:00,2019-04-01 00:00:00
88,88,1,1,0,0.0058,2019-03-31 23:45:00,2019-04-01 00:00:00
108,108,2,1,1,0.0271,2019-03-31 23:45:00,2019-04-01 00:00:00
109,109,2,1,1,0.0271,2019-03-31 23:45:00,2019-04-01 00:00:00


## Function computing node metrics and bike usage for each 15-minute interval

In [31]:
def _interval_node_metrics(station_nodes, interval_edges):
    """Build one interval's trip graph and return its per-node metrics.

    Parameters
    ----------
    station_nodes : iterable of (node_id, attribute_dict)
        Nodes added to the graph before any edge, so stations without trips
        are still part of the PageRank computation.
    interval_edges : pandas.DataFrame or None
        Rows with 'rental_place', 'return_place' and 'number_of_trips' for
        the interval, or None when the interval has no trips.

    Returns
    -------
    pandas.DataFrame
        One row per graph node with columns 'node', 'degree', 'in_degree',
        'out_degree' (all weighted by number of trips) and 'pagerank'.
    """
    G = nx.DiGraph()
    for node_id, attributes in station_nodes:
        G.add_node(node_id, **attributes)
    if interval_edges is not None:
        trips = zip(interval_edges['rental_place'],
                    interval_edges['return_place'],
                    interval_edges['number_of_trips'])
        for rental_place, return_place, number_of_trips in trips:
            # A repeated (rental, return) pair overwrites the earlier weight.
            G.add_edge(rental_place, return_place, weight=number_of_trips)

    degree = dict(G.degree(weight='weight'))
    in_degree = dict(G.in_degree(weight='weight'))
    out_degree = dict(G.out_degree(weight='weight'))
    pagerank = nx.pagerank(G, weight='weight')

    # Look every metric up by node so the columns cannot get out of step.
    nodes = list(G.nodes)
    return pd.DataFrame({'node': nodes,
                         'degree': [degree[n] for n in nodes],
                         'in_degree': [in_degree[n] for n in nodes],
                         'out_degree': [out_degree[n] for n in nodes],
                         'pagerank': [pagerank[n] for n in nodes]})


def count_metrics(df, df_edges, intervals=None, bikes_totals=None):
    """Compute node metrics and bike usage for every 15-minute interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Stations, one per row, with 'name', 'lat' and 'lng' columns; the row
        label is used as the graph node id.
    df_edges : pandas.DataFrame
        Trips with 'interval_start', 'rental_place', 'return_place' and
        'number_of_trips' columns.
    intervals : sequence of pandas.Timestamp, optional
        Interval start times. Defaults to the notebook-level `ranges`.
        Intervals whose end would fall after the last entry are not processed.
    bikes_totals : mapping of int to int, optional
        Number of bikes available per month number. Defaults to the
        notebook-level `bikes_per_month`.

    Returns
    -------
    (bikes_usage_df, metrics_df) : tuple of pandas.DataFrame
        bikes_usage_df has one row per processed interval, indexed 0..n-1.
        metrics_df has one row per (interval, node) with degree > 0 and keeps
        each node's position in the graph as its index.

    Raises
    ------
    KeyError
        If `bikes_totals` has no entry for the month of an interval.
    """
    usage_columns = ['interval_start', 'interval_end', 'bikes_in_use', 'bikes_total', 'bikes_percentage']
    metrics_columns = ['node', 'degree', 'in_degree', 'out_degree', 'pagerank', 'interval_start', 'interval_end']

    if intervals is None:
        intervals = ranges
    if bikes_totals is None:
        bikes_totals = bikes_per_month
    if len(intervals) == 0:
        return pd.DataFrame(columns=usage_columns), pd.DataFrame(columns=metrics_columns)

    # The station nodes are identical for every interval, so build them once.
    # The 'name' column is read by key: on an iterrows() row `node.name`
    # would return the row label instead of the station name.
    station_nodes = [
        (index, {'name': name, 'lat': lat, 'lng': lng})
        for index, name, lat, lng in zip(df.index, df['name'], df['lat'], df['lng'])
    ]
    # Split the edges by interval in one pass instead of scanning the whole
    # frame again for each interval.
    edges_by_start = {
        interval_start: edges
        for interval_start, edges in df_edges.groupby('interval_start')
    }

    interval_length = datetime.timedelta(minutes=15)
    last_boundary = intervals[-1]
    usage_records = []
    metrics_frames = []
    for interval in intervals:
        interval_end = interval + interval_length
        if interval_end > last_boundary:
            break

        metrics_interval_df = _interval_node_metrics(station_nodes, edges_by_start.get(interval))
        metrics_interval_df['interval_start'] = interval
        metrics_interval_df['interval_end'] = interval_end
        # NOTE(review): changing a global display option inside a computation
        # is surprising; it is kept only so existing notebook output does not
        # change. Consider moving it to the configuration cell.
        pd.set_option('display.max_rows', metrics_interval_df.shape[0]+1)

        # Total weighted in-degree = number of trips ending in this interval.
        bikes_in_use = metrics_interval_df['in_degree'].sum()
        bikes_total = bikes_totals[interval.month]
        usage_records.append({'interval_start': interval,
                              'interval_end': interval_end,
                              'bikes_in_use': bikes_in_use,
                              'bikes_total': bikes_total,
                              'bikes_percentage': bikes_in_use/bikes_total})

        active_nodes = metrics_interval_df[metrics_interval_df['degree'] > 0]
        if not active_nodes.empty:
            metrics_frames.append(active_nodes)

    # Assemble the results once; appending frame by frame is quadratic and
    # DataFrame.append no longer exists in current pandas.
    bikes_usage_df = pd.DataFrame(usage_records, columns=usage_columns)
    if metrics_frames:
        metrics_df = pd.concat(metrics_frames)[metrics_columns]
    else:
        metrics_df = pd.DataFrame(columns=metrics_columns)

    return bikes_usage_df, metrics_df

In [32]:
# Compute bike usage and node metrics for the loaded stations and trip edges.
bikes_usage, metrics = count_metrics(df, df_edges)

In [33]:
# Per-interval bike usage returned by count_metrics.
bikes_usage


Unnamed: 0,interval_start,interval_end,bikes_in_use,bikes_total,bikes_percentage
0,2019-06-30 00:00:00,2019-06-30 00:15:00,0,1965,0.00
1,2019-06-30 00:15:00,2019-06-30 00:30:00,0,1965,0.00
2,2019-06-30 00:30:00,2019-06-30 00:45:00,0,1965,0.00
3,2019-06-30 00:45:00,2019-06-30 01:00:00,0,1965,0.00
4,2019-06-30 01:00:00,2019-06-30 01:15:00,0,1965,0.00
...,...,...,...,...,...
3067,2019-07-31 22:45:00,2019-07-31 23:00:00,152,1909,0.08
3068,2019-07-31 23:00:00,2019-07-31 23:15:00,123,1909,0.06
3069,2019-07-31 23:15:00,2019-07-31 23:30:00,100,1909,0.05
3070,2019-07-31 23:30:00,2019-07-31 23:45:00,99,1909,0.05


In [34]:
# Per-interval node metrics returned by count_metrics.
metrics

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
98,98,1,0,1,4.86e-03,2019-06-30 15:30:00,2019-06-30 15:45:00
155,155,1,1,0,8.99e-03,2019-06-30 15:30:00,2019-06-30 15:45:00
98,98,1,0,1,4.86e-03,2019-06-30 15:45:00,2019-06-30 16:00:00
155,155,1,1,0,8.99e-03,2019-06-30 15:45:00,2019-06-30 16:00:00
98,98,1,0,1,4.86e-03,2019-06-30 16:00:00,2019-06-30 16:15:00
...,...,...,...,...,...,...,...
177,177,1,1,0,6.74e-03,2019-07-31 23:45:00,2019-08-01 00:00:00
188,188,2,2,0,7.95e-03,2019-07-31 23:45:00,2019-08-01 00:00:00
191,191,1,0,1,3.64e-03,2019-07-31 23:45:00,2019-08-01 00:00:00
197,197,2,1,1,2.42e-02,2019-07-31 23:45:00,2019-08-01 00:00:00


## Read metrics and bikes usage from existing files 

In [2]:
# Load previously exported results instead of recomputing them. This rebinds
# `metrics` and `bikes_usage`, replacing any values computed earlier.
# The interval bounds are parsed as datetimes so they have the same type as
# the frames produced by count_metrics rather than plain strings.
interval_columns = ['interval_start', 'interval_end']
metrics = pd.read_csv(
    './metrics/historia_przejazdow_2019-03.csv_metrics.csv',
    usecols=['node', 'degree', 'in_degree', 'out_degree', 'pagerank', 'interval_start', 'interval_end'],
    parse_dates=interval_columns,
)
bikes_usage = pd.read_csv(
    './metrics/historia_przejazdow_2019-03.csv_bikes_usage.csv',
    usecols=['interval_start', 'interval_end', 'bikes_in_use', 'bikes_total', 'bikes_percentage'],
    parse_dates=interval_columns,
)


In [3]:
# Preview the node metrics loaded from the CSV export.
metrics

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
0,18,2,1,1,3.12e-02,2019-03-11 21:15:00,2019-03-11 21:30:00
1,18,2,1,1,3.12e-02,2019-03-11 21:30:00,2019-03-11 21:45:00
2,18,2,1,1,3.12e-02,2019-03-11 21:45:00,2019-03-11 22:00:00
3,18,2,1,1,3.12e-02,2019-03-11 22:00:00,2019-03-11 22:15:00
4,196,8,4,4,3.12e-02,2019-03-15 10:15:00,2019-03-15 10:30:00
...,...,...,...,...,...,...,...
43783,164,1,1,0,7.71e-03,2019-03-31 23:45:00,2019-04-01 00:00:00
43784,168,2,1,1,2.77e-02,2019-03-31 23:45:00,2019-04-01 00:00:00
43785,170,2,0,2,4.17e-03,2019-03-31 23:45:00,2019-04-01 00:00:00
43786,171,1,0,1,4.17e-03,2019-03-31 23:45:00,2019-04-01 00:00:00


In [4]:
# Extremes and average of PageRank over all (node, interval) rows.
pagerank_values = metrics['pagerank']
print("Max pagerank:", pagerank_values.max())
print("Min pagerank:", pagerank_values.min())
print("Mean pagerank:", pagerank_values.mean())

Max pagerank: 0.22353303465602728
Min pagerank: 0.0013985493563694765
Mean pagerank: 0.008841609519391532


In [5]:
# Row with the highest PageRank. idxmax() returns an index label, so the row
# is selected with .loc; .iloc only worked while the index was 0..n-1.
metrics.loc[[metrics['pagerank'].idxmax()]]

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
1308,75,14,12,2,0.22,2019-03-23 12:45:00,2019-03-23 13:00:00


In [6]:
# Row with the lowest PageRank. idxmin() returns an index label, so the row
# is selected with .loc; .iloc only worked while the index was 0..n-1.
metrics.loc[[metrics['pagerank'].idxmin()]]

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
35131,21,1,0,1,0.0014,2019-03-30 14:45:00,2019-03-30 15:00:00


In [7]:
# Extremes and average of the weighted degree over all (node, interval) rows.
degree_values = metrics['degree']
print("Max degree:", degree_values.max())
print("Min degree:", degree_values.min())
print("Mean degree:", degree_values.mean())

Max degree: 83
Min degree: 1
Mean degree: 4.178770439389787


In [8]:
# Row with the highest degree. idxmax() returns an index label, so the row
# is selected with .loc; .iloc only worked while the index was 0..n-1.
metrics.loc[[metrics['degree'].idxmax()]]

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
36068,168,83,31,52,0.024,2019-03-30 17:00:00,2019-03-30 17:15:00


In [9]:
# First row with the lowest degree. idxmin() returns an index label, so the
# row is selected with .loc; .iloc only worked while the index was 0..n-1.
metrics.loc[[metrics['degree'].idxmin()]]

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
9,58,1,1,0,0.0084,2019-03-15 11:00:00,2019-03-15 11:15:00


In [10]:
# Extremes and average of the weighted in-degree over all (node, interval) rows.
in_degree_values = metrics['in_degree']
print("Max in_degree:", in_degree_values.max())
print("Min in_degree:", in_degree_values.min())
print("Mean in_degree:", in_degree_values.mean())

Max in_degree: 54
Min in_degree: 0
Mean in_degree: 2.0893852196948934


In [11]:
# Row with the highest in-degree. idxmax() returns an index label, so the row
# is selected with .loc; .iloc only worked while the index was 0..n-1.
metrics.loc[[metrics['in_degree'].idxmax()]]

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
34812,168,70,54,16,0.059,2019-03-30 13:45:00,2019-03-30 14:00:00


In [12]:
# First row with the lowest in-degree. idxmin() returns an index label, so the
# row is selected with .loc; .iloc only worked while the index was 0..n-1.
metrics.loc[[metrics['in_degree'].idxmin()]]

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
10,18,2,0,2,0.0046,2019-03-15 11:15:00,2019-03-15 11:30:00


In [13]:
# Extremes and average of the weighted out-degree over all (node, interval) rows.
out_degree_values = metrics['out_degree']
print("Max out_degree:", out_degree_values.max())
print("Min out_degree:", out_degree_values.min())
print("Mean out_degree:", out_degree_values.mean())

Max out_degree: 52
Min out_degree: 0
Mean out_degree: 2.0893852196948934


In [14]:
# Row with the highest out-degree. idxmax() returns an index label, so the row
# is selected with .loc; .iloc only worked while the index was 0..n-1.
metrics.loc[[metrics['out_degree'].idxmax()]]

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
36068,168,83,31,52,0.024,2019-03-30 17:00:00,2019-03-30 17:15:00


In [15]:
# First row with the lowest out-degree. idxmin() returns an index label, so the
# row is selected with .loc; .iloc only worked while the index was 0..n-1.
metrics.loc[[metrics['out_degree'].idxmin()]]

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
5,18,2,2,0,0.008,2019-03-15 10:30:00,2019-03-15 10:45:00


In [16]:
# Preview the per-interval bike usage table.
bikes_usage

Unnamed: 0,interval_start,interval_end,bikes_in_use,bikes_total,bikes_percentage
0,2019-03-11 00:00:00,2019-03-11 00:15:00,0,1027,0.00
1,2019-03-11 00:15:00,2019-03-11 00:30:00,0,1027,0.00
2,2019-03-11 00:30:00,2019-03-11 00:45:00,0,1027,0.00
3,2019-03-11 00:45:00,2019-03-11 01:00:00,0,1027,0.00
4,2019-03-11 01:00:00,2019-03-11 01:15:00,0,1027,0.00
...,...,...,...,...,...
2011,2019-03-31 22:45:00,2019-03-31 23:00:00,31,1027,0.03
2012,2019-03-31 23:00:00,2019-03-31 23:15:00,35,1027,0.03
2013,2019-03-31 23:15:00,2019-03-31 23:30:00,35,1027,0.03
2014,2019-03-31 23:30:00,2019-03-31 23:45:00,25,1027,0.02


In [17]:
# Fleet size plus extremes and averages of absolute and relative bike usage.
in_use_values = bikes_usage['bikes_in_use']
usage_share_values = bikes_usage['bikes_percentage']
print("Total amount of bikes for month:", bikes_usage['bikes_total'].max())
print("Max amount of bikes in use:", in_use_values.max())
print("Min amount of bikes in use:", in_use_values.min())
print("Mean amount of bikes in use:", in_use_values.mean())
print("Max bikes % usage:", usage_share_values.max())
print("Min bikes % usage:", usage_share_values.min())
print("Mean bikes % usage:", usage_share_values.mean())

Total amount of bikes for month: 1027
Max amount of bikes in use: 583
Min amount of bikes in use: 0
Mean amount of bikes in use: 45.38194444444444
Max bikes % usage: 0.5676728334956183
Min bikes % usage: 0.0
Mean bikes % usage: 0.044188845612896224


In [18]:
# Interval with the most bikes in use. idxmax() returns an index label, so the
# row is selected with .loc; .iloc only worked while the index was 0..n-1.
bikes_usage.loc[[bikes_usage['bikes_in_use'].idxmax()]]

Unnamed: 0,interval_start,interval_end,bikes_in_use,bikes_total,bikes_percentage
1893,2019-03-30 17:15:00,2019-03-30 17:30:00,583,1027,0.57


In [59]:
#metrics.to_csv(join("metrics_test_07.csv"), index=False)

In [38]:
#bikes_usage.to_csv(join("bikes_usage_test_07.csv"), index=False)