## Imports

In [274]:
from __future__ import absolute_import, division, print_function, unicode_literals

import datetime
import functools
import glob
from os.path import join

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Image

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Display floating-point values with 2 decimal places.
pd.set_option("display.precision", 2)

## Count all unique bicycles in each month 

In [144]:
# Directories holding the monthly rental CSV exports.
paths = [
    r'./data',
]
# Month number assigned to the first file of each directory.
# NOTE(review): the month is not read from the data; files are assumed to sort
# chronologically starting with March — confirm against the names in ./data.
FIRST_MONTH = 3

all_bikes = []
bikes_per_month = {}
for path in paths:
    all_files = sorted(glob.glob(path + "/*.csv"))
    # Numbering restarts at FIRST_MONTH for every directory in `paths`.
    for month, filename in enumerate(all_files, start=FIRST_MONTH):
        data = pd.read_csv(filename)
        unique_bikes = data.bike_number.unique()
        bikes_per_month[month] = unique_bikes.size
        # extend() appends in place instead of rebuilding the whole list per file.
        all_bikes.extend(unique_bikes)
        print("Unique bicycles in month", month,":", unique_bikes.size)

# Bikes seen in several months are counted once here.
all_unique_bikes = set(all_bikes)
print("Unique bicycles in 2019 :",len(all_unique_bikes))
bikes_per_month

Unique bicycles in month 3 : 1027
Unique bicycles in month 4 : 1027
Unique bicycles in month 5 : 1399
Unique bicycles in month 6 : 1965
Unique bicycles in month 7 : 1909
Unique bicycles in month 8 : 1914
Unique bicycles in month 9 : 1940
Unique bicycles in month 10 : 1909
Unique bicycles in month 11 : 1886
Unique bicycles in month 12 : 839
Unique bicycles in 2019 : 2322


{3: 1027,
 4: 1027,
 5: 1399,
 6: 1965,
 7: 1909,
 8: 1914,
 9: 1940,
 10: 1909,
 11: 1886,
 12: 839}

## Read files and replace station names with numeric ids

In [303]:
# Station lookup table.
df_names = pd.read_csv(
    './networks/nodes.csv',
    usecols=['value', 'name'],
)
# Station coordinates; the rows of this frame become the graph nodes later on.
df = pd.read_csv(
    './networks/nodes_locations.csv',
    usecols=['name', 'lng', 'lat'],
)
# Trip counts per interval and (rental_place, return_place) pair.
df_edges = pd.read_csv(
    './plik.csv',
    usecols=['interval_start','interval_end','number_of_trips','rental_place','return_place'],
)
# Maps the row label of df_names to the station name.
# NOTE(review): the 'value' column is loaded but not used for this mapping —
# presumably it equals the row label; confirm.
dict_names_temp = df_names['name'].to_dict()

In [304]:
# Invert {row label: station name} into {station name: row label}.
dict_names = {name: value for value, name in dict_names_temp.items()}

# Swap station names for their numeric ids. The result is assigned back to the
# column: calling replace(..., inplace=True) on a column selection acts on a
# temporary object under pandas Copy-on-Write and would leave df_edges unchanged.
for place_column in ("rental_place", "return_place"):
    df_edges[place_column] = df_edges[place_column].replace(dict_names)

df_edges

Unnamed: 0,interval_end,interval_start,number_of_trips,rental_place,return_place
0,2019-03-11 21:30:00,2019-03-11 21:15:00,1,18,18
1,2019-03-11 21:45:00,2019-03-11 21:30:00,1,18,18
2,2019-03-11 22:00:00,2019-03-11 21:45:00,1,18,18
3,2019-03-11 22:15:00,2019-03-11 22:00:00,1,18,18
4,2019-03-15 10:30:00,2019-03-15 10:15:00,4,196,196
...,...,...,...,...,...
67334,2019-04-01 00:00:00,2019-03-31 23:45:00,2,158,158
67335,2019-04-01 00:00:00,2019-03-31 23:45:00,1,168,168
67336,2019-04-01 00:00:00,2019-03-31 23:45:00,1,170,88
67337,2019-04-01 00:00:00,2019-03-31 23:45:00,1,170,141


## Get start and end intervals

In [305]:
# Parse both interval bounds into pandas datetimes.
for bound_column in ('interval_start', 'interval_end'):
    df_edges[bound_column] = pd.to_datetime(df_edges[bound_column])

In [306]:
# Midnight of the day of the earliest interval start.
# normalize() resets the whole time-of-day part, including sub-second
# components that replace(hour=0, minute=0, second=0) would leave in place.
start = df_edges.interval_start.min().normalize()
start

Timestamp('2019-03-11 00:00:00')

In [307]:
# Midnight of the day of the latest interval end.
# normalize() resets the whole time-of-day part, including sub-second
# components that replace(hour=0, minute=0, second=0) would leave in place.
# NOTE(review): flooring to midnight excludes intervals of a final, partial day
# whenever the latest interval_end is not exactly midnight — confirm this is intended.
end = df_edges.interval_end.max().normalize()
end

Timestamp('2019-04-01 00:00:00')

In [308]:
# Grid of 15-minute interval starts from `start` to `end` (both inclusive).
# 'min' is the supported spelling of the minute frequency; the 'T' alias is deprecated.
ranges = pd.date_range(start, end, freq='15min')
ranges

DatetimeIndex(['2019-03-11 00:00:00', '2019-03-11 00:15:00',
               '2019-03-11 00:30:00', '2019-03-11 00:45:00',
               '2019-03-11 01:00:00', '2019-03-11 01:15:00',
               '2019-03-11 01:30:00', '2019-03-11 01:45:00',
               '2019-03-11 02:00:00', '2019-03-11 02:15:00',
               ...
               '2019-03-31 21:45:00', '2019-03-31 22:00:00',
               '2019-03-31 22:15:00', '2019-03-31 22:30:00',
               '2019-03-31 22:45:00', '2019-03-31 23:00:00',
               '2019-03-31 23:15:00', '2019-03-31 23:30:00',
               '2019-03-31 23:45:00', '2019-04-01 00:00:00'],
              dtype='datetime64[ns]', length=2017, freq='15T')

## Testing for interval_start 2019-03-31 23:45:00

In [296]:
# Spot-check of the per-interval metrics on one hand-picked 15-minute interval.
test_interval_start = pd.Timestamp('2019-03-31 23:45:00')
test_interval_end = test_interval_start + pd.Timedelta(minutes=15)

interval_edges = df_edges[df_edges["interval_start"] == test_interval_start]

# Every row of df becomes a node, so stations without trips are present with degree 0.
G = nx.DiGraph()
for index, node in df.iterrows():
    # node['name'] reads the 'name' column; the attribute form node.name returns
    # the row label of the Series produced by iterrows(), not the station name.
    G.add_node(index, name=node['name'], lat=node.lat, lng=node.lng)
# One directed edge per row, weighted by the number of trips.
# NOTE(review): a repeated (rental_place, return_place) pair would overwrite the
# earlier weight instead of adding to it — presumably the input is already
# aggregated per pair; confirm.
for index, edge in interval_edges.iterrows():
    G.add_edge(edge.rental_place, edge.return_place, weight=edge.number_of_trips)

nodes_degrees = G.degree(weight='weight')
nodes_in_degrees = G.in_degree(weight='weight')
nodes_out_degrees = G.out_degree(weight='weight')
nodes_pageranks = nx.pagerank(G, weight='weight')

metrics_interval_df = pd.DataFrame({'node': list(G.nodes),
                      'degree': list(dict(nodes_degrees).values()),
                      'in_degree': list(dict(nodes_in_degrees).values()),
                      'out_degree': list(dict(nodes_out_degrees).values()),
                      'pagerank': list(nodes_pageranks.values())},
                    )
metrics_interval_df['interval_start'] = test_interval_start
metrics_interval_df['interval_end'] = test_interval_end

# Allow the whole per-node table to be displayed without truncation.
pd.set_option('display.max_rows', metrics_interval_df.shape[0]+1)

# Weighted in-degrees add up to the number of trips in the interval.
bikes_in_use = sum(metrics_interval_df.in_degree)
# Use the month of the tested interval itself for the fleet size.
bikes_total = bikes_per_month[test_interval_start.month]
bikes_percentage = bikes_in_use/bikes_total
bikes_usage_interval_df = pd.DataFrame({'interval_start': test_interval_start,
                               'interval_end': test_interval_end,
                               'bikes_in_use': bikes_in_use,
                               'bikes_total': bikes_total,
                               'bikes_percentage': bikes_percentage}, index = [1])

# Keep only stations with at least one rental or return in the interval.
metrics_interval_df = metrics_interval_df[metrics_interval_df['degree'] > 0]

# Last expression of the cell, so the usage summary is actually rendered.
bikes_usage_interval_df

In [297]:
# Show the metrics of the stations that had any traffic in the tested interval.
metrics_interval_df

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
1,1,1,0,1,0.00407,2019-03-31 23:45:00,2019-04-01 00:00:00
5,5,1,0,1,0.00407,2019-03-31 23:45:00,2019-04-01 00:00:00
16,16,1,0,1,0.00407,2019-03-31 23:45:00,2019-04-01 00:00:00
26,26,1,1,0,0.00753,2019-03-31 23:45:00,2019-04-01 00:00:00
61,61,1,1,0,0.00753,2019-03-31 23:45:00,2019-04-01 00:00:00
67,67,1,0,1,0.00407,2019-03-31 23:45:00,2019-04-01 00:00:00
68,68,1,1,0,0.00753,2019-03-31 23:45:00,2019-04-01 00:00:00
88,88,1,1,0,0.0058,2019-03-31 23:45:00,2019-04-01 00:00:00
108,108,2,1,1,0.0271,2019-03-31 23:45:00,2019-04-01 00:00:00
109,109,2,1,1,0.0271,2019-03-31 23:45:00,2019-04-01 00:00:00


## Function computing graph metrics and bike usage for each 15-minute interval of the month

In [309]:
def count_metrics(df, df_edges):
    """Compute station graph metrics and bike usage for every 15-minute interval.

    For each interval start in the module-level ``ranges`` a directed graph is
    built: one node per row of ``df`` and one edge per trip row of ``df_edges``
    starting in that interval, weighted by ``number_of_trips``. Weighted degree,
    in-degree, out-degree and PageRank are computed per node.

    Reads the module-level names ``ranges`` (interval starts), ``end`` (upper
    bound for interval ends) and ``bikes_per_month`` (fleet size per month
    number). Does not modify pandas display options.

    Parameters
    ----------
    df : pandas.DataFrame
        Stations; must provide the columns ``name``, ``lat`` and ``lng``.
        The row labels are used as node ids.
    df_edges : pandas.DataFrame
        Trips; must provide ``interval_start``, ``rental_place``,
        ``return_place`` and ``number_of_trips``.

    Returns
    -------
    (bikes_usage_df, metrics_df) : tuple of pandas.DataFrame
        ``bikes_usage_df`` has one row per interval (columns ``interval_start``,
        ``interval_end``, ``bikes_in_use``, ``bikes_total``,
        ``bikes_percentage``). ``metrics_df`` has one row per interval and node
        with non-zero degree (columns ``node``, ``degree``, ``in_degree``,
        ``out_degree``, ``pagerank``, ``interval_start``, ``interval_end``).

    Raises
    ------
    KeyError
        If the month of an interval is missing from ``bikes_per_month``.
    """
    usage_columns = ['interval_start', 'interval_end', 'bikes_in_use', 'bikes_total', 'bikes_percentage']
    metrics_columns = ['node', 'degree', 'in_degree', 'out_degree', 'pagerank', 'interval_start', 'interval_end']
    step = pd.Timedelta(minutes=15)

    # Node attributes are the same for every interval, so collect them once.
    # node['name'] reads the 'name' column; node.name would return the row label.
    node_attributes = [
        (index, {'name': node['name'], 'lat': node.lat, 'lng': node.lng})
        for index, node in df.iterrows()
    ]

    # Rows and frames are collected in lists and combined once at the end;
    # growing a DataFrame inside the loop copies it on every iteration.
    usage_records = []
    metrics_frames = []
    for interval in ranges:
        interval_end = interval + step

        # Stop once an interval would reach past the end of the observed period.
        if interval_end > end:
            break

        interval_edges = df_edges[df_edges["interval_start"] == interval]

        G = nx.DiGraph()
        for index, attributes in node_attributes:
            G.add_node(index, **attributes)
        # NOTE(review): a repeated (rental_place, return_place) pair would
        # overwrite the earlier weight instead of adding to it — presumably the
        # input is already aggregated per pair; confirm.
        for _, edge in interval_edges.iterrows():
            G.add_edge(edge.rental_place, edge.return_place, weight=edge.number_of_trips)

        nodes = list(G.nodes)
        degrees = dict(G.degree(weight='weight'))
        in_degrees = dict(G.in_degree(weight='weight'))
        out_degrees = dict(G.out_degree(weight='weight'))
        pageranks = nx.pagerank(G, weight='weight')

        # Look every metric up by node so all columns share one row order.
        metrics_interval_df = pd.DataFrame({
            'node': nodes,
            'degree': [degrees[n] for n in nodes],
            'in_degree': [in_degrees[n] for n in nodes],
            'out_degree': [out_degrees[n] for n in nodes],
            'pagerank': [pageranks[n] for n in nodes],
        })
        metrics_interval_df['interval_start'] = interval
        metrics_interval_df['interval_end'] = interval_end

        # Weighted in-degrees add up to the number of trips in the interval.
        bikes_in_use = metrics_interval_df['in_degree'].sum()
        bikes_total = bikes_per_month[interval.month]
        usage_records.append({
            'interval_start': interval,
            'interval_end': interval_end,
            'bikes_in_use': bikes_in_use,
            'bikes_total': bikes_total,
            'bikes_percentage': bikes_in_use / bikes_total,
        })

        # Keep only stations with at least one rental or return in the interval.
        active_nodes_df = metrics_interval_df[metrics_interval_df['degree'] > 0]
        if not active_nodes_df.empty:
            metrics_frames.append(active_nodes_df)

    bikes_usage_df = pd.DataFrame(usage_records, columns=usage_columns)
    if metrics_frames:
        # Row labels of the per-interval frames are kept as they are.
        metrics_df = pd.concat(metrics_frames)
    else:
        metrics_df = pd.DataFrame(columns=metrics_columns)

    return bikes_usage_df, metrics_df

In [310]:
# Run the per-interval computation; returns the usage table and the
# per-station metrics table.
bikes_usage, metrics = count_metrics(df, df_edges)

In [317]:
# Show the per-interval bike usage table returned by count_metrics.
bikes_usage


Unnamed: 0,interval_start,interval_end,bikes_in_use,bikes_total,bikes_percentage
0,2019-03-11 00:00:00,2019-03-11 00:15:00,0,1027,0.00
1,2019-03-11 00:15:00,2019-03-11 00:30:00,0,1027,0.00
2,2019-03-11 00:30:00,2019-03-11 00:45:00,0,1027,0.00
3,2019-03-11 00:45:00,2019-03-11 01:00:00,0,1027,0.00
4,2019-03-11 01:00:00,2019-03-11 01:15:00,0,1027,0.00
...,...,...,...,...,...
2011,2019-03-31 22:45:00,2019-03-31 23:00:00,34,1027,0.03
2012,2019-03-31 23:00:00,2019-03-31 23:15:00,38,1027,0.04
2013,2019-03-31 23:15:00,2019-03-31 23:30:00,36,1027,0.04
2014,2019-03-31 23:30:00,2019-03-31 23:45:00,27,1027,0.03


In [314]:
# Show the per-station metrics table returned by count_metrics.
metrics

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
18,18,2,1,1,3.12e-02,2019-03-11 21:15:00,2019-03-11 21:30:00
18,18,2,1,1,3.12e-02,2019-03-11 21:30:00,2019-03-11 21:45:00
18,18,2,1,1,3.12e-02,2019-03-11 21:45:00,2019-03-11 22:00:00
18,18,2,1,1,3.12e-02,2019-03-11 22:00:00,2019-03-11 22:15:00
196,196,8,4,4,3.12e-02,2019-03-15 10:15:00,2019-03-15 10:30:00
...,...,...,...,...,...,...,...
164,164,1,1,0,7.53e-03,2019-03-31 23:45:00,2019-04-01 00:00:00
168,168,2,1,1,2.71e-02,2019-03-31 23:45:00,2019-04-01 00:00:00
170,170,2,0,2,4.07e-03,2019-03-31 23:45:00,2019-04-01 00:00:00
171,171,1,0,1,4.07e-03,2019-03-31 23:45:00,2019-04-01 00:00:00


In [315]:
# Write the per-station metrics to the working directory, without the row labels.
metrics.to_csv("metrics_test_03.csv", index=False)

In [316]:
# Write the per-interval usage table to the working directory, without the row labels.
bikes_usage.to_csv("bikes_usage_test_03.csv", index=False)