# Parsing metrics into json

## Imports

In [189]:
import pandas as pd
from os.path import join
from tqdm.notebook import tqdm

## Load csv

In [200]:
df = pd.read_csv(join("metrics", "historia_przejazdow_2019-07.csv"), parse_dates=True)
df

Unnamed: 0,node,degree,in_degree,out_degree,pagerank,interval_start,interval_end
0,16,2,0,2,0.004316,2019-07-01 00:00:00,2019-07-01 00:15:00
1,18,1,1,0,0.007985,2019-07-01 00:00:00,2019-07-01 00:15:00
2,20,2,1,1,0.028527,2019-07-01 00:00:00,2019-07-01 00:15:00
3,32,1,1,0,0.007985,2019-07-01 00:00:00,2019-07-01 00:15:00
4,35,2,0,2,0.004316,2019-07-01 00:00:00,2019-07-01 00:15:00
...,...,...,...,...,...,...,...
334025,176,1,1,0,0.006738,2019-07-31 23:45:00,2019-08-01 00:00:00
334026,187,2,2,0,0.007952,2019-07-31 23:45:00,2019-08-01 00:00:00
334027,190,1,0,1,0.003642,2019-07-31 23:45:00,2019-08-01 00:00:00
334028,196,2,1,1,0.024212,2019-07-31 23:45:00,2019-08-01 00:00:00


## Remove unnecessary columns

In [201]:
df = df[["interval_start", "node", "degree", "in_degree", "out_degree", "pagerank"]]
# df = df.rename(columns={"interval_start":"s","rental_place":"o","return_place":"d","number_of_trips":"c"})
df['interval_start']= pd.to_datetime(df['interval_start']) 
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,interval_start,node,degree,in_degree,out_degree,pagerank
0,2019-07-01 00:00:00,16,2,0,2,0.004316
1,2019-07-01 00:00:00,18,1,1,0,0.007985
2,2019-07-01 00:00:00,20,2,1,1,0.028527
3,2019-07-01 00:00:00,32,1,1,0,0.007985
4,2019-07-01 00:00:00,35,2,0,2,0.004316
...,...,...,...,...,...,...
334025,2019-07-31 23:45:00,176,1,1,0,0.006738
334026,2019-07-31 23:45:00,187,2,2,0,0.007952
334027,2019-07-31 23:45:00,190,1,0,1,0.003642
334028,2019-07-31 23:45:00,196,2,1,1,0.024212


## Add day column

In [202]:
df["day"] = df["interval_start"].dt.day
df

Unnamed: 0,interval_start,node,degree,in_degree,out_degree,pagerank,day
0,2019-07-01 00:00:00,16,2,0,2,0.004316,1
1,2019-07-01 00:00:00,18,1,1,0,0.007985,1
2,2019-07-01 00:00:00,20,2,1,1,0.028527,1
3,2019-07-01 00:00:00,32,1,1,0,0.007985,1
4,2019-07-01 00:00:00,35,2,0,2,0.004316,1
...,...,...,...,...,...,...,...
334025,2019-07-31 23:45:00,176,1,1,0,0.006738,31
334026,2019-07-31 23:45:00,187,2,2,0,0.007952,31
334027,2019-07-31 23:45:00,190,1,0,1,0.003642,31
334028,2019-07-31 23:45:00,196,2,1,1,0.024212,31


## Change hour to a "minute in day" form

In [203]:
df["minute_in_day"] = df["interval_start"].dt.hour*60 + df["interval_start"].dt.minute
df

Unnamed: 0,interval_start,node,degree,in_degree,out_degree,pagerank,day,minute_in_day
0,2019-07-01 00:00:00,16,2,0,2,0.004316,1,0
1,2019-07-01 00:00:00,18,1,1,0,0.007985,1,0
2,2019-07-01 00:00:00,20,2,1,1,0.028527,1,0
3,2019-07-01 00:00:00,32,1,1,0,0.007985,1,0
4,2019-07-01 00:00:00,35,2,0,2,0.004316,1,0
...,...,...,...,...,...,...,...,...
334025,2019-07-31 23:45:00,176,1,1,0,0.006738,31,1425
334026,2019-07-31 23:45:00,187,2,2,0,0.007952,31,1425
334027,2019-07-31 23:45:00,190,1,0,1,0.003642,31,1425
334028,2019-07-31 23:45:00,196,2,1,1,0.024212,31,1425


## Count the number of days in this month

In [204]:
days_in_month = df["interval_start"].dt.daysinmonth.max()
days_in_month

31

## Remove unnecessary columns and rename rest  with {"node":"o", "degree":"k", "in_degree":"ik", "out_degree":"ok", "pagerank":"p"}

In [205]:
df = df[["day", "minute_in_day", "node", "degree", "in_degree", "out_degree", "pagerank"]]

df = df.rename(columns={"node":"o", "degree":"k", "in_degree":"ik", "out_degree":"ok", "pagerank":"p"})
df

Unnamed: 0,day,minute_in_day,o,k,ik,ok,p
0,1,0,16,2,0,2,0.004316
1,1,0,18,1,1,0,0.007985
2,1,0,20,2,1,1,0.028527
3,1,0,32,1,1,0,0.007985
4,1,0,35,2,0,2,0.004316
...,...,...,...,...,...,...,...
334025,31,1425,176,1,1,0,0.006738
334026,31,1425,187,2,2,0,0.007952
334027,31,1425,190,1,0,1,0.003642
334028,31,1425,196,2,1,1,0.024212


## Example for a single day

In [206]:
(df[df.day==31])[["minute_in_day", "o", "k", "ik", "ok", "p"]].groupby('minute_in_day').apply(lambda g: g[["o", "k", "ik", "ok", "p"]].to_dict(orient='records')).to_dict()

{0: [{'o': 1, 'k': 4, 'ik': 2, 'ok': 2, 'p': 0.03064356956114678},
  {'o': 2, 'k': 1, 'ik': 1, 'ok': 0, 'p': 0.0042259256426322115},
  {'o': 3, 'k': 2, 'ik': 2, 'ok': 0, 'p': 0.004646072803122904},
  {'o': 4, 'k': 2, 'ik': 2, 'ok': 0, 'p': 0.006326661445085681},
  {'o': 5, 'k': 1, 'ik': 0, 'ok': 1, 'p': 0.00296548416116013},
  {'o': 13, 'k': 1, 'ik': 0, 'ok': 1, 'p': 0.00296548416116013},
  {'o': 14, 'k': 1, 'ik': 1, 'ok': 0, 'p': 0.0054863671241042935},
  {'o': 16, 'k': 5, 'ik': 1, 'ok': 4, 'p': 0.0037657901848445275},
  {'o': 18, 'k': 5, 'ik': 2, 'ok': 3, 'p': 0.01304661736934249},
  {'o': 19, 'k': 1, 'ik': 1, 'ok': 0, 'p': 0.0054863671241042935},
  {'o': 27, 'k': 1, 'ik': 0, 'ok': 1, 'p': 0.00296548416116013},
  {'o': 32, 'k': 1, 'ik': 1, 'ok': 0, 'p': 0.006166708255897718},
  {'o': 34, 'k': 4, 'ik': 2, 'ok': 2, 'p': 0.027469938137188017},
  {'o': 38, 'k': 2, 'ik': 0, 'ok': 2, 'p': 0.00296548416116013},
  {'o': 40, 'k': 1, 'ik': 0, 'ok': 1, 'p': 0.00296548416116013},
  {'o': 41, 'k'

## Loop for each day

In [207]:
month_dict = {}

for day in range(1, days_in_month+1):
    dict_for_current_day = (df[df.day==day])[["minute_in_day", "o", "k", "ik", "ok", "p"]].groupby('minute_in_day').apply(lambda g: g[["o", "k", "ik", "ok", "p"]].to_dict(orient='records')).to_dict()
    
    month_dict[day] = dict_for_current_day

## Save as json

In [208]:
import json

with open('07.json', 'w') as fp:
    json.dump(month_dict, fp)