# Parsing intervals into json

## Imports

In [1]:
import pandas as pd
from os.path import join
from tqdm.notebook import tqdm

## Load csv

In [2]:
df = pd.read_csv(join("groupedby_intervals", "historia_przejazdow_2019-07.csv"), parse_dates=True)
df

Unnamed: 0,interval_start,interval_end,number_of_trips,rental_place,return_place
0,2019-07-01 00:00:00,2019-07-01 00:15:00,2,16,38
1,2019-07-01 00:00:00,2019-07-01 00:15:00,1,20,20
2,2019-07-01 00:00:00,2019-07-01 00:15:00,1,35,41
3,2019-07-01 00:00:00,2019-07-01 00:15:00,1,35,113
4,2019-07-01 00:00:00,2019-07-01 00:15:00,1,58,129
...,...,...,...,...,...
466072,2019-07-30 23:45:00,2019-07-31 00:00:00,2,187,202
466073,2019-07-30 23:45:00,2019-07-31 00:00:00,1,188,136
466074,2019-07-30 23:45:00,2019-07-31 00:00:00,1,191,18
466075,2019-07-30 23:45:00,2019-07-31 00:00:00,1,200,200


## Remove unnecessary columns

In [3]:
df = df[["interval_start", "rental_place", "return_place", "number_of_trips"]]
# df = df.rename(columns={"interval_start":"s","rental_place":"o","return_place":"d","number_of_trips":"c"})
df['interval_start']= pd.to_datetime(df['interval_start']) 
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['interval_start']= pd.to_datetime(df['interval_start'])


Unnamed: 0,interval_start,rental_place,return_place,number_of_trips
0,2019-07-01 00:00:00,16,38,2
1,2019-07-01 00:00:00,20,20,1
2,2019-07-01 00:00:00,35,41,1
3,2019-07-01 00:00:00,35,113,1
4,2019-07-01 00:00:00,58,129,1
...,...,...,...,...
466072,2019-07-30 23:45:00,187,202,2
466073,2019-07-30 23:45:00,188,136,1
466074,2019-07-30 23:45:00,191,18,1
466075,2019-07-30 23:45:00,200,200,1


## Add day column

In [4]:
df["day"] = df["interval_start"].dt.day
df

Unnamed: 0,interval_start,rental_place,return_place,number_of_trips,day
0,2019-07-01 00:00:00,16,38,2,1
1,2019-07-01 00:00:00,20,20,1,1
2,2019-07-01 00:00:00,35,41,1,1
3,2019-07-01 00:00:00,35,113,1,1
4,2019-07-01 00:00:00,58,129,1,1
...,...,...,...,...,...
466072,2019-07-30 23:45:00,187,202,2,30
466073,2019-07-30 23:45:00,188,136,1,30
466074,2019-07-30 23:45:00,191,18,1,30
466075,2019-07-30 23:45:00,200,200,1,30


## Change hour to a "minute in day" form

In [5]:
df["minute_in_day"] = df["interval_start"].dt.hour*60 + df["interval_start"].dt.minute
df

Unnamed: 0,interval_start,rental_place,return_place,number_of_trips,day,minute_in_day
0,2019-07-01 00:00:00,16,38,2,1,0
1,2019-07-01 00:00:00,20,20,1,1,0
2,2019-07-01 00:00:00,35,41,1,1,0
3,2019-07-01 00:00:00,35,113,1,1,0
4,2019-07-01 00:00:00,58,129,1,1,0
...,...,...,...,...,...,...
466072,2019-07-30 23:45:00,187,202,2,30,1425
466073,2019-07-30 23:45:00,188,136,1,30,1425
466074,2019-07-30 23:45:00,191,18,1,30,1425
466075,2019-07-30 23:45:00,200,200,1,30,1425


## Count the number of days in this month

In [6]:
days_in_month = df["interval_start"].dt.daysinmonth.max()
days_in_month

31

## Remove unnecessary columns and rename rest  with {"rental_place":"o","return_place":"d", "number_of_trips":"c"}

In [7]:
df = df[["day", "minute_in_day", "rental_place", "return_place", "number_of_trips"]]

df = df.rename(columns={"rental_place":"o","return_place":"d", "number_of_trips":"c"})
df

Unnamed: 0,day,minute_in_day,o,d,c
0,1,0,16,38,2
1,1,0,20,20,1
2,1,0,35,41,1
3,1,0,35,113,1
4,1,0,58,129,1
...,...,...,...,...,...
466072,30,1425,187,202,2
466073,30,1425,188,136,1
466074,30,1425,191,18,1
466075,30,1425,200,200,1


## Example for a single day

In [8]:
(df[df.day==1])[["minute_in_day", "o", "d", "c"]].groupby('minute_in_day').apply(lambda g: g[["o", "d", "c"]].sort_values(by=['c'], ascending=False).to_dict(orient='records')).to_dict()

{0: [{'o': 16, 'd': 38, 'c': 2},
  {'o': 125, 'd': 131, 'c': 2},
  {'o': 188, 'd': 77, 'c': 2},
  {'o': 182, 'd': 176, 'c': 2},
  {'o': 172, 'd': 61, 'c': 2},
  {'o': 108, 'd': 58, 'c': 2},
  {'o': 126, 'd': 49, 'c': 2},
  {'o': 188, 'd': 188, 'c': 1},
  {'o': 178, 'd': 196, 'c': 1},
  {'o': 154, 'd': 97, 'c': 1},
  {'o': 141, 'd': 32, 'c': 1},
  {'o': 125, 'd': 195, 'c': 1},
  {'o': 125, 'd': 142, 'c': 1},
  {'o': 112, 'd': 112, 'c': 1},
  {'o': 20, 'd': 20, 'c': 1},
  {'o': 111, 'd': 91, 'c': 1},
  {'o': 108, 'd': 153, 'c': 1},
  {'o': 108, 'd': 108, 'c': 1},
  {'o': 103, 'd': 18, 'c': 1},
  {'o': 74, 'd': 72, 'c': 1},
  {'o': 60, 'd': 68, 'c': 1},
  {'o': 58, 'd': 129, 'c': 1},
  {'o': 35, 'd': 113, 'c': 1},
  {'o': 35, 'd': 41, 'c': 1},
  {'o': 200, 'd': 121, 'c': 1}],
 15: [{'o': 139, 'd': 104, 'c': 3},
  {'o': 77, 'd': 148, 'c': 3},
  {'o': 16, 'd': 28, 'c': 2},
  {'o': 69, 'd': 69, 'c': 2},
  {'o': 126, 'd': 49, 'c': 2},
  {'o': 16, 'd': 38, 'c': 2},
  {'o': 89, 'd': 163, 'c': 2

## Loop for each day

In [9]:
month_dict = {}

for day in range(1, days_in_month+1):
    dict_for_current_day = (df[df.day==day])[["minute_in_day", "o", "d", "c"]].groupby('minute_in_day').apply(lambda g: g[["o", "d", "c"]].sort_values(by=['c'], ascending=False).to_dict(orient='records')).to_dict()
    
    month_dict[day] = dict_for_current_day

## Save as json

In [10]:
import json

with open('07.json', 'w') as fp:
    json.dump(month_dict, fp)