## Preprocess data into 15 intervals

## Imports

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import functools
from IPython.display import Image
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import glob

from os.path import join

from processing_helper import extract_data


%matplotlib inline
pd.set_option("display.precision", 2)

In [2]:
PATH = join("data", "historia_przejazdow_2019-03.csv")
PATH

'data/historia_przejazdow_2019-03.csv'

In [3]:
df = pd.read_csv(PATH, index_col=0, parse_dates=True)

df.head()

Unnamed: 0,uid,bike_number,start_time,end_time,rental_place,return_place
0,64961590,57006,2019-03-01 14:29:06,2019-03-01 14:29:44,Poza stacją,Poza stacją
1,64961652,57006,2019-03-01 14:30:16,2019-03-01 14:33:00,Poza stacją,Poza stacją
2,64963262,57006,2019-03-01 15:03:42,2019-03-01 15:08:55,Poza stacją,Poza stacją
3,64719784,72558,2019-02-20 12:15:34,2019-03-03 01:39:33,Poza stacją,Poza stacją
4,65041465,57006,2019-03-04 06:16:44,2019-03-04 06:18:10,Poza stacją,Poza stacją


In [4]:
df['start_time']= pd.to_datetime(df['start_time']) 
df['end_time']= pd.to_datetime(df['end_time'])
df.head()

Unnamed: 0,uid,bike_number,start_time,end_time,rental_place,return_place
0,64961590,57006,2019-03-01 14:29:06,2019-03-01 14:29:44,Poza stacją,Poza stacją
1,64961652,57006,2019-03-01 14:30:16,2019-03-01 14:33:00,Poza stacją,Poza stacją
2,64963262,57006,2019-03-01 15:03:42,2019-03-01 15:08:55,Poza stacją,Poza stacją
3,64719784,72558,2019-02-20 12:15:34,2019-03-03 01:39:33,Poza stacją,Poza stacją
4,65041465,57006,2019-03-04 06:16:44,2019-03-04 06:18:10,Poza stacją,Poza stacją


## Divide in intervals

### Earliest rental 

In [5]:
start = df.start_time.min()
start = start.replace(hour=0, minute=0, second=0)
start

Timestamp('2019-02-20 00:00:00')

### Latest return

In [6]:
end = df.end_time.max()
end = end.replace(hour=0, minute=0, second=0)
end

Timestamp('2019-04-01 00:00:00')

In [7]:
ranges = pd.date_range(start, end,freq='15T')
ranges

DatetimeIndex(['2019-02-20 00:00:00', '2019-02-20 00:15:00',
               '2019-02-20 00:30:00', '2019-02-20 00:45:00',
               '2019-02-20 01:00:00', '2019-02-20 01:15:00',
               '2019-02-20 01:30:00', '2019-02-20 01:45:00',
               '2019-02-20 02:00:00', '2019-02-20 02:15:00',
               ...
               '2019-03-31 21:45:00', '2019-03-31 22:00:00',
               '2019-03-31 22:15:00', '2019-03-31 22:30:00',
               '2019-03-31 22:45:00', '2019-03-31 23:00:00',
               '2019-03-31 23:15:00', '2019-03-31 23:30:00',
               '2019-03-31 23:45:00', '2019-04-01 00:00:00'],
              dtype='datetime64[ns]', length=3841, freq='15T')

In [8]:
all_trips_in_all_intervals_df  = pd.DataFrame(columns=["interval_start", "interval_end", "rental_place", "return_place"])
all_trips_in_all_intervals_df   

Unnamed: 0,interval_start,interval_end,rental_place,return_place


In [9]:
for i in range(len(ranges)-1):
    interval_start, interval_end = ranges[i],ranges[i+1]
    
    interval_df = df[(df.start_time <= interval_end) & (df.end_time >= interval_start)]    
    
    
    interval_df["interval_start"] = interval_start
    interval_df["interval_end"] = interval_end
    
    all_trips_in_all_intervals_df = all_trips_in_all_intervals_df.append(interval_df[["interval_start", "interval_end", "rental_place", "return_place"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interval_df["interval_start"] = interval_start
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interval_df["interval_end"] = interval_end


In [10]:
all_trips_in_all_intervals_df

Unnamed: 0,interval_start,interval_end,rental_place,return_place
3,2019-02-20 12:15:00,2019-02-20 12:30:00,Poza stacją,Poza stacją
3,2019-02-20 12:30:00,2019-02-20 12:45:00,Poza stacją,Poza stacją
3,2019-02-20 12:45:00,2019-02-20 13:00:00,Poza stacją,Poza stacją
3,2019-02-20 13:00:00,2019-02-20 13:15:00,Poza stacją,Poza stacją
3,2019-02-20 13:15:00,2019-02-20 13:30:00,Poza stacją,Poza stacją
...,...,...,...,...
44360,2019-03-31 23:45:00,2019-04-01 00:00:00,Aleja Hallera / Mielecka,Grabiszyńska / Aleja Hallera
44361,2019-03-31 23:45:00,2019-04-01 00:00:00,Plac Powstańców Warszawy (Muzeum Narodowe),Szczęśliwa (Sky Tower)
44362,2019-03-31 23:45:00,2019-04-01 00:00:00,Plac Powstańców Warszawy (Muzeum Narodowe),Szczęśliwa (Sky Tower)
44372,2019-03-31 23:45:00,2019-04-01 00:00:00,Poza stacją,Plac Grunwaldzki / Polaka


## Group by rentsl/return place

In [11]:
intervals_grouped_df = pd.DataFrame(columns=["interval_start", "interval_end", "number_of_trips", "rental_place", "return_place"])

intervals_grouped_df

Unnamed: 0,interval_start,interval_end,number_of_trips,rental_place,return_place


In [16]:
for i in range(len(ranges)-1):
    interval_start, interval_end = ranges[i],ranges[i+1]
    
    interval_group_df = (all_trips_in_all_intervals_df[all_trips_in_all_intervals_df["interval_start"] == interval_start]).groupby(['rental_place', 'return_place'], as_index=False).count()

    interval_group_df = interval_group_df.rename(columns={"interval_start": "number_of_trips"})

    interval_group_df["interval_start"] = interval_start
    interval_group_df["interval_end"] = interval_end
    
    intervals_grouped_df = intervals_grouped_df.append(interval_group_df)

In [21]:
intervals_grouped_df

Unnamed: 0,interval_start,interval_end,number_of_trips,rental_place,return_place
0,2019-02-20 12:15:00,2019-02-20 12:30:00,1,Poza stacją,Poza stacją
0,2019-02-20 12:30:00,2019-02-20 12:45:00,1,Poza stacją,Poza stacją
0,2019-02-20 12:45:00,2019-02-20 13:00:00,1,Poza stacją,Poza stacją
0,2019-02-20 13:00:00,2019-02-20 13:15:00,1,Poza stacją,Poza stacją
0,2019-02-20 13:15:00,2019-02-20 13:30:00,1,Poza stacją,Poza stacją
...,...,...,...,...,...
19,2019-03-31 23:45:00,2019-04-01 00:00:00,1,Wróblewskiego (Teki),Wróblewskiego (Teki)
20,2019-03-31 23:45:00,2019-04-01 00:00:00,1,Wróblewskiego (ZOO),Poza stacją
21,2019-03-31 23:45:00,2019-04-01 00:00:00,1,Wyszyńskiego / Szczytnicka,Nowowiejska / Górnickiego
22,2019-03-31 23:45:00,2019-04-01 00:00:00,1,Wyszyńskiego / Szczytnicka,Sienkiewicza / Wyszyńskiego


## Save to CSV

In [22]:
intervals_grouped_df.to_csv(join("plik.csv"), index=False)