In [125]:
import numpy as np 
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Grouper
from scipy import stats
import pandas_profiling 

from bokeh.io import output_file, show, output_notebook, reset_output
from bokeh.layouts import column, row, gridplot, layout, widgetbox
from bokeh.models import Button, CheckboxGroup, ColumnDataSource, Slider, Div, DatetimeTickFormatter, Span
from bokeh.models.widgets import Toggle, CheckboxButtonGroup
from bokeh.palettes import Set3, Spectral6, Colorblind, Viridis
from bokeh.plotting import figure, curdoc

from bokeh.transform import factor_cmap
from bokeh.tile_providers import get_provider, Vendors
from bokeh.models.tools import HoverTool

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import geopy.distance
from pyproj import Proj, transform

# reset_output()
output_notebook()
curdoc().theme = 'dark_minimal'


In [3]:
renfe_csv = pd.read_csv('data/renfe.csv')

renfe_csv = renfe_csv.sample(n=20000, random_state=111) # TODO decidir si dejar o quitar

renfe_csv.head()

Unnamed: 0.1,Unnamed: 0,insert_date,origin,destination,start_date,end_date,train_type,price,train_class,fare
1470039,1470039,2019-05-08 04:54:40,VALENCIA,MADRID,2019-05-22 17:30:00,2019-05-23 00:12:00,MD-LD,28.1,Turista con enlace,Promo +
1199208,1199208,2019-05-02 23:04:01,MADRID,SEVILLA,2019-05-29 16:00:00,2019-05-29 18:30:00,AVE,69.4,Preferente,Promo
808095,808095,2019-04-26 01:22:17,SEVILLA,MADRID,2019-05-12 20:36:00,2019-05-12 23:14:00,ALVIA,,Turista,Flexible
1368935,1368935,2019-05-06 09:21:09,SEVILLA,MADRID,2019-05-06 17:45:00,2019-05-06 20:17:00,AVE,76.3,Turista,Flexible
263616,263616,2019-04-21 06:05:12,SEVILLA,MADRID,2019-04-26 13:40:00,2019-04-26 16:10:00,AVE,76.3,Turista,Flexible


In [4]:
# renaming and setting type to columns
renfe_csv.columns = ['id', 'insert_date', 'origin', 'destination', 'start_date',
       'end_date', 'train_type', 'price', 'train_class', 'fare']
renfe_csv = renfe_csv.drop('id',axis = 1)
renfe_csv['insert_date'] = pd.to_datetime(renfe_csv['insert_date'])
renfe_csv['end_date'] = pd.to_datetime(renfe_csv['end_date'])
renfe_csv['start_date'] = pd.to_datetime(renfe_csv['start_date'])
    
renfe_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 1470039 to 1400333
Data columns (total 9 columns):
insert_date    20000 non-null datetime64[ns]
origin         20000 non-null object
destination    20000 non-null object
start_date     20000 non-null datetime64[ns]
end_date       20000 non-null datetime64[ns]
train_type     20000 non-null object
price          17503 non-null float64
train_class    19928 non-null object
fare           19928 non-null object
dtypes: datetime64[ns](3), float64(1), object(5)
memory usage: 1.5+ MB


In [5]:
renfe_csv.isnull().mean()*100

insert_date     0.000
origin          0.000
destination     0.000
start_date      0.000
end_date        0.000
train_type      0.000
price          12.485
train_class     0.360
fare            0.360
dtype: float64

In [6]:
# price null values per month
renfe_csv[renfe_csv['price'].isnull()].groupby(Grouper(key='start_date' ,freq='M')).\
agg('count').apply(lambda x: 100*x/renfe_csv.shape[0]).reset_index().iloc[:,[0,2]]

Unnamed: 0,start_date,origin
0,2019-04-30,3.915
1,2019-05-31,8.08
2,2019-06-30,0.45
3,2019-07-31,0.04


In [7]:
# We drop null data, since we just want to comunicate
renfe_clean = renfe_csv[~renfe_csv['price'].isnull()]
renfe_clean.isnull().any()

insert_date    False
origin         False
destination    False
start_date     False
end_date       False
train_type     False
price          False
train_class    False
fare           False
dtype: bool

In [8]:
#Choice B: input mising data

# # using mode if we can
# renfe_csv['train_class'].fillna(rail_data['train_class'].mode()[0], inplace=True)
# renfe_csv['fare'].fillna(rail_data['fare'].mode()[0], inplace=True)
# # using mean if not
# renfe_csv.loc[renfe_csv.price.isnull(), 'price'] = renfe_csv.groupby('fare').price.transform('mean')
# renfe_clean = renfe_csv

In [9]:
# compute new usefull columns 
pd.options.mode.chained_assignment = None
renfe_clean['st_hour'] = renfe_clean['start_date'].dt.hour
renfe_clean['st_hour_counts'] = renfe_clean['st_hour'].map(dict(renfe_clean.st_hour.value_counts()))
renfe_clean['end_hour'] = renfe_clean['end_date'].dt.hour
renfe_clean['end_hour_counts'] = renfe_clean['end_hour'].map(dict(renfe_clean.end_hour.value_counts()))


renfe_clean['duration'] = ( renfe_clean['end_date'] - renfe_clean['start_date'] ) / np.timedelta64(1, 'm')
renfe_clean['duration_counts'] = renfe_clean['duration'].map(dict(renfe_clean.duration.value_counts()))

renfe_clean['weekday'] = renfe_clean['insert_date'].dt.weekday_name
renfe_clean['weekday_counts'] = renfe_clean['weekday'].map(dict(renfe_clean.weekday.value_counts()))

renfe_clean['st_monthday'] = renfe_clean['start_date'].dt.to_period('D')

renfe_clean['month'] = renfe_clean['insert_date'].dt.month
renfe_clean['month_counts'] = renfe_clean['month'].map(dict(renfe_clean.weekday.value_counts()))





In [29]:
# geo_loc = Nominatim(user_agent="practica_visualizacion")
# geocode = RateLimiter(geo_loc.geocode, min_delay_seconds=1)

# pos_lat_lon = {}

# for o in renfe_clean['origin'].unique():
#     loc=geocode(o)
#     pos_lat_lon[o]=(loc.latitude,loc.longitude)
    
# Obtenido de las lineas anteriores, se deja hardcoded porque en ocasiones se obtiene: geopy.exc.GeocoderUnavailable: Service not available
# De esta forma se evita el error en ejecuciones completas
pos_lat_lon = {'VALENCIA': (39.4699014, -0.3759513),
 'MADRID': (40.4167047, -3.7035825),
 'SEVILLA': (37.3886303, -5.9953403),
 'BARCELONA': (41.3828939, 2.1774322),
 'PONFERRADA': (42.5454124, -6.5938719)}

# get_distance = lambda coords_1, coords_2: geopy.distance.geodesic(coords_1, coords_2).km
travels_dist = {}
for k1, v1 in pos_lat_lon.items():
    for k2, v2 in pos_lat_lon.items():
        if k1 is not k2:
            dist_ = geopy.distance.geodesic(v1,v2).km
            travels_dist[f"{k1}-{k2}"] = dist_
            travels_dist[f"{k2}-{k1}"] = dist_
                    
# travels_dist

renfe_clean['distance'] = renfe_clean['origin'] + '-' + renfe_clean['destination']
renfe_clean['distance'] = renfe_clean['distance'].map(travels_dist)

pd.options.mode.chained_assignment = 'warn'

# for o in renfe_clean['origin'].unique():
#     loc=geocode(o)
#     lat[o],lon[o]=loc.latitude,loc.longitude
# lat,lon   
# renfe_clean['distance']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


NameError: name 'geocode' is not defined

In [288]:
# pos_lat_lon

{'VALENCIA': (39.4699014, -0.3759513),
 'MADRID': (40.4167047, -3.7035825),
 'SEVILLA': (37.3886303, -5.9953403),
 'BARCELONA': (41.3828939, 2.1774322),
 'PONFERRADA': (42.5454124, -6.5938719)}

In [13]:
# renfe_clean['travel_time_in_mins']
# renfe_clean = renfe_clean.sample(n=20000, random_state=111) # TODO decidir si dejar o quitar
# renfe_clean.iloc[:,9:11].plot.hist()


In [None]:
# count_  = renfe_clean['end_date'].dt.date.value_counts()
# count_ = count_[:50,]
# count_

SyntaxError: invalid syntax (<ipython-input-14-49f78907af7e>, line 2)

In [None]:
# pandas_profiling.ProfileReport(renfe_clean)

In [None]:
# TODO borrar?
def get_other_cols(df, filter_tar):
    filter_tar = [filter_tar] if type(filter_tar) is str else list(filter_tar)
    return list(filter(lambda x: False if x in list(filter_tar) else True,list(df.columns)))

In [11]:
# Bokeh global definitions
BASIC_TOOLTIP = [("(x,y)", "($x, $y)")]

HISTO_TOOLTIP = [("end_hour","@x"), ("top", "@top")]
TOOLS = ["pan,tap,box_select,box_zoom,lasso_select,wheel_zoom,save,reset"]

date_formttr = lambda x: DatetimeTickFormatter(
        hours=["%d-%B %Hh:%M"],
        days=["%d-%B"],
        months=["%B-%Y"],
        years=["%Y"],
    )

src_std_renfe = ColumnDataSource(data=renfe_clean.sample(n=10000, random_state=1))
std_height, std_width =450, 650


In [629]:

# date_formttr(1)
# renfe_clean[['start_date', 'price', 'end_date']][:100]
# print(renfe_clean.origin.unique())
# list(renfe_clean.columns)
# renfe_clean.columns
# 389911
# 389914

In [12]:
ts_col_selection = ['st_monthday', 'start_date', 'train_type','price','end_hour','duration', 'weekday', 'distance']
ts_src = ColumnDataSource(data=renfe_clean\
                          .sample(n=10000, random_state=1)[ts_col_selection]\
                          .groupby('st_monthday'))

In [13]:
# ts_src.to_df()['duration_mean'].map(pd.to_numeric).where(lambda x: x > 385).dropna().head(20)
# duration_mean: 399.904762
gpf_df = ts_src.to_df()

In [None]:
# ts_hover = HoverTool(
#     tooltips=[
#         ( 'date',   '@st_monthday{%F}'            ),
#         ( 'y',  '$y' ), # use @{ } for field names with spaces
# #         ( 'volume', '@volume{0.00 a}'      ),
#     ],

#     formatters={
# #         'date'      : 'datetime', # use 'datetime' formatter for 'date' field
# #         'adj close' : 'printf',   # use 'printf' formatter for 'adj close' field
#                                   # use default 'numeral' formatter for other fields
#     },

#     # display a tooltip whenever the cursor is vertically in line with a glyph
#     mode='vline'
# )


In [15]:
# OLD
# gen_dots = lambda var, color_: ts_plot.line('st_monthday', var, color=color_,
#                                             line_width=2,
#                                             legend=var+' ',
#                                             source = ts_src)

def get_gliph_and_butt(var_n, color_, active_=False):
    gliph =                 ts_plot.line('st_monthday', var_n, color=color_,
                                         line_width=2,
                                         legend=var_n+' ',
                                         visible=active_,
                                         source = ts_src)
    butt = Toggle(label=f"{var_n}", 
                  button_type="primary",
                  active=active_)
    butt.js_link('active', gliph, 'visible')
    return(gliph, butt)

In [467]:
print(ts_src.column_names)

['st_monthday', 'price_count', 'price_mean', 'price_std', 'price_min', 'price_25%', 'price_50%', 'price_75%', 'price_max', 'end_hour_count', 'end_hour_mean', 'end_hour_std', 'end_hour_min', 'end_hour_25%', 'end_hour_50%', 'end_hour_75%', 'end_hour_max', 'duration_count', 'duration_mean', 'duration_std', 'duration_min', 'duration_25%', 'duration_50%', 'duration_75%', 'duration_max', 'distance_count', 'distance_mean', 'distance_std', 'distance_min', 'distance_25%', 'distance_50%', 'distance_75%', 'distance_max']


In [211]:
# print(ts_src.column_names)

DATE_TOOLTIP = lambda var_n: [
#     ('date',   f'@{var_n}'),
    ("day_mean", "$y")]


ts_plot = figure(title="Time series",
                 x_axis_type='datetime',
                 y_axis_type='log',
                 tools=TOOLS,
                 tooltips=DATE_TOOLTIP('st_monthday'),
                 height=round(std_height*1.3), width=round(std_width*2.35))

# ts_plot.add_tools(ts_hover)
# ploting_cat = renfe_clean['origin'].unique()


                                
butt_list = []
butt_appender = lambda tup: (tup[0], butt_list.append(tup[1]))

(pm, _) = butt_appender(get_gliph_and_butt('price_count', 'white', True))


(pm, _) = butt_appender(get_gliph_and_butt('price_mean', 'lightcoral', True))
(dm, _) = butt_appender(get_gliph_and_butt('duration_mean', 'lightblue', True))
(dim, _) = butt_appender(get_gliph_and_butt('distance_mean', 'gold', False))


(p5, _) = butt_appender(get_gliph_and_butt('price_50%', 'firebrick', False))
(d5, _) = butt_appender(get_gliph_and_butt('duration_50%', 'RoyalBlue', False))
(di5, _) = butt_appender(get_gliph_and_butt('distance_50%', 'goldenrod', False))

# greenyellow, green
(psd, _) = butt_appender(get_gliph_and_butt('price_std', 'OrangeRed', False))
(dsd, _) = butt_appender(get_gliph_and_butt('duration_std', 'SteelBlue', False))
(disd, _) = butt_appender(get_gliph_and_butt('distance_std', 'SandyBrown', False))
# d_5.visible=False

ts_span = Span(location=1559606400000,
                            dimension='height', line_color='ghostwhite',
                            line_dash='dashed', line_width=3)
ts_plot.add_layout(ts_span)

ts_plot.xaxis.formatter = date_formttr(_)
ts_plot.xaxis.major_label_orientation = 3.14/4
ts_plot.legend.orientation = "horizontal"
ts_plot.legend.location = "bottom_left"

txt_div = Div(text="""
<h4> Press buttons to toggle metrics visibility on/off</h4>
"""
          )
# output_file("toggle.html")

butt_chunks = [row(butt_list[x:x+5]) for x in range(0, len(butt_list), 5)]


# show(column([row(butt_list), ts_plot]))
rows_to_shw = butt_chunks + [txt_div]
rows_to_shw.append(ts_plot)


show(column(rows_to_shw))

In [18]:
plot_a = figure(title="Plot",x_axis_type='datetime',
              tools=TOOLS,
              height=std_height, width=std_width)

ploting_cat = renfe_clean['origin'].unique()

r2 = plot_a.circle('start_date', 'price',
                 legend='origin',  
                 color=factor_cmap('origin', palette=Colorblind[len(ploting_cat)],
                                        factors=ploting_cat),
                 source = src_std_renfe)

plot_a.xaxis.formatter = date_formttr(_)
plot_a.xaxis.major_label_orientation = 3.14/4




# show(plot_a)

In [23]:
end_hour_bar = figure(title="Plot", height=std_height, width=std_width,
                     tools=TOOLS
#                       tooltips=HISTO_TOOLTIP
                     )

# counts_ = renfe_clean.end_hour.value_counts()
# end_hour_bar.vbar(x=counts_.index,
#                   top=counts_.values, width=0.9, alpha=0.7)


df = src_std_renfe.to_df()

counts_ = df.end_hour.value_counts()
end_hour_bar.vbar(x=counts_.index, top=counts_.values,
                  color='firebrick', width=0.9, alpha=0.7)


# show(end_hour_bar)

In [24]:
# TODO definir funcion para hacer histogramas de cualquier variable
end_hour_bar = figure(title="Plot", height=std_height, width=std_width, tools=TOOLS)

end_hour_bar.vbar(
                  source = src_std_renfe, x='end_hour', top='end_hour_counts',
                  width=0.9, alpha=1, color='firebrick')

# show(end_hour_bar)

In [None]:
# show(gridplot([[end_hour_bar, plot]]))

In [None]:




# renfe_clean['end_hour_counts'] = renfe_clean['end_hour'].map(dict(renfe_clean.end_hour.value_counts()))
# # .value_counts()
# renfe_clean[['end_hour_counts','end_hour']].head(5)

In [None]:
# dict(renfe_clean.end_hour.value_counts())
# # xxx1=sorted(renfe_clean['end_hour'].unique())
# xxx2=renfe_clean.end_hour.value_counts().sort()
# # xxx1
# xxx2

In [None]:
# [1,3,2]

In [19]:
# lat

{'VALENCIA': 39.4699014,
 'MADRID': 40.4167047,
 'SEVILLA': 37.3886303,
 'BARCELONA': 41.3828939,
 'PONFERRADA': 42.5454124}

In [27]:
# lat,lon
# p.circle(list(lat.values()), list(lon.values()))
# def my_trans(pos_pair):
#     outProj = Proj(init='epsg:3857')
#     inProj = Proj(init='epsg:4326')
#     y1,x1 = list(lat.values())[1],list(lon.values())[1]
#     x2,y2 = transform(inProj,outProj,x1,y1)
#     print(x2,y2)

In [35]:
def my_trans(pos_pair):
    outProj = Proj(init='epsg:3857')
    inProj = Proj(init='epsg:4326')
    y1,x1 = pos_pair[0],pos_pair[1]
    x2,y2 = transform(inProj,outProj,x1,y1)
    return x2,y2

In [32]:
pos_lat_lon

{'VALENCIA': (39.4699014, -0.3759513),
 'MADRID': (40.4167047, -3.7035825),
 'SEVILLA': (37.3886303, -5.9953403),
 'BARCELONA': (41.3828939, 2.1774322),
 'PONFERRADA': (42.5454124, -6.5938719)}

In [36]:
pos_mercator = {}
lat_merc = {}
lon_merc = {}
for k,v in pos_lat_lon.items():
    v1, v2 = my_trans(v)
    lat_merc[k] = v1
    lon_merc[k] = v2


d_list = [lat_merc,lon_merc,
      dict(renfe_clean.origin.value_counts()),
      dict(renfe_clean.destination.value_counts())]
for k in d_list[0].keys():
  pos_mercator[k] = tuple(d[k] for d in d_list)
# d
pos_df = pd.DataFrame.from_dict(pos_mercator, orient='index')
pos_df = pos_df.reset_index()
pos_df.columns = ['city', 'lat', 'lon','origin_cnt', 'dest_cnt']
# Multiply to show in map
pos_df['origin_cnt_m'] = pos_df.origin_cnt*15
pos_df['dest_cnt_m'] = pos_df.dest_cnt*15

# pos_df

In [217]:

# Origin Map
MAP_TOOLTIP = lambda var: [("City","@city"), ("Count", f'@{var}')]
tile_provider = get_provider(Vendors.CARTODBPOSITRON_RETINA)
pos_src = ColumnDataSource(pos_df)

# range bounds supplied in web mercator coordinates
origin_map = figure(title="Origin Map",
                    tools=TOOLS,
                    height=std_height, width=std_width,
                    tooltips=MAP_TOOLTIP('origin_cnt'),
                    x_range=(-1200000, 700000), y_range=(4500000, 5100000),
#                     x_axis_type="mercator", y_axis_type="mercator"
                   )
origin_map.add_tile(tile_provider)

origin_map.circle('lat','lon',
                  radius='origin_cnt_m',
                  source=pos_src,
                  alpha=0.6,
                  color='black',
                  fill_color='firebrick'
                 )
origin_map.axis.visible = False

# Destination Map
dest_map = figure(title="Destination Map",
                    tools=TOOLS,
                    height=std_height, width=std_width,
                    tooltips=MAP_TOOLTIP('dest_cnt'),
                    x_range=origin_map.x_range, y_range=origin_map.y_range,
#                     x_axis_type="mercator", y_axis_type="mercator"
                 )
dest_map.add_tile(tile_provider)
dest_map.circle('lat','lon',
                  radius='dest_cnt_m',
                  source=pos_src,
                  alpha=0.6,
                  color='black',
                  fill_color='navy'
                 )
dest_map.axis.visible = False

# show(gridplot(([[origin_map],[dest_map]])))


In [220]:
cat_ = pos_df['city'].unique()
# Origin Histogram
origin_hist = figure(x_range=cat_, y_range=(0,12000),
                    tools=TOOLS,
                    tooltips=MAP_TOOLTIP('origin_cnt'),
                     plot_height=std_height,plot_width=round(std_width.65),
                     title="Origin Histogram")

origin_hist.vbar(x='city', top='origin_cnt',
       width=0.8, 
       color=factor_cmap('city', palette=Colorblind[len(cat_)], factors=cat_),
       source=pos_src)

origin_hist.xgrid.grid_line_color = None
origin_hist.xaxis.major_label_orientation = 3.14/4

# Destination Histogram
dest_hist = figure(x_range=cat_, y_range=(0,12000),
                    tools=TOOLS,
                    tooltips=MAP_TOOLTIP('origin_cnt'),
                   plot_height=std_height,plot_width=round(std_width*.65),
                   title="Destination Histogram")

dest_hist.vbar(x='city', top='dest_cnt',
       width=0.8, 
       color=factor_cmap('city', palette=Colorblind[len(cat_)], factors=cat_),
       source=pos_src)

dest_hist.xgrid.grid_line_color = None
dest_hist.xaxis.major_label_orientation = 3.14/4

# show(gridplot(([[origin_hist, dest_hist]])))

SyntaxError: invalid syntax (<ipython-input-220-994d7db2a9a6>, line 6)

In [222]:
values_ = renfe_clean['train_type'].value_counts()
tr_type_ttip = [("City","@x_range"), ("Count", '$y')]
# Destination Histogram
tr_type_hist = figure(x_range=list(values_.index), y_range=(1,max(values_)+500),
                      
#                     tools=TOOLS,
#                     tooltips=tr_type_ttip,
                   plot_height=round(std_height),plot_width=round(std_width),
                   title="Train Type Histogram")

tr_type_hist.vbar(x=values_.index, top=values_.values,
       width=0.8, 
#        color=factor_cmap('city', palette=Colorblind[len(cat_)], factors=cat_),
       )

tr_type_hist.xgrid.grid_line_color = None
tr_type_hist.xaxis.major_label_orientation = 3.14/4
# show(tr_type_hist)

In [223]:
# Origin / Destination layout
# output_file(0)
# show(gridplot(([[origin_map],[dest_map,origin_hist, dest_hist]])))
# show(row(origin_map,tr_type_hist))
# show(row(dest_map,origin_hist, dest_hist))
show(gridplot([[row(origin_map,tr_type_hist)], [row(dest_map,origin_hist, dest_hist)]]))
# show(row(origin_map,dest_map,dest_hist, origin_hist))


In [41]:
src_df = src_std_renfe.to_df()
# src_df.columns

st_h_cnt = src_df['st_hour'].value_counts()
end_h_cnt = src_df['end_hour'].value_counts()

In [577]:
# src_df['st_hour'].value_counts()[30]

KeyError: 30

In [42]:
h_keys = [i for i in range(24)]
st_hour_cnt = [st_h_cnt.get(i,0) for i in h_keys]
end_hour_cnt = [end_h_cnt.get(i,0) for i in h_keys]
src_std_2 = ColumnDataSource({'hour':h_keys, 'st_cnt':st_hour_cnt, 'end_cnt':end_hour_cnt})

In [69]:
def get_gliph(src, plot, var_n, color_, active_=False):
    circ =                 plot.circle('hour', var_n, color=color_,
                                         radius=.2,
#                                          legend=var_n+' ',
                                         visible=active_,
                                         source = src_std_2)
    line =                 plot.line('hour', var_n, color=color_,
                                         line_width=.6,
#                                          legend=var_n+' ',
                                     line_dash='dashed',
                                         visible=active_,
                                         source = src)
#     butt = Toggle(label=f"{var_n}", 
#                   button_type="primary",
#                   active=active_)
#     butt.js_link('active', gliph, 'visible')
#     return gliph

In [70]:
h_tootltip = [
    ('Hour',   '$xh'),
    ("Count", "$y")]

st_plot = figure(title="Start Hour Counts",
                 x_axis_type='datetime',
#                  y_axis_type='log',
                 tools=TOOLS,
                 tooltips=h_tootltip,
                 height=round(std_height*.8), width=round(std_width*2.35))

get_gliph(src_std_2, st_plot, 'st_cnt','lightcoral', True)

end_plot = figure(title="End Hour Counts",
                 x_axis_type='datetime',
                  x_range=st_plot.x_range, y_range=st_plot.y_range,
#                  y_axis_type='log',
                 tools=TOOLS,
                 tooltips=h_tootltip,
                 height=round(std_height*.8), width=round(std_width*2.35))

get_gliph(src_std_2, end_plot, 'end_cnt','SkyBlue', True)

hr_frtr = DatetimeTickFormatter(
milliseconds = ['%3Nh', '%S.%3Ns']
    )
st_plot.xaxis.formatter = hr_frtr
end_plot.xaxis.formatter = hr_frtr

show(gridplot([[st_plot], [end_plot]]))
# show(st_plot)


In [193]:
# grp_wd_eh = renfe_clean.groupby(['weekday','distance'])[['weekday','distance']]
# w_end_hour_src = ColumnDataSource(grp_wd_eh)
# grp_wd_sh = renfe_clean.groupby(['weekday','distance'])[['weekday','distance']]
# w_st_hour_src = ColumnDataSource(grp_wd_sh)
# grp_wd_eh[['weekday', 'end_hour']]
rp_wd_eh = renfe_clean.groupby('weekday')[['weekday','distance']]
w_end_hour_src = ColumnDataSource(rp_wd_eh)
w_end_hour_src.column_names
w_end_hour_src.to_df().head(15)

Unnamed: 0,weekday,distance_count,distance_mean,distance_std,distance_min,distance_25%,distance_50%,distance_75%,distance_max
0,Friday,1993.0,401.084997,85.082976,303.177513,303.177513,390.504192,506.922016,506.922016
1,Monday,2933.0,403.344425,85.842082,303.177513,303.177513,390.504192,506.922016,506.922016
2,Saturday,2039.0,399.823272,84.399222,303.177513,303.177513,390.504192,506.922016,506.922016
3,Sunday,2900.0,401.601734,84.596708,303.177513,303.177513,390.504192,506.922016,506.922016
4,Thursday,2322.0,409.043852,84.700591,303.177513,303.177513,390.504192,506.922016,506.922016
5,Tuesday,2784.0,404.725246,86.726516,303.177513,303.177513,390.504192,506.922016,506.922016
6,Wednesday,2532.0,403.174348,84.516889,303.177513,303.177513,390.504192,506.922016,506.922016


In [186]:
# w_st_hour_src_df = w_st_hour_src.to_df()
# w_st_hour_src_df['weekday'] = w_st_hour_src_df['weekday_st_hour'].map(lambda x: x[0])
# w_st_hour_src_df['st_hour'] = w_st_hour_src_df['weekday_st_hour'].map(lambda x: x[1])
# # w_st_hour_src_df.head(2)
# w_st_hour_src = ColumnDataSource(w_st_hour_src_df)
# w_st_hour_src.column_names

w_end_hour_src_df = w_end_hour_src.to_df()
w_end_hour_src_df['weekday'] = w_end_hour_src_df['weekday_distance'].map(lambda x: x[0])
w_end_hour_src_df['distance'] = w_end_hour_src_df['weekday_distance'].map(lambda x: x[1])
# w_st_hour_src_df.head(2)
w_end_hour_src = ColumnDataSource(w_end_hour_src_df)
# w_end_hour_src.column_names


Unnamed: 0,weekday_distance,distance_count,distance_mean,distance_std,distance_min,distance_25%,distance_50%,distance_75%,distance_max,weekday,distance
0,"(Friday, 303.17751344295993)",633.0,303.177513,0.0,303.177513,303.177513,303.177513,303.177513,303.177513,Friday,303.177513
1,"(Friday, 337.84652093217073)",104.0,337.846521,5.711869e-14,337.846521,337.846521,337.846521,337.846521,337.846521,Friday,337.846521
2,"(Friday, 390.50419160167525)",553.0,390.504192,0.0,390.504192,390.504192,390.504192,390.504192,390.504192,Friday,390.504192
3,"(Friday, 506.92201575586023)",703.0,506.922016,5.688389e-14,506.922016,506.922016,506.922016,506.922016,506.922016,Friday,506.922016
4,"(Monday, 303.17751344295993)",897.0,303.177513,5.687513e-14,303.177513,303.177513,303.177513,303.177513,303.177513,Monday,303.177513


In [187]:
w_end_hour_src_df.head(20)

Unnamed: 0,weekday_distance,distance_count,distance_mean,distance_std,distance_min,distance_25%,distance_50%,distance_75%,distance_max,weekday,distance
0,"(Friday, 303.17751344295993)",633.0,303.177513,0.0,303.177513,303.177513,303.177513,303.177513,303.177513,Friday,303.177513
1,"(Friday, 337.84652093217073)",104.0,337.846521,5.711869e-14,337.846521,337.846521,337.846521,337.846521,337.846521,Friday,337.846521
2,"(Friday, 390.50419160167525)",553.0,390.504192,0.0,390.504192,390.504192,390.504192,390.504192,390.504192,Friday,390.504192
3,"(Friday, 506.92201575586023)",703.0,506.922016,5.688389e-14,506.922016,506.922016,506.922016,506.922016,506.922016,Friday,506.922016
4,"(Monday, 303.17751344295993)",897.0,303.177513,5.687513e-14,303.177513,303.177513,303.177513,303.177513,303.177513,Monday,303.177513
5,"(Monday, 337.84652093217073)",196.0,337.846521,5.698898e-14,337.846521,337.846521,337.846521,337.846521,337.846521,Monday,337.846521
6,"(Monday, 390.50419160167525)",755.0,390.504192,5.68811e-14,390.504192,390.504192,390.504192,390.504192,390.504192,Monday,390.504192
7,"(Monday, 506.92201575586023)",1085.0,506.922016,5.686963e-14,506.922016,506.922016,506.922016,506.922016,506.922016,Monday,506.922016
8,"(Saturday, 303.17751344295993)",646.0,303.177513,0.0,303.177513,303.177513,303.177513,303.177513,303.177513,Saturday,303.177513
9,"(Saturday, 337.84652093217073)",111.0,337.846521,1.713036e-13,337.846521,337.846521,337.846521,337.846521,337.846521,Saturday,337.846521


In [205]:
w_st_hour_src_df['weekday'].unique()

array(['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
       'Wednesday'], dtype=object)

In [201]:
weekdat_cat = w_st_hour_src_df['weekday'].unique()
def get_gliph2(src, plot_, x_var_n, var_n, color_, active_=False):
    circ =                 plot_.vbar(x_var_n, top=var_n, 
                                        color=color_,
                                      width=.7,
                                      alpha=.8,
#                                          radius=.12,
#                                          legend='weekday',
                                         visible=active_,
                                        fill_color=factor_cmap('weekday', palette=Viridis[7],
                                        factors=weekdat_cat),
                                         source = src)
#     line =                 plot_.line(x_var_n, var_n, 
#                                       line_color=factor_cmap('weekday', palette=Colorblind[7],
#                                         factors=weekdat_cat),
#                                          line_width=.6,
#                                       legend='weekday',
#                                       line_dash='dashed',
#                                          visible=active_,
#                                          source = src)

In [206]:
h_tootltip = [
    ('Hour',   '$xh'),
    ("Count", "$y")]

# w_st_plot = figure(title="Start Hour Counts by Weekday",
# #                  x_axis_type='datetime',
# #                  y_axis_type='log',
#                  tools=TOOLS,
# #                  tooltips=h_tootltip,
#                  height=round(std_height*.8), width=round(std_width*2.35))

# get_gliph2(w_st_hour_src, w_st_plot,'st_hour', 'st_hour_count','black', True)

# # ts_plot.legend.orientation = "horizontal"
# w_st_plot.legend.location = "top_left"



w_end_plot = figure(title="Total distance by Weekday",
#                  x_axis_type='datetime',
                  x_range=['Monday', 'Tuesday', 'Wednesday','Thursday', 'Friday','Saturday', 'Sunday'],
#                     , y_range=w_st_plot.y_range,
#                  x_axis_type='datetime',
#                  y_axis_type='log',
                 tools=TOOLS,
#                  tooltips=h_tootltip,
                 height=round(std_height*1.3), width=round(std_width*2.35))
get_gliph2(w_end_hour_src, w_end_plot,'weekday', 'distance_count','black', True)
# w_end_plot.legend.location = "top_left"



w_st_plot.xaxis.formatter = hr_frtr
# w_end_plot.xaxis.formatter = hr_frtr

# ts_plot.legend.orientation = "horizontal"
# ts_plot.legend.location = "bottom_left"

# get_gliph(w_end_hour_src, w_end_plot, 'end_cnt','SkyBlue', True)

show(w_end_plot)
# show(gridplot([[w_st_plot], [w_end_plot]]))

## Localizacion Espacial

In [224]:
show(gridplot([[row(origin_map,tr_type_hist)], [row(dest_map,origin_hist, dest_hist)]]))

## Anális de horas de salida y entrada

In [171]:
show(gridplot([[st_plot], [end_plot]]))


## Distancia recorrida por día de la semana

In [207]:
show(w_end_plot)

In [225]:
show(column(rows_to_shw))