In [9]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import seaborn as sns
import altair as alt

DIR_ROOT = os.path.join(pathlib.Path().absolute(), '../../..' )

In [10]:
# list of all metrices used in the analysis
metrics = [
    #'act_day', 
    #'act_stay',
    'act_sum',
    'day_sum',
    'stay_avg',
    'stay_sum',
    
    'travel',
 
    'centr_closes',
    'centr_between',
    'centr_eigen',
    'centr_infos',
    
    'infl_dist',
    'infl_reg',
    'infl_sum',
]

In [11]:
# merge all metrices with the list of destinations

df = pd.read_csv(DIR_ROOT + '/data/02_processed/destinations.csv', index_col='name')

for metric in metrics:
    df_metric = pd.read_csv(DIR_ROOT + '/data/05_metrics/' + metric + '.csv', index_col=0)
    df_metric.rename(columns={
       'p1': metric + '_p1', 
       'p2': metric + '_p2',
       'p3': metric + '_p3',
       'p4': metric + '_p4',
       'p5': metric + '_p5',
       'pall': metric + '',
    }, inplace=True)
    df_metric.drop(columns=['geometry'], inplace=True) 
    df_metric = (df_metric - df_metric.min()) / (df_metric.max() - df_metric.min())

    # merge to the table with all destinations
    df = pd.concat([df, df_metric], axis=1, join="inner")

In [12]:
df['median'] = df.apply(lambda x: x[metrics].median(), axis=1)
df['median_p1'] = df.apply(lambda x: x[[m + '_p1' for m in metrics]].median(), axis=1)
df['median_p2'] = df.apply(lambda x: x[[m + '_p2' for m in metrics]].median(), axis=1)
df['median_p3'] = df.apply(lambda x: x[[m + '_p3' for m in metrics]].median(), axis=1)
df['median_p4'] = df.apply(lambda x: x[[m + '_p4' for m in metrics]].median(), axis=1)
df['median_p5'] = df.apply(lambda x: x[[m + '_p5' for m in metrics]].median(), axis=1)
df.sort_values('median', ascending=False)

Unnamed: 0.1,Unnamed: 0,id,no_alle,no_activity,no_reise,no_bi,no_gg,x,y,prazision,...,infl_sum_p3,infl_sum_p4,infl_sum_p5,infl_sum,median,median_p1,median_p2,median_p3,median_p4,median_p5
Praha,419,421.0,401.0,172.0,176.0,44.0,9.0,50.090,14.410,1.0,...,1.000000,1.000000,1.000000,1.000000,1.000000,0.211052,0.971662,1.000000,1.000000,1.000000
Wien,631,632.0,271.0,116.0,124.0,23.0,8.0,48.221,16.388,1.0,...,0.358025,0.893333,,0.723005,0.672913,0.718664,0.863019,0.325564,0.635837,0.000000
Brno,45,44.0,103.0,42.0,47.0,10.0,4.0,49.190,16.610,1.0,...,0.123457,0.413333,0.411765,0.300469,0.421920,0.000000,0.313725,0.169520,0.497014,0.546875
Krems an der Donau,228,229.0,73.0,30.0,30.0,6.0,7.0,48.410,15.620,1.0,...,,0.026667,,0.173709,0.252561,0.966667,0.530044,0.000000,0.114588,0.000000
Písek,395,396.0,28.0,13.0,13.0,1.0,1.0,49.309,14.148,1.0,...,0.148148,0.093333,,0.117371,0.251821,0.000000,0.225850,0.308691,0.127721,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Marchegg,279,280.0,7.0,2.0,2.0,2.0,1.0,48.257,16.891,1.0,...,,,,,0.001594,0.000000,0.000000,0.000000,0.006889,0.000000
Maribor,282,283.0,11.0,5.0,5.0,0.0,1.0,46.554,15.648,1.0,...,,,,,0.000675,0.000000,0.000000,0.000000,0.004455,0.000000
Seitenstetten,495,498.0,5.0,1.0,1.0,3.0,0.0,48.036,14.655,1.0,...,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Čáslav,53,52.0,5.0,1.0,1.0,2.0,1.0,49.910,15.389,1.0,...,,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
corr = df[metrics].corr()
# corr.mean())

mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(15,10))
cscheme = cm.get_cmap('Reds', 256)
newcolors = cscheme(np.linspace(0, 1, 256))

#for metric in corr.index:
#    corr.at[metric, metric] = (corr.sum()[metric] - 1) / len(metrics)

black = np.array([0.4,0.4,0.4,1])

# white color at the beginning of the scale, black at the end
newcolors[255:256, :] = black

newcmp = ListedColormap(newcolors)

sns.heatmap(corr, cmap=newcmp, annot=True)

In [14]:
corr.index = corr.index + str(corr.mean()[corr.index])

In [15]:
# get order for each metric

df_matrices_order = pd.DataFrame(index=df.index)

for metric in metrics:
    df_copy = df.copy()
    df_copy.sort_values(metric, inplace=True, ascending=False)
    df_copy.reset_index(inplace=True)
    df_copy['rank'] = df_copy.index.values
    df_copy.set_index('index', inplace=True)
    df_matrices_order[metric] = df_copy[['rank']] + 1

df_matrices_order.loc['Praha']

act_sum           1
day_sum           1
stay_avg          8
stay_sum          1
travel            1
centr_closes      8
centr_between     2
centr_eigen       1
centr_infos       1
infl_dist        25
infl_reg          1
infl_sum          1
Name: Praha, dtype: int64

In [19]:
max_rank = 10

# get list of important places
imp_places = []
for metric in metrics:
    imp_places_m = df_matrices_order[df_matrices_order[metric] <= max_rank].index
    imp_places.extend(imp_places_m)

imp_places = np.unique(np.array(imp_places))


imp_places = df_matrices_order.loc[imp_places]
imp_places['name'] = imp_places.index

alt.renderers.set_embed_options(
    padding={"left": 5, "right": 10, "bottom": 5, "top": 0}
)

rank_chart = alt.Chart(imp_places).transform_window(
    index='count()'
).transform_fold(
    metrics
).properties(
    width=1000,
    height=600
).encode(
    color=alt.Color('name:N', legend=None),
    x=alt.X(
        'key:N',
        axis=alt.Axis(title='metrics'),
        sort=metrics
    ),
    y=alt.Y(
        'value:Q', 
        scale=alt.Scale(zero=False, domain=[max_rank + 0.1,0.1], nice=False),
        axis=alt.Axis(values=[i for i in range(1, max_rank+1)], tickMinStep=0.1, title='rank', format='.0f'),
    ),
)

lines = rank_chart.mark_line(
    clip=True,
    strokeWidth=.5,
    strokeDash=[5,3]
)

circles = rank_chart.mark_circle(
    clip=True,
    size=100,
    strokeOpacity=1,
    strokeWidth=1.5
).encode(
    stroke=alt.Color('name:N', legend=None),
)

labels = rank_chart.mark_text(
    clip=True,
    align='center',
    baseline='middle',
    dy=-15,
    fontWeight='bold',
    fill='black',
).encode(
    y=alt.Y('value:Q'),
    text='name:N',
)

lines + circles + labels

In [17]:
time_medians = ['median_p1', 'median_p2', 'median_p3', 'median_p4', 'median_p5']

time_df = df[time_medians + ['region1', 'median']]
time_df['name'] = time_df.index
time_df.rename(columns= {
    'median_p1': 'period 1',
    'median_p2': 'period 2', 
    'median_p3': 'period 3', 
    'median_p4': 'period 4', 
    'median_p5': 'period 5'
}, inplace=True)
time_medians = ['period 1', 'period 2', 'period 3', 'period 4', 'period 5']

locations_by_median = time_df.sort_values(['median'], inplace=True, ascending=False)

time_df = time_df[time_df['median'] > 0.05]

time_chart = alt.Chart(time_df).transform_window(
    index='count()'
).transform_fold(
    time_medians
).encode(
    color=alt.Color('name:N', legend=None),
    x=alt.X(
        'name:N',
        axis=alt.Axis(title='localities'),
        sort=locations_by_median
    ),
    y=alt.Y(
        'key:N',
        scale=alt.Scale(zero=False, domain=time_medians, nice=False),
    ),
)

circles = time_chart.mark_circle(
    clip=True,
    strokeOpacity=1,
    strokeWidth=1.5
).encode(
    size=alt.Size('value:Q', legend=None),
    color=alt.Color('region1:N', legend=None),
)

circles

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [18]:
df.to_csv(DIR_ROOT + '/data/06_outputs/localities.csv')