In [11]:
import pandas as pd
import eurostat

# Import data using the Eurostat API
#Short-stay accommodation offered via collaborative economy platforms 
#by months, residence of the guest and NUTS 1 and NUTS 2 region - experimental statistics
data = eurostat.get_data_df('TOUR_CE_OMN12')

data

Unnamed: 0,freq,indic_to,c_resid,month,unit,geo\TIME_PERIOD,2018,2019,2020,2021,2022,2023,2024
0,A,LSTY,DOM,M01,NR,AT,23783.0,27923.0,34156.0,17680.0,42738.0,47964.0,53079.0
1,A,LSTY,DOM,M01,NR,AT1,8096.0,9799.0,11922.0,11908.0,15822.0,17954.0,20805.0
2,A,LSTY,DOM,M01,NR,AT11,239.0,374.0,563.0,123.0,433.0,762.0,902.0
3,A,LSTY,DOM,M01,NR,AT12,790.0,1323.0,1524.0,829.0,2096.0,2655.0,2956.0
4,A,LSTY,DOM,M01,NR,AT13,7067.0,8102.0,9835.0,10956.0,13293.0,14537.0,16947.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45040,A,STY,TOTAL,TOTAL,NR,SK0,201570.0,282091.0,164077.0,143318.0,260114.0,377298.0,
45041,A,STY,TOTAL,TOTAL,NR,SK01,83713.0,127479.0,44507.0,38707.0,72293.0,123117.0,
45042,A,STY,TOTAL,TOTAL,NR,SK02,16825.0,22662.0,15411.0,16703.0,29347.0,40974.0,
45043,A,STY,TOTAL,TOTAL,NR,SK03,57106.0,68598.0,58203.0,47117.0,87260.0,111667.0,


In [12]:
#check all parameter in dataset
pars = eurostat.get_pars('TOUR_CE_OMN12')
pars

['freq', 'indic_to', 'c_resid', 'month', 'unit', 'geo']

In [13]:
#Drop columns'freq','unit'
data.drop(columns=['freq','unit'], errors='ignore', inplace=True)

#add column geo_layer to label geographic regions (NUTS codes).
df = pd.DataFrame(data)
df['geo_layer'] = df['geo\TIME_PERIOD'].apply(lambda x: 'NUT1' if len(x) == 3 else ('NUT2' if len(x) == 4 else ('Country' if len(x) == 2 else '[EU27_2020]')))
#put column "geo" besides column "geo_layer" 
col_order = ['geo\TIME_PERIOD', 'geo_layer'] + [col for col in df.columns if col not in ['geo\TIME_PERIOD', 'geo_layer']]
df = df[col_order]

#Drop row where 'c_resid' and 'month' value is total
df = df[df['c_resid'] != 'TOTAL']
df = df[df['month'] != 'TOTAL']

#Change colomn label for 'geo\TIME_PERIOD' to 'geo'
df.columns.values[df.columns.get_loc('geo\TIME_PERIOD')] = 'geo'

df

Unnamed: 0,geo,geo_layer,indic_to,c_resid,month,2018,2019,2020,2021,2022,2023,2024
0,AT,Country,LSTY,DOM,M01,23783.0,27923.0,34156.0,17680.0,42738.0,47964.0,53079.0
1,AT1,NUT1,LSTY,DOM,M01,8096.0,9799.0,11922.0,11908.0,15822.0,17954.0,20805.0
2,AT11,NUT2,LSTY,DOM,M01,239.0,374.0,563.0,123.0,433.0,762.0,902.0
3,AT12,NUT2,LSTY,DOM,M01,790.0,1323.0,1524.0,829.0,2096.0,2655.0,2956.0
4,AT13,NUT2,LSTY,DOM,M01,7067.0,8102.0,9835.0,10956.0,13293.0,14537.0,16947.0
...,...,...,...,...,...,...,...,...,...,...,...,...
39650,SK0,NUT1,STY,FOR,M12,11065.0,15080.0,1110.0,2980.0,11906.0,18620.0,
39651,SK01,NUT2,STY,FOR,M12,6485.0,9270.0,463.0,1307.0,5237.0,8896.0,
39652,SK02,NUT2,STY,FOR,M12,563.0,815.0,82.0,210.0,875.0,1296.0,
39653,SK03,NUT2,STY,FOR,M12,2485.0,2787.0,294.0,879.0,3265.0,4891.0,


In [14]:
long_df = pd.melt(df, id_vars=['geo', 'geo_layer', 'indic_to', 'c_resid', 'month'], value_vars=['2018', '2019', '2020', '2021', '2022', '2023', '2024'], var_name='Year', value_name='Value') 
long_df

Unnamed: 0,geo,geo_layer,indic_to,c_resid,month,Year,Value
0,AT,Country,LSTY,DOM,M01,2018,23783.0
1,AT1,NUT1,LSTY,DOM,M01,2018,8096.0
2,AT11,NUT2,LSTY,DOM,M01,2018,239.0
3,AT12,NUT2,LSTY,DOM,M01,2018,790.0
4,AT13,NUT2,LSTY,DOM,M01,2018,7067.0
...,...,...,...,...,...,...,...
194035,SK0,NUT1,STY,FOR,M12,2024,
194036,SK01,NUT2,STY,FOR,M12,2024,
194037,SK02,NUT2,STY,FOR,M12,2024,
194038,SK03,NUT2,STY,FOR,M12,2024,


In [15]:
#check whether there's special character like space in the value of dataset

print("Unique values in 'indic_to' column:", data['indic_to'].unique())
print("Unique values in 'c_resid' column:", data['c_resid'].unique())
print("Unique values in 'month' column:", data['month'].unique())
print("Unique values in 'TIME_PERIOD' column:", data['geo\TIME_PERIOD'].unique())


Unique values in 'indic_to' column: ['LSTY' 'NGT_SP' 'STY']
Unique values in 'c_resid' column: ['DOM' 'FOR' 'TOTAL']
Unique values in 'month' column: ['M01' 'M02' 'M03' 'M04' 'M05' 'M06' 'M07' 'M08' 'M09' 'M10' 'M11' 'M12'
 'TOTAL']
Unique values in 'TIME_PERIOD' column: ['AT' 'AT1' 'AT11' 'AT12' 'AT13' 'AT2' 'AT21' 'AT22' 'AT3' 'AT31' 'AT32'
 'AT33' 'AT34' 'BE' 'BE1' 'BE10' 'BE2' 'BE21' 'BE22' 'BE23' 'BE24' 'BE25'
 'BE3' 'BE31' 'BE32' 'BE33' 'BE34' 'BE35' 'BG' 'BG3' 'BG31' 'BG32' 'BG33'
 'BG34' 'BG4' 'BG41' 'BG42' 'CH' 'CH0' 'CH01' 'CH02' 'CH03' 'CH04' 'CH05'
 'CH06' 'CH07' 'CY' 'CY0' 'CY00' 'CZ' 'CZ0' 'CZ01' 'CZ02' 'CZ03' 'CZ04'
 'CZ05' 'CZ06' 'CZ07' 'CZ08' 'DE' 'DE1' 'DE11' 'DE12' 'DE13' 'DE14' 'DE2'
 'DE21' 'DE22' 'DE23' 'DE24' 'DE25' 'DE26' 'DE27' 'DE3' 'DE30' 'DE4'
 'DE40' 'DE5' 'DE50' 'DE6' 'DE60' 'DE7' 'DE71' 'DE72' 'DE73' 'DE8' 'DE80'
 'DE9' 'DE91' 'DE92' 'DE93' 'DE94' 'DEA' 'DEA1' 'DEA2' 'DEA3' 'DEA4'
 'DEA5' 'DEB' 'DEB1' 'DEB2' 'DEB3' 'DEC' 'DEC0' 'DED' 'DED2' 'DED4' 'DED5'


In [16]:
#Check null value amount for each column
null_counts = long_df.isnull().sum()
print(null_counts)

geo              0
geo_layer        0
indic_to         0
c_resid          0
month            0
Year             0
Value        20886
dtype: int64


In [17]:
#Analyse null value distribution
#most of null value are in 2024,so we will not cover 2024 for later analyse
#For year 2018-2024,the null value amount is small, so we will drop it for later analyse

rows_2024 = long_df[long_df['Year'] == '2024']
rows_2023 = long_df[long_df['Year'] == '2023']
rows_2022 = long_df[long_df['Year'] == '2022']
rows_2021 = long_df[long_df['Year'] == '2021']
rows_2020 = long_df[long_df['Year'] == '2020']
rows_2019 = long_df[long_df['Year'] == '2019']
rows_2018 = long_df[long_df['Year'] == '2018']
missing_value_count_2024 = rows_2024['Value'].isna().sum()
missing_value_count_2023 = rows_2023['Value'].isna().sum()
missing_value_count_2022 = rows_2022['Value'].isna().sum()
missing_value_count_2021 = rows_2021['Value'].isna().sum()
missing_value_count_2020 = rows_2020['Value'].isna().sum()
missing_value_count_2019 = rows_2019['Value'].isna().sum()
missing_value_count_2018 = rows_2018['Value'].isna().sum()
print(f"Number of rows where Value is NaN for Year 2024: {missing_value_count_2024}")
print(f"Number of rows where Value is NaN for Year 2023: {missing_value_count_2023}")
print(f"Number of rows where Value is NaN for Year 2022: {missing_value_count_2022}")
print(f"Number of rows where Value is NaN for Year 2021: {missing_value_count_2021}")
print(f"Number of rows where Value is NaN for Year 2020: {missing_value_count_2020}")
print(f"Number of rows where Value is NaN for Year 2019: {missing_value_count_2019}")
print(f"Number of rows where Value is NaN for Year 2018: {missing_value_count_2018}")

Number of rows where Value is NaN for Year 2024: 20790
Number of rows where Value is NaN for Year 2023: 9
Number of rows where Value is NaN for Year 2022: 9
Number of rows where Value is NaN for Year 2021: 21
Number of rows where Value is NaN for Year 2020: 15
Number of rows where Value is NaN for Year 2019: 21
Number of rows where Value is NaN for Year 2018: 21


In [18]:
# Drop rows with any missing values
long_df.dropna(inplace=True)
# Drop rows where Year is 2024
long_df = long_df[long_df['Year'] != '2024']
#long_file = 'long_df_flask.csv'
#long_df.to_csv(long_file, index=False)
long_df

Unnamed: 0,geo,geo_layer,indic_to,c_resid,month,Year,Value
0,AT,Country,LSTY,DOM,M01,2018,23783.0
1,AT1,NUT1,LSTY,DOM,M01,2018,8096.0
2,AT11,NUT2,LSTY,DOM,M01,2018,239.0
3,AT12,NUT2,LSTY,DOM,M01,2018,790.0
4,AT13,NUT2,LSTY,DOM,M01,2018,7067.0
...,...,...,...,...,...,...,...
166315,SK0,NUT1,STY,FOR,M12,2023,18620.0
166316,SK01,NUT2,STY,FOR,M12,2023,8896.0
166317,SK02,NUT2,STY,FOR,M12,2023,1296.0
166318,SK03,NUT2,STY,FOR,M12,2023,4891.0


In [19]:
#Summarize data by month, guest residence, or geo_layer
summary_df = long_df.groupby(['month', 'c_resid', 'indic_to', 'geo_layer','Year']).agg({
    'Value': 'sum',
}).reset_index()
#summary_file = 'summary_df_flask.csv'
#summary_df.to_csv(summary_file, index=False)
summary_df

Unnamed: 0,month,c_resid,indic_to,geo_layer,Year,Value
0,M01,DOM,LSTY,Country,2018,2076982.0
1,M01,DOM,LSTY,Country,2019,2517764.0
2,M01,DOM,LSTY,Country,2020,3342210.0
3,M01,DOM,LSTY,Country,2021,2425589.0
4,M01,DOM,LSTY,Country,2022,4108208.0
...,...,...,...,...,...,...
1723,M12,FOR,STY,[EU27_2020],2019,1563624.0
1724,M12,FOR,STY,[EU27_2020],2020,171960.0
1725,M12,FOR,STY,[EU27_2020],2021,893514.0
1726,M12,FOR,STY,[EU27_2020],2022,1548701.0


In [20]:
#This part is code for visualization with dropdown component
#Chart including:
#1.Time-series plots to show trends in bookings over months;
#2.Bar charts to compare the share of local vs. international guests; 
#3.Pie charts to further compare the share of local vs. international guests;
#4.Heatmaps to visualize the concentration of short-stay accommodations by region;
#5.Top 10 popular destination analysis by mutiple geo layer;
#6.Animations to show changes over time.
#7.Forecast of "Length of Stay" by random forest model
#8.Forecast of "Stay" by random forest model
#9.Forecast of "Night Spent" by random forest model

import dash
from dash import dcc, html
from dash.dependencies import Input, Output
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import plotly.express as px
import json

# Convert the month column to a category type and specify the order of the categories
month_order = [f"M{str(i).zfill(2)}" for i in range(1, 13)]
summary_df['month'] = pd.Categorical(summary_df['month'], categories=month_order, ordered=True)

app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("Interactive Charts"),

    # Line chart controls
    html.H2("Line Chart Controls"),
    dcc.Dropdown(
        id='geo_layer-dropdown',
        options=[{'label': geo_layer, 'value': geo_layer} for geo_layer in summary_df['geo_layer'].unique()],
        value=summary_df['geo_layer'].unique()[0]
    ),
    dcc.Dropdown(
        id='indic_to-dropdown',
        options=[
            {'label': 'Length of Stay', 'value': 'LSTY'},
            {'label': 'Stay', 'value': 'STY'},
            {'label': 'Night Spent', 'value': 'NGT_SP'}
        ],
        value='LSTY'  
    ),
    dcc.Dropdown(
        id='c_resid-dropdown',
        options=[{'label': 'All', 'value': 'All'}] + [{'label': c_resid, 'value': c_resid} for c_resid in summary_df['c_resid'].unique()],
        value='All'
    ),
    dcc.Graph(id='line-chart'),

    # Bar chart controls
    html.H2("Bar Chart Controls"),
    dcc.Dropdown(
        id='indic_to_filter',
        options=[
            {'label': 'Length of Stay', 'value': 'LSTY'},
            {'label': 'Stay', 'value': 'STY'},
            {'label': 'Night Spent', 'value': 'NGT_SP'}
        ],
        value='LSTY',  
        multi=False,
        placeholder="Select Indicator"
    ),
    dcc.Dropdown(
        id='year_filter',
        options=[{'label': year, 'value': year} for year in summary_df['Year'].unique()],
        value=summary_df['Year'].unique()[0],
        multi=False,
        placeholder="Select Year"
    ),
    dcc.Dropdown(
        id='geo_layer_filter',
        options=[{'label': geo, 'value': geo} for geo in summary_df['geo_layer'].unique()],
        value=summary_df['geo_layer'].unique()[0],
        multi=False,
        placeholder="Select Geographical Layer"
    ),
    dcc.Graph(id='bar_chart'),

    # New Pie Chart Controls
    html.H2("Pie Chart Controls"),
    dcc.Dropdown(
        id='pie_chart_indic_to',
        options=[
            {'label': 'Length of Stay', 'value': 'LSTY'},
            {'label': 'Stay', 'value': 'STY'},
            {'label': 'Night Spent', 'value': 'NGT_SP'}
        ],
        value='LSTY'  
    ),
    dcc.Dropdown(
        id='pie_chart_geo_layer',
        options=[{'label': geo_layer, 'value': geo_layer} for geo_layer in summary_df['geo_layer'].unique()],
        value=summary_df['geo_layer'].unique()[0],
        multi=False,
        placeholder="Select Geographical Layer"
    ),
    dcc.Dropdown(
        id='pie_chart_year',
        options=[{'label': year, 'value': year} for year in summary_df['Year'].unique()],
        value=summary_df['Year'].unique()[0],
        multi=False,
        placeholder="Select Year"
    ),
 
    dcc.Dropdown(
        id='pie_chart_month',
        options=[
            {'label': 'All', 'value': 'All'}  
        ] + [{'label': month, 'value': month} for month in summary_df['month'].unique()],  # Remaining month options
        value='All',  # All is checked by default
        multi=False,
        placeholder="Select Month"
    ),
    dcc.Graph(id='pie_chart'),# New graph for the pie chart

    # New heatmap control
    html.H2("Geographical Heatmap Controls"),
    dcc.Dropdown(
        id='heatmap_indic_to',
        options=[
            {'label': 'Length of Stay', 'value': 'LSTY'},
            {'label': 'Stay', 'value': 'STY'},
            {'label': 'Night Spent', 'value': 'NGT_SP'}
        ],
        value='LSTY',  
        multi=False,
        placeholder="Select Indicator"
    ),
    dcc.Dropdown(
        id='heatmap_year',
        options=[{'label': year, 'value': year} for year in long_df['Year'].unique()],
        value='2023',
        multi=False,
        placeholder="Select Year"
    ),
    dcc.Dropdown(
        id='heatmap_c_resid',
        options=[{'label': c_resid, 'value': c_resid} for c_resid in long_df['c_resid'].unique()],
        value='DOM',  
        multi=False,
        placeholder="Select Residency"
    ),
    dcc.Dropdown(
        id='heatmap_month',
        options=[{'label': month, 'value': month} for month in long_df['month'].unique()],
        value='M01',  
        multi=False,
        placeholder="Select Month"
    ),
    dcc.Dropdown(
        id='heatmap_geo_layer',
        options=[{'label': geo_layer, 'value': geo_layer} for geo_layer in long_df['geo_layer'].unique()],
        value=long_df['geo_layer'].unique()[0],
        multi=False,
        placeholder="Select geo_layer"
    ),    
    dcc.Graph(id='geo_heatmap'),  # Components for displaying heat maps
    # Add Top 10 Controls
    html.H2("Top 10 Popular Tourist Destinations"),
    dcc.Dropdown(
        id='top10_geo_layer',
        options=[{'label': geo_layer, 'value': geo_layer} for geo_layer in long_df['geo_layer'].unique()],
        value=long_df['geo_layer'].unique()[0],
        multi=False,
        placeholder="Select Geographical Layer"
    ),
    dcc.Dropdown(
        id='top10_year',
        options=[{'label': year, 'value': year} for year in long_df['Year'].unique()],
        value=long_df['Year'].unique()[0],
        multi=False,
        placeholder="Select Year"
    ),
    dcc.Dropdown(
        id='top10_c_resid',
        options=[{'label': c_resid, 'value': c_resid} for c_resid in long_df['c_resid'].unique()],
        value=long_df['c_resid'].unique()[0],
        multi=False,
        placeholder="Select Residency"
    ),
    dcc.Dropdown(
        id='top10_month',
        options=[{'label': month, 'value': month} for month in long_df['month'].unique()],
        value=long_df['month'].unique()[0],
        multi=False,
        placeholder="Select Month"
    ),
    dcc.Graph(id='top10_chart'),  # Graph for Top 10 Popular Tourist Destinations
    # Animated Chart Controls
    html.H2("Animated Change Over Time"),
    dcc.Dropdown(
        id='anim_geo_layer',
        options=[{'label': geo_layer, 'value': geo_layer} for geo_layer in long_df['geo_layer'].unique()],
        value=long_df['geo_layer'].unique()[0],
        multi=False,
        placeholder="Select Geographical Layer"
    ),
    dcc.Dropdown(
        id='anim_indic_to',
        options=[
            {'label': 'Length of Stay', 'value': 'LSTY'},
            {'label': 'Stay', 'value': 'STY'},
            {'label': 'Night Spent', 'value': 'NGT_SP'}
        ],
        value='LSTY', 
        multi=False,
        placeholder="Select Indicator"
    ),
    dcc.Graph(id='animated_chart'),  # Graph for animated changes over time
    html.H2("'Length of Stay' indicator Prediction"),
    html.Button('Forcast Length of Stay', id='predictLSTY-button', n_clicks=1),
    dcc.Graph(id='predictionLSTY-graph'),
    html.H2("'Stay' Indicator Prediction"),
    html.Button('Forcast Stay', id='predictSTY-button', n_clicks=1),
    dcc.Graph(id='predictionSTY-graph'),    
    html.H2("'Night Spent' Indicator Prediction"),
    html.Button('Forcast Night Spent', id='predictN-button', n_clicks=1),
    dcc.Graph(id='predictionN-graph'),     
])

#This part is code for prediction of tourism indicator "LSTY" by country level.
#Model is random forest regression
@app.callback(
    Output('predictionLSTY-graph', 'figure'),
    Input('predictLSTY-button', 'n_clicks')
)
def predict_LSTY(n_clicks):
    if n_clicks > 0:
        # Data pre-processing
        LSTY_summary_df = long_df.groupby(['month', 'indic_to', 'geo_layer', 'Year']).agg({'Value': 'sum'}).reset_index()
        LSTY_summary_df = LSTY_summary_df[LSTY_summary_df['indic_to'] == 'LSTY']
        LSTY_summary_df = LSTY_summary_df[LSTY_summary_df['geo_layer'] == 'Country']
        LSTY_summary_df.drop(columns=['indic_to', 'geo_layer'], errors='ignore', inplace=True)

        # Data pre-processing
        LSTY_summary_df['month_num'] = LSTY_summary_df['month'].apply(lambda x: int(x[1:]))  # 从 'M01' 转换为 1
        LSTY_summary_df['time'] = LSTY_summary_df['Year'].astype(str) + (LSTY_summary_df['month_num'] / 12.0).astype(str)  # 创建时间特征
        
        # Prepare input features and target values
        X = LSTY_summary_df[['Year', 'month_num']]
        y = LSTY_summary_df['Value']

        # Train the random forest model
        model = RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        # Create 2024 data
        future_year = 2024
        future_months = np.arange(1, 13)  #format transfer
        future_data = pd.DataFrame({'Year': future_year, 'month_num': future_months})

        # Make projections
        future_data['Predicted_Value'] = model.predict(future_data)

        # Visualization results
        combined_data = pd.concat([LSTY_summary_df, future_data[['Year', 'month_num', 'Predicted_Value']]], ignore_index=True)

        # Create line graphs
        fig = px.line(
            combined_data,
            x='month_num',
            y='Predicted_Value' if 'Predicted_Value' in combined_data.columns else 'Value',
            title='Predicted Total # Length of Stay per Month',
            labels={'month_num': 'Month', 'Predicted_Value': 'Total # Length of Stay', 'Value': 'Total # Length of Stay'},
            markers=True
        )

        # Generate month labels
        month_labels = [f'M{str(i).zfill(2)}' for i in range(1, 13)]
        # Set up custom x-axis scales and labels
        fig.update_xaxes(
            tickvals=np.arange(1, 13),
            ticktext=month_labels,
        )

        # Display of historical and forecast data
        fig.add_scatter(
            x=LSTY_summary_df['month_num'],
            y=LSTY_summary_df['Value'],
            mode='lines+markers',
            name='Historical Values',
            line=dict(color='blue')
        )

        # Show projected data as new series
        fig.add_scatter(
            x=future_data['month_num'],
            y=future_data['Predicted_Value'],
            mode='lines+markers',
            name='Predicted Values',
            line=dict(color='orange')
        )

        return fig

#This part is code for prediction of tourism indicator "STY" by country level.
#Model is random forest regression
@app.callback(
    Output('predictionSTY-graph', 'figure'),
    Input('predictSTY-button', 'n_clicks')
)
def predict_STY(n_clicks):
    if n_clicks > 0:
        # Data pre-processing
        STY_summary_df = long_df.groupby(['month', 'indic_to', 'geo_layer', 'Year']).agg({'Value': 'sum'}).reset_index()
        STY_summary_df = STY_summary_df[STY_summary_df['indic_to'] == 'STY']
        STY_summary_df = STY_summary_df[STY_summary_df['geo_layer'] == 'Country']
        STY_summary_df.drop(columns=['indic_to', 'geo_layer'], errors='ignore', inplace=True)

        # Data pre-processing
        STY_summary_df['month_num'] = STY_summary_df['month'].apply(lambda x: int(x[1:]))  # 从 'M01' 转换为 1
        STY_summary_df['time'] = STY_summary_df['Year'].astype(str) + (STY_summary_df['month_num'] / 12.0).astype(str)  # 创建时间特征
        
        # Prepare input features and target values
        X = STY_summary_df[['Year', 'month_num']]
        y = STY_summary_df['Value']

        # Train the random forest model
        model = RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        # Create 2024 data
        future_year = 2024
        future_months = np.arange(1, 13)  #format transfer
        future_data = pd.DataFrame({'Year': future_year, 'month_num': future_months})

        # Make projections
        future_data['Predicted_Value'] = model.predict(future_data)

        # Visualization results
        combined_data = pd.concat([STY_summary_df, future_data[['Year', 'month_num', 'Predicted_Value']]], ignore_index=True)

        # Create line graphs
        fig = px.line(
            combined_data,
            x='month_num',
            y='Predicted_Value' if 'Predicted_Value' in combined_data.columns else 'Value',
            title='Predicted Total # Length of Stay per Month',
            labels={'month_num': 'Month', 'Predicted_Value': 'Total # Length of Stay', 'Value': 'Total # Length of Stay'},
            markers=True
        )

        # Generate month labels
        month_labels = [f'M{str(i).zfill(2)}' for i in range(1, 13)]
        # Set up custom x-axis scales and labels
        fig.update_xaxes(
            tickvals=np.arange(1, 13),
            ticktext=month_labels,
        )

        # Display of historical and forecast data
        fig.add_scatter(
            x=STY_summary_df['month_num'],
            y=STY_summary_df['Value'],
            mode='lines+markers',
            name='Historical Values',
            line=dict(color='blue')
        )

        # Show projected data as new series
        fig.add_scatter(
            x=future_data['month_num'],
            y=future_data['Predicted_Value'],
            mode='lines+markers',
            name='Predicted Values',
            line=dict(color='orange')
        )

        return fig

#This part is code for prediction of tourism indicator "Night spent" by country level.
#Model is random forest regression
@app.callback(
    Output('predictionN-graph', 'figure'),
    Input('predictN-button', 'n_clicks')
)
def predict_NGT_SP(n_clicks):
    if n_clicks > 0:
        # Data pre-processing
        NGT_SP_summary_df = long_df.groupby(['month', 'indic_to', 'geo_layer', 'Year']).agg({'Value': 'sum'}).reset_index()
        NGT_SP_summary_df = NGT_SP_summary_df[NGT_SP_summary_df['indic_to'] == 'NGT_SP']
        NGT_SP_summary_df = NGT_SP_summary_df[NGT_SP_summary_df['geo_layer'] == 'Country']
        NGT_SP_summary_df.drop(columns=['indic_to', 'geo_layer'], errors='ignore', inplace=True)

        # Data pre-processing
        NGT_SP_summary_df['month_num'] = NGT_SP_summary_df['month'].apply(lambda x: int(x[1:]))  # 从 'M01' 转换为 1
        NGT_SP_summary_df['time'] = NGT_SP_summary_df['Year'].astype(str) + (NGT_SP_summary_df['month_num'] / 12.0).astype(str)  # 创建时间特征
        
        # Prepare input features and target values
        X = NGT_SP_summary_df[['Year', 'month_num']]
        y = NGT_SP_summary_df['Value']

        # Train the random forest model
        model = RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        # Create 2024 data
        future_year = 2024
        future_months = np.arange(1, 13)  #format transfer
        future_data = pd.DataFrame({'Year': future_year, 'month_num': future_months})

        # Make projections
        future_data['Predicted_Value'] = model.predict(future_data)

        # Visualization results
        combined_data = pd.concat([NGT_SP_summary_df, future_data[['Year', 'month_num', 'Predicted_Value']]], ignore_index=True)

        # Create line graphs
        fig = px.line(
            combined_data,
            x='month_num',
            y='Predicted_Value' if 'Predicted_Value' in combined_data.columns else 'Value',
            title='Predicted Total # Length of Stay per Month',
            labels={'month_num': 'Month', 'Predicted_Value': 'Total # Length of Stay', 'Value': 'Total # Length of Stay'},
            markers=True
        )

        # Generate month labels
        month_labels = [f'M{str(i).zfill(2)}' for i in range(1, 13)]
        # Set up custom x-axis scales and labels
        fig.update_xaxes(
            tickvals=np.arange(1, 13),
            ticktext=month_labels,
        )

        # Display of historical and forecast data
        fig.add_scatter(
            x=NGT_SP_summary_df['month_num'],
            y=NGT_SP_summary_df['Value'],
            mode='lines+markers',
            name='Historical Values',
            line=dict(color='blue')
        )

        # Show projected data as new series
        fig.add_scatter(
            x=future_data['month_num'],
            y=future_data['Predicted_Value'],
            mode='lines+markers',
            name='Predicted Values',
            line=dict(color='orange')
        )

        return fig    
@app.callback(
    Output('line-chart', 'figure'),
    [Input('geo_layer-dropdown', 'value'),
     Input('indic_to-dropdown', 'value'),
     Input('c_resid-dropdown', 'value')]
)
def update_line_chart(selected_geo_layer, selected_indic_to, selected_c_resid):
    if selected_c_resid == 'All':
        filtered_df = summary_df[
            (summary_df['geo_layer'] == selected_geo_layer) &
            (summary_df['indic_to'] == selected_indic_to)
        ].groupby(['month', 'Year']).sum().reset_index()
    else:
        filtered_df = summary_df[
            (summary_df['geo_layer'] == selected_geo_layer) &
            (summary_df['indic_to'] == selected_indic_to) &
            (summary_df['c_resid'] == selected_c_resid)
        ]
    
    fig = px.line(
        filtered_df, x='month', y='Value', color='Year', 
        title=f"Evolution of Short Stay over the Months",
        labels={'month': 'Month', 'Value': f'Total Number'}
    )
    return fig

@app.callback(
    Output('bar_chart', 'figure'),
    Input('indic_to_filter', 'value'),
    Input('year_filter', 'value'),
    Input('geo_layer_filter', 'value')
)
def update_bar_chart(selected_indicator, selected_year, selected_geo_layer):
    
    filtered_df = summary_df[
        (summary_df['indic_to'] == selected_indicator) &
        (summary_df['Year'] == selected_year) &
        (summary_df['geo_layer'] == selected_geo_layer)
    ].copy()
    # Update the labels in the DataFrame
    filtered_df['c_resid'] = filtered_df['c_resid'].replace({'FOR': 'Foreigners', 'DOM': 'Residents'})
    total_value = filtered_df['Value'].sum()
    filtered_df['Percentage'] = (filtered_df['Value'] / total_value * 100).round(2).astype(str) + '%'


        # Create a bar chart, using c_resid as the color classification
    fig = px.bar(
        filtered_df,
        x='month',
        y='Value',
        color='c_resid',
        title='% of Domestic and foreign tourists per month in a year',
        labels={'month': 'Month', 'Value': f'Total Number'},
        text='Percentage'  # Percentage displayed on bar
    )

    # Update display settings for data labels
    fig.update_traces(textposition='outside', textfont=dict(size=12))
    # Change the legend title
    fig.update_layout(legend_title_text='Type')
    return fig

@app.callback(
    Output('pie_chart', 'figure'),
    [Input('pie_chart_indic_to', 'value'),
     Input('pie_chart_geo_layer', 'value'),
     Input('pie_chart_year', 'value'),
     Input('pie_chart_month', 'value')]
)
def update_pie_chart(selected_indicator, selected_geo_layer, selected_year, selected_month):
    # Data filtering
    filtered_df = summary_df[
        (summary_df['indic_to'] == selected_indicator) &
        (summary_df['geo_layer'] == selected_geo_layer) &
        (summary_df['Year'] == selected_year)
    ].copy()

    # Check month selection
    if selected_month == 'All':  # If the choice is “All” 
        # Update the labels in the DataFrame
        filtered_df['c_resid'] = filtered_df['c_resid'].replace({'FOR': 'Foreigners', 'DOM': 'Residents'})
        # Group and sum all months
        filtered_df = filtered_df.groupby(['c_resid'], as_index=False).agg({'Value': 'sum'})
        
        
    else:
        # Filter only selected months
        filtered_df = filtered_df[filtered_df['month'] == selected_month]

    # Create pie charts
    pie_fig = px.pie(
        filtered_df,
        names='c_resid',
        values='Value',
        title=f'Short Stay Residents vs Foreigners in {selected_year}'
    )

    return pie_fig

# Load GeoJSON files
with open('/Users/mac/NUTS_RG_60M_2024_4326.geojson') as f:
    nuts_geojson = json.load(f)
@app.callback(
    Output('geo_heatmap', 'figure'),
    [Input('heatmap_indic_to', 'value'),
     Input('heatmap_year', 'value'),
     Input('heatmap_c_resid', 'value'),
     Input('heatmap_month', 'value'),
    Input('heatmap_geo_layer', 'value')]
)


def update_geo_heatmap(selected_indicator, selected_year, selected_c_resid, selected_month,selected_geo_layer):
    # Data filtering
    filtered_heatmap_df = long_df[
        (long_df['indic_to'] == selected_indicator) &
        (long_df['Year'] == selected_year) &
        (long_df['c_resid'] == selected_c_resid) &
        (long_df['month'] == selected_month)&
        (long_df['geo_layer'] == selected_geo_layer)
    ].copy()

    # If there is no data, you can return an empty chart or a hint
    if filtered_heatmap_df.empty:
        return px.choropleth()  # You can insert an empty chart, or create a text marking data not found

    # create plot
    fig = px.choropleth(
        filtered_heatmap_df,
        geojson=nuts_geojson,
        locations='geo',  # geographic identifiers, make sure this matches the regions supported by Plotly
        featureidkey='properties.NUTS_ID',
        color='Value',  # Values plotted
        hover_name='geo',  #Information displayed on mouse hover
        title=f'Concentration of Short Stay per country in {selected_month} {selected_year}',
        #color_continuous_scale=px.colors.sequential.Plasma  # Choose a color scheme
        color_continuous_scale=px.colors.sequential.Plasma[::-1]
    )
    # Change legend label
    fig.update_layout(coloraxis_colorbar_title='Total Number')
    return fig

# Add callback for Top 10 chart
@app.callback(
    Output('top10_chart', 'figure'),
    [Input('top10_geo_layer', 'value'),
     Input('top10_year', 'value'),
     Input('top10_c_resid', 'value'),
     Input('top10_month', 'value')]
)
def update_top10_chart(selected_geo_layer, selected_year, selected_c_resid, selected_month):
    filtered_top10_df = long_df[
        (long_df['geo_layer'] == selected_geo_layer) &
        (long_df['Year'] == selected_year) &
        (long_df['c_resid'] == selected_c_resid) &
        (long_df['month'] == selected_month)
    ]

    # Group and sum values for each geographical code
    top10 = filtered_top10_df.groupby('geo')['Value'].sum().reset_index()

    
    # Sort by Value and select top 10
    top10 = top10.sort_values(by='Value', ascending=False).head(10)
    # Create bar chart for top 10 destinations
    fig = px.bar(
        top10,
        x='Value',
        y='geo',
        title=f'Top 10 Tourist Destinations in {selected_month} {selected_year}',
        labels={'geo': 'Country/NUTS Code', 'Value': 'Total Number'},
        orientation='h',
        category_orders={'geo': top10['geo'].tolist()}
    )

    return fig


@app.callback(
    Output('animated_chart', 'figure'),
    [Input('anim_geo_layer', 'value'),
     Input('anim_indic_to', 'value')]
)
def update_animated_chart(selected_geo_layer, selected_indicator):
    filtered_anim_df = long_df[
        (long_df['geo_layer'] == selected_geo_layer) &
        (long_df['indic_to'] == selected_indicator)
    ]
    # Update the labels in the DataFrame
    filtered_anim_df['c_resid'] = filtered_anim_df['c_resid'].replace({'FOR': 'Foreigners', 'DOM': 'Residents'})
    # Aggregate monthly values by Year and c_resid
    monthly_sum = filtered_anim_df.groupby(['Year', 'month', 'c_resid'])['Value'].sum().reset_index()

    # Create an 'All' row by summing FOR and DOM
    all_sum = monthly_sum.groupby(['Year', 'month'])['Value'].sum().reset_index()
    all_sum['c_resid'] = 'All'  # Tag all categories as 'All'

    # Combine the data back to include FOR, DOM, and All
    combined_data = pd.concat([monthly_sum, all_sum], ignore_index=True)

    # Create the animated line chart
    fig = px.line(
        combined_data,
        x='Year',  # x-axis as Year
        y='Value',  # y-axis as aggregated monthly values
        animation_frame='month',  # Use month for animation
        color='c_resid',  # Color by residency category which includes 'All'
        title=f'Short Stay Dynamic Patterns Across Months and Years',
        labels={'Value': 'Total Numbers', 'Year': 'Year'},
        range_y=[0, combined_data['Value'].max() * 1.1]  # Adjust y-axis range for better visibility
    )
    # Change the legend title to "Type"
    fig.update_layout(legend_title=dict(text='Type'))
    return fig

if __name__ == '__main__':
    app.run_server(debug=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

