In [1]:
import pandas as pd
import eurostat

# Import data using the Eurostat API
data = eurostat.get_data_df('TOUR_CE_OMN12')

#Drop columns'freq','unit'
data.drop(columns=['freq','unit'], errors='ignore', inplace=True)

#add column geo_layer to label geographic regions (NUTS codes).
df = pd.DataFrame(data)
df['geo_layer'] = df['geo\TIME_PERIOD'].apply(lambda x: 'NUT1' if len(x) == 3 else ('NUT2' if len(x) == 4 else ('Country' if len(x) == 2 else '[EU27_2020]')))
#put column "geo" besides column "geo_layer" 
col_order = ['geo\TIME_PERIOD', 'geo_layer'] + [col for col in df.columns if col not in ['geo\TIME_PERIOD', 'geo_layer']]
df = df[col_order]

#Drop row where 'c_resid' and 'month' value is total
df = df[df['c_resid'] != 'TOTAL']
df = df[df['month'] != 'TOTAL']

#Change colomn label for 'geo\TIME_PERIOD' to 'geo'
df.columns.values[df.columns.get_loc('geo\TIME_PERIOD')] = 'geo'

long_df = pd.melt(df, id_vars=['geo', 'geo_layer', 'indic_to', 'c_resid', 'month'], value_vars=['2018', '2019', '2020', '2021', '2022', '2023', '2024'], var_name='Year', value_name='Value') 



In [2]:
#check all parameter in dataset
import eurostat
pars = eurostat.get_pars('TOUR_CE_OMN12')
pars

['freq', 'indic_to', 'c_resid', 'month', 'unit', 'geo']

In [3]:
#check whether there's special character like space in the value of dataset

print("Unique values in 'indic_to' column:", data['indic_to'].unique())
print("Unique values in 'c_resid' column:", data['c_resid'].unique())
print("Unique values in 'month' column:", data['month'].unique())
print("Unique values in 'TIME_PERIOD' column:", data['geo\TIME_PERIOD'].unique())


Unique values in 'indic_to' column: ['LSTY' 'NGT_SP' 'STY']
Unique values in 'c_resid' column: ['DOM' 'FOR' 'TOTAL']
Unique values in 'month' column: ['M01' 'M02' 'M03' 'M04' 'M05' 'M06' 'M07' 'M08' 'M09' 'M10' 'M11' 'M12'
 'TOTAL']
Unique values in 'TIME_PERIOD' column: ['AT' 'AT1' 'AT11' 'AT12' 'AT13' 'AT2' 'AT21' 'AT22' 'AT3' 'AT31' 'AT32'
 'AT33' 'AT34' 'BE' 'BE1' 'BE10' 'BE2' 'BE21' 'BE22' 'BE23' 'BE24' 'BE25'
 'BE3' 'BE31' 'BE32' 'BE33' 'BE34' 'BE35' 'BG' 'BG3' 'BG31' 'BG32' 'BG33'
 'BG34' 'BG4' 'BG41' 'BG42' 'CH' 'CH0' 'CH01' 'CH02' 'CH03' 'CH04' 'CH05'
 'CH06' 'CH07' 'CY' 'CY0' 'CY00' 'CZ' 'CZ0' 'CZ01' 'CZ02' 'CZ03' 'CZ04'
 'CZ05' 'CZ06' 'CZ07' 'CZ08' 'DE' 'DE1' 'DE11' 'DE12' 'DE13' 'DE14' 'DE2'
 'DE21' 'DE22' 'DE23' 'DE24' 'DE25' 'DE26' 'DE27' 'DE3' 'DE30' 'DE4'
 'DE40' 'DE5' 'DE50' 'DE6' 'DE60' 'DE7' 'DE71' 'DE72' 'DE73' 'DE8' 'DE80'
 'DE9' 'DE91' 'DE92' 'DE93' 'DE94' 'DEA' 'DEA1' 'DEA2' 'DEA3' 'DEA4'
 'DEA5' 'DEB' 'DEB1' 'DEB2' 'DEB3' 'DEC' 'DEC0' 'DED' 'DED2' 'DED4' 'DED5'


In [4]:
#Check null value amount for each column
null_counts = long_df.isnull().sum()
print(null_counts)

geo              0
geo_layer        0
indic_to         0
c_resid          0
month            0
Year             0
Value        20886
dtype: int64


In [5]:
#Analyse null value distribution
#most of null value are in 2024,so we will not cover 2024 for later analyse
#For year 2018-2024,the null value amount is small, so we will drop it for later analyse

rows_2024 = long_df[long_df['Year'] == '2024']
rows_2023 = long_df[long_df['Year'] == '2023']
rows_2022 = long_df[long_df['Year'] == '2022']
rows_2021 = long_df[long_df['Year'] == '2021']
rows_2020 = long_df[long_df['Year'] == '2020']
rows_2019 = long_df[long_df['Year'] == '2019']
rows_2018 = long_df[long_df['Year'] == '2018']
missing_value_count_2024 = rows_2024['Value'].isna().sum()
missing_value_count_2023 = rows_2023['Value'].isna().sum()
missing_value_count_2022 = rows_2022['Value'].isna().sum()
missing_value_count_2021 = rows_2021['Value'].isna().sum()
missing_value_count_2020 = rows_2020['Value'].isna().sum()
missing_value_count_2019 = rows_2019['Value'].isna().sum()
missing_value_count_2018 = rows_2018['Value'].isna().sum()
print(f"Number of rows where Value is NaN for Year 2024: {missing_value_count_2024}")
print(f"Number of rows where Value is NaN for Year 2023: {missing_value_count_2023}")
print(f"Number of rows where Value is NaN for Year 2022: {missing_value_count_2022}")
print(f"Number of rows where Value is NaN for Year 2021: {missing_value_count_2021}")
print(f"Number of rows where Value is NaN for Year 2020: {missing_value_count_2020}")
print(f"Number of rows where Value is NaN for Year 2019: {missing_value_count_2019}")
print(f"Number of rows where Value is NaN for Year 2018: {missing_value_count_2018}")

Number of rows where Value is NaN for Year 2024: 20790
Number of rows where Value is NaN for Year 2023: 9
Number of rows where Value is NaN for Year 2022: 9
Number of rows where Value is NaN for Year 2021: 21
Number of rows where Value is NaN for Year 2020: 15
Number of rows where Value is NaN for Year 2019: 21
Number of rows where Value is NaN for Year 2018: 21


In [6]:
# Drop rows with any missing values
long_df.dropna(inplace=True)
# Drop rows where Year is 2024
long_df = long_df[long_df['Year'] != '2024']
long_file = 'long_df_flask.csv'
long_df.to_csv(long_file, index=False)
long_df

Unnamed: 0,geo,geo_layer,indic_to,c_resid,month,Year,Value
0,AT,Country,LSTY,DOM,M01,2018,23783.0
1,AT1,NUT1,LSTY,DOM,M01,2018,8096.0
2,AT11,NUT2,LSTY,DOM,M01,2018,239.0
3,AT12,NUT2,LSTY,DOM,M01,2018,790.0
4,AT13,NUT2,LSTY,DOM,M01,2018,7067.0
...,...,...,...,...,...,...,...
166315,SK0,NUT1,STY,FOR,M12,2023,18620.0
166316,SK01,NUT2,STY,FOR,M12,2023,8896.0
166317,SK02,NUT2,STY,FOR,M12,2023,1296.0
166318,SK03,NUT2,STY,FOR,M12,2023,4891.0


In [7]:
#Summarize data by month, guest residence, or geo_layer
summary_df = long_df.groupby(['month', 'c_resid', 'indic_to', 'geo_layer','Year']).agg({
    'Value': 'sum',
}).reset_index()
#summary_file = 'summary_df_flask.csv'
#summary_df.to_csv(summary_file, index=False)
summary_df

Unnamed: 0,month,c_resid,indic_to,geo_layer,Year,Value
0,M01,DOM,LSTY,Country,2018,2076982.0
1,M01,DOM,LSTY,Country,2019,2517764.0
2,M01,DOM,LSTY,Country,2020,3342210.0
3,M01,DOM,LSTY,Country,2021,2425589.0
4,M01,DOM,LSTY,Country,2022,4108208.0
...,...,...,...,...,...,...
1723,M12,FOR,STY,[EU27_2020],2019,1563624.0
1724,M12,FOR,STY,[EU27_2020],2020,171960.0
1725,M12,FOR,STY,[EU27_2020],2021,893514.0
1726,M12,FOR,STY,[EU27_2020],2022,1548701.0


In [11]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px



# 数据预处理
summary_df['month_num'] = summary_df['month'].apply(lambda x: int(x[1:]))  # 从'M01'到1
summary_df['time'] = summary_df['Year'] + summary_df['month_num'] / 12.0  # 创建时间特征

# 准备输入特征和目标值
X = summary_df[['Year', 'month_num']]
y = summary_df['Value']

# 训练随机森林模型
model = RandomForestRegressor(n_estimators=100)
model.fit(X, y)

# 创建2024年的数据
future_year = 2024
future_months = np.arange(1, 13)  # 从1到12的月份
future_data = pd.DataFrame({'Year': future_year, 'month_num': future_months})

# 进行预测
future_data['Predicted_Value'] = model.predict(future_data)

# 可视化结果
combined_data = pd.concat([summary_df, future_data[['Year', 'month_num', 'Predicted_Value']]], ignore_index=True)

# 创建折线图
fig = px.line(
    combined_data,
    x='month_num',
    y='Predicted_Value' if 'Predicted_Value' in combined_data.columns else 'Value',
    title='Yearly Value Predictions with Random Forest Regression',
    labels={'Value': 'Accommodation Demand'},
    markers=True
)

# 显示历史数据与预测数据
fig.add_scatter(
    x=summary_df['month_num'],
    y=summary_df['Value'],
    mode='lines+markers',
    name='Historical Values',
    line=dict(color='blue')
)

# 显示预测数据为新系列
fig.add_scatter(
    x=future_data['month_num'],
    y=future_data['Predicted_Value'],
    mode='lines+markers',
    name='Predicted Values',
    line=dict(color='orange')
)

# 显示图形
fig.show()

TypeError: can only concatenate str (not "float") to str