# Этап 1 — Первичная оценка исторических данных и предобработка

## 1.1 Импорт библиотек и загрузка данных

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import datetime as dt
from statsmodels.tsa.stattools import adfuller
from prophet import Prophet
from sklearn.metrics import r2_score

In [26]:
# Load the historic revenue data and convert the `dates` column to datetime
# so that date comparisons and plotting work on real timestamps.
df = pd.read_csv('travel_content_historic_data.csv')
df['dates'] = pd.to_datetime(df['dates'])

In [27]:
# Display the loaded frame to check its columns and date range.
df

Unnamed: 0,dates,Revenue
0,2020-01-01,263.548353
1,2020-01-02,339.025061
2,2020-01-03,308.022741
3,2020-01-04,329.983017
4,2020-01-05,363.065365
...,...,...
1091,2022-12-27,1494.658147
1092,2022-12-28,1568.942676
1093,2022-12-29,1569.861364
1094,2022-12-30,1598.670921


In [28]:
# Summary statistics per column; an implausible max here is a first hint of outliers.
df.describe()

Unnamed: 0,dates,Revenue
count,1096,1096.0
mean,2021-07-01 12:00:00,957.132906
min,2020-01-01 00:00:00,235.050332
25%,2020-09-30 18:00:00,613.4536
50%,2021-07-01 12:00:00,939.818931
75%,2022-04-01 06:00:00,1287.847688
max,2022-12-31 00:00:00,10000.0
std,,461.551507


## 1.2 Удаление выбросов

In [29]:
# Line chart of the raw daily revenue, used to spot spikes visually.
fig = px.line(
    df,
    x='dates',
    y='Revenue',
    title='Продажи на каждый день',
)
fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [30]:
# Count missing values in each column.
df.isna().sum()

dates      0
Revenue    0
dtype: int64

In [31]:
# Outlier fences built from the interquartile range of `Revenue`.
# The fence width is 1.3 * IQR on each side of the quartiles.
q1, q3 = np.percentile(df['Revenue'], [25, 75])
iqr = q3 - q1
fence_width = iqr * 1.3
lower_end = q1 - fence_width
upper_end = q3 + fence_width
lower_end, upper_end

(-263.25871288750034, 2164.5600007625003)

In [34]:
# Replace outliers in `Revenue` with the mean of the preceding 7 days and
# store the result in `Adjusted_Revenue`; all other values are kept as-is.

# Rows outside the IQR fences (`lower_end` / `upper_end`) are outliers.
is_outlier = (df['Revenue'] < lower_end) | (df['Revenue'] > upper_end)

# Blank out the outliers before averaging, so an earlier spike inside the
# 7-day window cannot inflate the replacement value of a later one.
clean_revenue = df['Revenue'].mask(is_outlier)

# shift(1) makes the window end on the previous day, i.e. the current
# observation never contributes to its own replacement.
previous_week_mean = (
    clean_revenue.rolling(window=7, min_periods=1, center=False).mean().shift(1)
)

# An outlier with no clean history (e.g. the very first row) would otherwise
# become NaN; fall back to the median of the clean observations.
previous_week_mean = previous_week_mean.fillna(clean_revenue.median())

df['Adjusted_Revenue'] = df['Revenue'].where(~is_outlier, previous_week_mean)
df

Unnamed: 0,dates,Revenue,Adjusted_Revenue
0,2020-01-01,263.548353,263.548353
1,2020-01-02,339.025061,339.025061
2,2020-01-03,308.022741,308.022741
3,2020-01-04,329.983017,329.983017
4,2020-01-05,363.065365,363.065365
...,...,...,...
1091,2022-12-27,1494.658147,1494.658147
1092,2022-12-28,1568.942676,1568.942676
1093,2022-12-29,1569.861364,1569.861364
1094,2022-12-30,1598.670921,1598.670921


In [36]:
# Overlay the raw and the outlier-adjusted series to verify the replacement.
series_to_plot = ['Revenue', 'Adjusted_Revenue']
fig = px.line(
    df,
    x='dates',
    y=series_to_plot,
    title='Продажи на каждый день',
)
fig.show()


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [38]:
# The raw column is no longer needed; only `Adjusted_Revenue` is used from here on.
df.drop(columns=['Revenue'], inplace=True)

# Этап 2 - Детальная оценка данных

In [39]:
# Rows belonging to calendar year 2022.
# Filtering on the year (instead of only `>= 2022-01-01`) keeps the subset
# correct even if the historic file later contains dates after 2022.
df_2022 = df[df['dates'].dt.year == 2022]
df_2022

Unnamed: 0,dates,Adjusted_Revenue
731,2022-01-01,1258.676576
732,2022-01-02,1204.675530
733,2022-01-03,1102.977938
734,2022-01-04,1153.512676
735,2022-01-05,1111.687751
...,...,...
1091,2022-12-27,1494.658147
1092,2022-12-28,1568.942676
1093,2022-12-29,1569.861364
1094,2022-12-30,1598.670921


In [41]:
# Average adjusted daily revenue over the 2022 subset.
avg_revenue_2022 = df_2022['Adjusted_Revenue'].mean()
avg_revenue_2022

1378.1355727835617

In [42]:
# Rows belonging to December 2022.
# The explicit upper bound keeps the subset to that month even if the
# historic file later contains dates after 2022-12-31.
is_december_2022 = (df['dates'] >= '2022-12-01') & (df['dates'] < '2023-01-01')
df_2022_december = df[is_december_2022]
df_2022_december

Unnamed: 0,dates,Adjusted_Revenue
1065,2022-12-01,1538.887842
1066,2022-12-02,1575.710501
1067,2022-12-03,1620.124879
1068,2022-12-04,1633.763512
1069,2022-12-05,1457.867059
1070,2022-12-06,1499.570466
1071,2022-12-07,1564.900109
1072,2022-12-08,1588.952212
1073,2022-12-09,1598.022183
1074,2022-12-10,1599.931513


In [43]:
# Average adjusted daily revenue over the December 2022 subset.
avg_revenue_2022_december = df_2022_december['Adjusted_Revenue'].mean()
avg_revenue_2022_december

1579.2193224516127

# Этап 3 - Проверка на стационарность и выбор модели прогнозирования

In [45]:
# Augmented Dickey-Fuller test on the adjusted series.
# A p-value above 0.05 is reported as non-stationary, otherwise as stationary.
result = adfuller(df['Adjusted_Revenue'])
p_value = result[1]
verdict = 'Ряд нестационарный' if p_value > 0.05 else 'Ряд стационарный'
print(f'P-value: {p_value} - {verdict}')

P-value: 0.9584703076614821 - Ряд нестационарный


# Этап 4 - Построение проверочного прогноза

In [46]:
# Rename the columns to `ds` (timestamp) and `y` (target), the names the
# Prophet cells below read from the frame.
prophet_columns = {'dates': 'ds', 'Adjusted_Revenue': 'y'}
df = df.rename(columns=prophet_columns)

In [47]:
# Chronological split: the last 30% of rows form the hold-out set,
# everything before them is used for training.
index = int(len(df) * 0.3)
train = df.iloc[:-index]
test = df.iloc[-index:]

In [48]:
# Fit Prophet on the training part and measure R² on the hold-out part.
model = Prophet()
model.fit(train)

# Predict exactly the hold-out dates instead of regenerating them with
# make_future_dataframe: this guarantees that `forecast` lines up row by row
# with `test`, even if the date column has gaps.
predictions_period = test[['ds']].reset_index(drop=True)

forecast = model.predict(predictions_period)

# r2_score compares the two sequences positionally.
r2 = r2_score(test['y'], forecast['yhat'])

print(f'R²: {r2}')

08:05:12 - cmdstanpy - INFO - Chain [1] start processing
08:05:12 - cmdstanpy - INFO - Chain [1] done processing


R²: 0.9561649425531948


# Этап 5 - Построение прогноза на период акции

In [49]:
# Refit Prophet on the full history and forecast 92 days past its last date.
model_2 = Prophet()
model_2.fit(df)

# History dates plus 92 future daily dates.
predictions_period = model_2.make_future_dataframe(periods=92)

# Keep only the point forecast and its uncertainty bounds.
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
forecast = model_2.predict(predictions_period)[forecast_columns]

# Rows from 2023-01-01 onward are the out-of-sample forecast.
prediction = forecast.loc[forecast['ds'] >= '2023-01-01']
prediction

08:05:22 - cmdstanpy - INFO - Chain [1] start processing
08:05:22 - cmdstanpy - INFO - Chain [1] done processing


Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
1096,2023-01-01,1675.977324,1646.869046,1704.175403
1097,2023-01-02,1534.605226,1505.831181,1565.406922
1098,2023-01-03,1558.137535,1528.058076,1590.487620
1099,2023-01-04,1582.372786,1551.892944,1611.160922
1100,2023-01-05,1606.994634,1578.543060,1636.924939
...,...,...,...,...
1183,2023-03-29,1703.445248,1670.633690,1731.971961
1184,2023-03-30,1728.258467,1697.598389,1760.609254
1185,2023-03-31,1750.338965,1722.455975,1779.107847
1186,2023-04-01,1777.432770,1747.284525,1806.698653


# Этап 6 - Сравнение прогнозируемых данных с реальными

In [50]:
# Load the actual revenue that the forecast is compared against.
df_promo = pd.read_csv('travel_content_data.csv')
df_promo

Unnamed: 0,Dates,Revenue
0,2023-01-01,2012
1,2023-01-02,1841
2,2023-01-03,1869
3,2023-01-04,1899
4,2023-01-05,1928
...,...,...
87,2023-03-29,2044
88,2023-03-30,2074
89,2023-03-31,2100
90,2023-04-01,2133


In [52]:
# Compare total actual revenue with the total of the forecast's upper bound.
# The two frames are joined on the date first, so both sums cover exactly the
# same days even if the forecast horizon and the actual data differ in length.
actual_by_date = df_promo.assign(ds=pd.to_datetime(df_promo['Dates']))
comparison = actual_by_date.merge(
    prediction[['ds', 'yhat_upper']],
    on='ds',
    how='inner',
)

# The upper bound is the most optimistic no-promo scenario, so actual revenue
# above it is a conservative sign of a positive effect.
print(comparison['yhat_upper'].sum())
print(comparison['Revenue'].sum())

156382.67288753236
184435


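## Вывод: акция положительно повлияла на выручку — фактическая выручка за период акции (184 435) превысила даже верхнюю границу прогноза (≈156 383) примерно на 18%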