In [None]:
import pandas as pd

In [None]:
# Load the raw data; later cells rely on 'Year', 'Month' and ' Sales' columns being present.
df = pd.read_csv('TimeSeries.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Build a first-of-month timestamp from the Year and Month columns:
# pd.to_datetime assembles a date from the year/month/day columns of a frame.
date_parts = df[['Year', 'Month']].copy()
date_parts['DAY'] = 1
df['Date'] = pd.to_datetime(date_parts)

In [None]:
# Year and Month are now encoded in 'Date', so drop the two source columns.
df = df.drop(columns=['Year', 'Month'])

In [None]:
# Inspect the frame now that Year/Month have been replaced by the Date column.
df

In [None]:
# Check the dtype of every column (the sales column is parsed to numeric in a later cell).
df.dtypes

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# List the column names exactly as read from the file.
df.columns

In [None]:
# The sales header is read with a leading space (' Sales'); normalise it to 'Sales'.
df = df.rename(columns={' Sales': 'Sales'})

In [None]:
# Confirm the column names after the rename.
df.columns

In [None]:
# Inspect the frame after the column rename.
df

In [None]:
# Sales is text with ',' as a thousands separator: strip the commas,
# then parse the remaining text as numbers.
sales_without_commas = df["Sales"].str.replace(',', '')
df["Sales"] = pd.to_numeric(sales_without_commas)

In [None]:
# Inspect the frame after converting Sales to numeric.
df

In [None]:
# Verify the column dtypes after the numeric conversion.
df.dtypes

In [None]:
# Use the Date column as the index so the frame behaves as a time series.
df = df.set_index('Date')

In [None]:
# Preview the last rows, now indexed by Date.
df.tail()

In [None]:
# Summary statistics, with extra upper-tail percentiles (90th/95th/99th) requested.
df.describe(percentiles=[.25,.5,.75,.90,.95,.99])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Re-check for missing values before plotting.
df.isnull().sum()

# Visualize the data

In [None]:
# Quick line plot of the frame's columns against its Date index.
df.plot()

In [None]:
# Box plot of Sales to look for outliers.
# The series is passed as the keyword `x` instead of positionally: seaborn has
# changed what the first positional argument means between releases, so the
# keyword keeps the plot (a horizontal box) the same regardless of version.
plt.figure(figsize=(13, 9))
sns.boxplot(x=df['Sales'])
plt.show()

#### Inference: We can clearly see that there are outliers in the data.

We will keep the outliers as they are. We cannot delete them because this is time series data, and removing any data point would break the continuity of the series.


## Use time series analysis and decomposition techniques to identify the trend, seasonal, and residual components in the sales data.


In [None]:
# Take an independent copy for the decomposition work. A plain `df1 = df` only
# creates a second name for the same object, so columns added to `df` later
# would silently show up in `df1` as well.
df1 = df.copy()

In [None]:
# Display df1.
df1

In [None]:
# Check the dtype of the index (set from the 'Date' column in an earlier cell).
df1.index.dtype

In [None]:
# Wide line chart of the series, with vertical bold tick labels on the x axis.
fig, ax = plt.subplots(figsize=(18, 4))
ax.plot(df1, label='Sales')
ax.legend(loc='best')
plt.xticks(rotation=90, fontweight="bold")
plt.show()

### Additive seasonal decomposition

In [None]:
from pylab import rcParams
import statsmodels.api as sm

rcParams['figure.figsize'] = 12, 8

# Additive decomposition: Sales = trend + seasonal + residual.
# period=12 (monthly data, yearly cycle) is passed explicitly so the call does
# not depend on statsmodels being able to infer a frequency from the index,
# which is built from Year/Month columns and may not carry one.
decomposition = sm.tsa.seasonal_decompose(df1.Sales, model='additive', period=12)
fig = decomposition.plot()
plt.show()

### Multiplicative seasonal decomposition

In [None]:
# Multiplicative decomposition: Sales = trend * seasonal * residual.
# period=12 (monthly data, yearly cycle) is passed explicitly so the call does
# not depend on statsmodels being able to infer a frequency from the index.
decomposition = sm.tsa.seasonal_decompose(df1.Sales, model='multiplicative', period=12)
fig = decomposition.plot()
plt.show()

#### Inference : The data contains level, trend and seasonality.

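## Conduct a hypothesis test to determine whether the sales data contains any seasonal dependencies.

Note: the Augmented Dickey-Fuller (ADF) test used below checks for a unit root (non-stationarity); it does not test for seasonality directly.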

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Run the Augmented Dickey-Fuller test on the raw Sales series.
# NOTE(review): test_results is not read by any of the cells visible here.
test_results = adfuller(df['Sales'])

In [None]:
def adfuller_test(sales):
    """Run the Augmented Dickey-Fuller test on ``sales`` and print a summary.

    Prints the test statistic, the p-value, the number of lags used and the
    number of observations used, followed by a verdict based on comparing the
    p-value with 0.05.

    Args:
        sales: The series handed straight to ``adfuller``.

    Returns:
        None. Everything is written to stdout.
    """
    adf_output = adfuller(sales)
    field_names = ('ADF Test Statistic', 'p-value', '#Lags used', 'Number of observations used')
    # zip stops at the shorter sequence, so only the leading entries of the
    # adfuller output (one per name above) are reported.
    for field_name, field_value in zip(field_names, adf_output):
        print(' : '.join((field_name, str(field_value))))
    p_value = adf_output[1]
    verdict = (
        "strong evidence against the null hypothesis(Ho), reject the null hypothesis. Data has no unit root and is stationary"
        if p_value <= 0.05
        else "weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary "
    )
    print(verdict)
    

In [None]:
# Print the ADF summary for the undifferenced Sales series.
adfuller_test(df['Sales'])

## Dickey Fuller Test

In [None]:
# Run the Dickey-Fuller test on the Sales series.
result = adfuller(df['Sales'])

# Keep the pieces reported in the following cells: the statistic and p-value
# (first two entries) and the mapping of critical values (entry 4).
test_statistic, p_value = result[:2]
critical_values = result[4]



In [None]:
# Compare the p-value with the significance level and report the outcome.
alpha = 0.05
if p_value < alpha:
    verdict = (
        f"The p-value ({p_value:.4f}) is less than the significance level ({alpha}), "
        "so we reject the null hypothesis that the time series is non-stationary and conclude that the data is stationary."
    )
else:
    verdict = (
        f"The p-value ({p_value:.4f}) is greater than the significance level ({alpha}), "
        "so we fail to reject the null hypothesis that the time series is non-stationary and conclude that the data is non-stationary."
    )
print(verdict)
    


In [None]:
# Show the test statistic next to the critical value for each significance level.
print(f"Test statistic: {test_statistic:.4f}")
print("Critical values:")
for level in critical_values:
    threshold = critical_values[level]
    print(f"\t{level}: {threshold:.4f}")

### If the series is non-stationary, convert it into a stationary one!

## Differencing

In [None]:
# Change in Sales from one row to the next (the first row has no predecessor, so it is NaN).
df['Sales First Difference'] = df['Sales'].diff()

In [None]:
# Change in Sales versus the value 12 rows earlier (the first 12 rows are NaN).
df['Seasonal First Difference'] = df['Sales'].diff(periods=12)

In [None]:
# Show 14 rows so the first non-NaN seasonal differences (after the 12-row shift) are visible.
df.head(14)

In [None]:
# Re-run the Dickey-Fuller test on the seasonally differenced series,
# dropping the leading NaN rows produced by the shift first.
adfuller_test(df['Seasonal First Difference'].dropna())

In [None]:
# Plot the seasonally differenced series.
df['Seasonal First Difference'].plot()

# Auto Regressive Model

In [None]:
from pandas.plotting import autocorrelation_plot
# Autocorrelation plot of the raw (undifferenced) Sales series.
autocorrelation_plot(df['Sales'])
plt.show()

Final Thoughts on Autocorrelation and Partial Autocorrelation
Identification of an AR model is often best done with the PACF.

For an AR model, the theoretical PACF “shuts off” past the order of the model. The phrase “shuts off” means that in theory the partial autocorrelations are equal to 0 beyond that point. Put another way, the number of non-zero partial autocorrelations gives the order of the AR model. By the “order of the model” we mean the most extreme lag of x that is used as a predictor.
Identification of an MA model is often best done with the ACF rather than the PACF.

For an MA model, the theoretical PACF does not shut off, but instead tapers toward 0 in some manner. A clearer pattern for an MA model is in the ACF. The ACF will have non-zero autocorrelations only at lags involved in the model.
An ARIMA order is written (p, d, q): p is the number of AR lags, d is the degree of differencing, and q is the number of MA lags.

In [None]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf

In [None]:
# ACF (top) and PACF (bottom) of the seasonally differenced series.
# The 12-row shift leaves exactly 12 leading NaN rows, so slicing from
# position 13 would also discard the first valid observation; dropna()
# removes only the rows that are actually missing.
seasonal_diff = df['Seasonal First Difference'].dropna()
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
sm.graphics.tsa.plot_acf(seasonal_diff, lags=40, ax=ax1)
sm.graphics.tsa.plot_pacf(seasonal_diff, lags=40, ax=ax2)
plt.show()

# SARIMA

In [None]:
import statsmodels.api as sm


In [None]:

# Seasonal ARIMA on Sales: non-seasonal order (1, 1, 1) and seasonal order
# (1, 1, 1) with a seasonal period of 12.
model = sm.tsa.statespace.SARIMAX(
    df['Sales'],
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
)
results = model.fit()

In [None]:

# Predict row positions 140 through 154 and overlay the result on the actual Sales.
# NOTE(review): 140 and 154 are hard-coded positions — confirm they fit the length of df.
df['forecast']=results.predict(start=140,end=154,dynamic=True)
df[['Sales','forecast']].plot(figsize=(12,8))

In [None]:
from pandas.tseries.offsets import DateOffset

# The last observed date followed by the next 3 months (month offsets 0 through 3).
last_observed = df.index[-1]
future_dates = [last_observed + DateOffset(months=step) for step in range(4)]

In [None]:
# Placeholder frame for the future dates with the same columns as df; the first
# entry of future_dates is skipped because it is the last date already in df.
future_datest_df=pd.DataFrame(index=future_dates[1:],columns=df.columns)

In [None]:
# Preview the placeholder rows for the future dates.
future_datest_df.tail()

In [None]:
# Append the placeholder rows for the future dates to the historical data.
# `df` is used directly: the placeholder frame was built from df.columns, and
# this no longer relies on `df1` happening to be the same object as `df`.
future_df = pd.concat([df, future_datest_df])

In [None]:
# Predict row positions 151 through 157 and plot the result alongside Sales.
# NOTE(review): 151 and 157 are hard-coded positions — confirm they line up with
# the end of the data and the appended future dates.
future_df['forecast'] = results.predict(start = 151, end = 157, dynamic= True)  
future_df[['Sales', 'forecast']].plot(figsize=(12, 8)) 

### If seasonal patterns are present, apply advanced time series forecasting techniques such as the seasonal ARIMA model or exponential smoothing or xgboost, prophet, to capture the seasonal effects in the data.

## Don't know what to do next!

# Random Forest