In [110]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Objective:
## 1) To get the modal price of onion for each month for the Mumbai market 
## 2) To build a time series model and check the performance of the model using RMSE
## 3) To plot ACF and PACF plots
## 4) To perform Exponential smoothing using Holt-Winter’s technique and to forecast onion price for Mumbai market

In [111]:
#import required basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = [15,20]
import itertools

In [112]:
filename="/kaggle/input/market-price-of-onion-2020/Onion Prices 2020.csv"
df = pd.read_csv(filename)
df.head()

In [113]:
df.shape

In [114]:
df.info()

In [115]:
# Parse the arrival_date column into datetime values so it can later serve as the time-series index.
# NOTE(review): no explicit format / dayfirst is given, so ambiguous day/month strings are parsed with
# pandas' defaults — confirm against the raw CSV that dates are not being read month-first by mistake.
df.arrival_date = pd.DatetimeIndex(df.arrival_date)
df.dtypes

In [116]:
pd.unique(df.district), pd.unique(df.state), pd.unique(df.market)

In [117]:
# Use the arrival date as the index so the frame behaves as a time series.
# The guard makes this cell safe to re-run: once arrival_date has become the
# index it is no longer a column, and an unguarded set_index would raise KeyError.
if "arrival_date" in df.columns:
    df = df.set_index("arrival_date")
df.head()

In [118]:
# Order the rows chronologically. "arrival_date" is the index level name at this point
# (it was made the index in an earlier cell); sort_values accepts index level names in `by`.
df = df.sort_values(by="arrival_date")
df.head()

In [119]:
# Keep only the rows whose district is Mumbai.
is_mumbai = df["district"] == "Mumbai"
df2020_mum = df[is_mumbai]
df2020_mum.head()

In [120]:
df2020_mum.shape

In [121]:
# For easier reading of the describe() table, display numbers with 0 decimal places.
# The option is addressed by its full key "display.precision": in newer pandas releases
# the abbreviated "precision" pattern can match more than one option and raises OptionError.
pd.set_option("display.precision", 0)
df2020_mum.describe()

In [122]:
df2020_mum.index

In [123]:
# Discard the descriptive and min/max price columns that the rest of the analysis does not use.
unused_columns = ["state","district","market","commodity","variety","min_price","max_price"]
df2020_mum = df2020_mum.drop(columns=unused_columns)
df2020_mum.head()

In [124]:
# Decompose the modal price series into trend, seasonal and residual components
# (multiplicative model) and plot the components.
from statsmodels.tsa.seasonal import seasonal_decompose
# NOTE(review): period=1 makes the seasonal cycle a single observation long, so no seasonal
# pattern can actually be separated out — confirm the intended period
# (e.g. the number of observations per week or per month).
decompose_result = seasonal_decompose(df2020_mum['modal_price'],model='multiplicative',period=1)
decompose_result.plot()
plt.show()

# Modal price of onion for each month for the Mumbai market 

In [125]:
df2020_mum.plot(kind="line", y=["modal_price"],figsize=[30,5])
plt.show()

In [126]:
#Log-transformations can help to stabilize the variance of a time series
df2020_mum.modal_price.plot(kind="hist", bins=20)

In [127]:
# Store the natural log of the modal price as a new column and show its histogram.
df2020_mum["log_modal_price"] = np.log(df2020_mum["modal_price"])
df2020_mum["log_modal_price"].plot.hist(bins=20)

In [128]:
df2020_mum.log_modal_price.plot(figsize=[30,5])

# Building a time series model

In [129]:
df2020_mum.head()

In [130]:
x=df2020_mum.index.copy()

In [131]:
# NOTE(review): attribute-style assignment to a name that is not an existing column does NOT
# add a "date" column — pandas stores it as a plain Python attribute on this object (and emits
# a UserWarning). It therefore does not show up in head() and is lost when the frame is copied.
df2020_mum.date = x

In [132]:
df2020_mum.head()

In [133]:
# Elapsed time since the first observation, as a timedelta starting from 0.
# Computed directly from the datetime index rather than from the ad-hoc `date`
# attribute, which is not a real column and vanishes whenever the frame is
# copied or re-created by re-running an earlier cell.
df2020_mum["timeindex"] = df2020_mum.index - df2020_mum.index.min()
df2020_mum.head()

In [134]:
df2020_mum.info()

In [135]:
# Convert the timedelta into a (float) number of days by dividing by a one-day timedelta;
# the rounding to an integer happens in the next cell.
df2020_mum["timeindex"] = df2020_mum["timeindex"]/np.timedelta64(1,"D")
df2020_mum.head()

In [136]:
df2020_mum["timeindex"] = df2020_mum["timeindex"].round(0).astype(int)
df2020_mum

In [137]:
# Fit an ordinary least squares line to the log price as a function of elapsed days (timeindex).
import statsmodels.formula.api as smf
linear_model = smf.ols('log_modal_price ~ timeindex', data = df2020_mum).fit()
linear_model.summary()

In [138]:
linear_model_pred = linear_model.predict()
linear_model_pred

In [139]:
df2020_mum.plot(kind = "line", x="timeindex", y="log_modal_price", figsize=[20,5])
plt.plot(df2020_mum.timeindex, linear_model_pred)

In [140]:
linear_model.resid.plot(kind="bar", figsize=[20,5])

# Evaluate the model using RMSE

In [141]:
df2020_mum["linear_price"] = np.exp(linear_model_pred)
df2020_mum.head()

In [142]:
df2020_mum_dummy= df2020_mum.set_index('timeindex')
df2020_mum_dummy.head()

In [143]:
df2020_mum_dummy.modal_price.plot(figsize=[30,5])

In [144]:
df2020_mum_dummy.log_modal_price.plot(figsize=[30,5])

In [145]:
def RMSE(actual, predicted):
    """Return the root-mean-square error between two aligned pandas Series.

    The element-wise squared differences are summed and divided by the number
    of non-missing squared differences, so positions where either input is
    missing contribute to neither the sum nor the count.
    """
    difference = actual - predicted
    squared_error = difference * difference
    n_valid = squared_error.count()
    return np.sqrt(squared_error.sum() / n_valid)

In [146]:
linear_model_RMSE = RMSE(df2020_mum.modal_price, df2020_mum.linear_price)
linear_model_RMSE

# Plotting ACF & PACF Plots

In [147]:
df2020_mum.head()

## ACF & PACF for Modal_price

Augmented Dickey-Fuller (ADF) test to check whether the series is stationary before plotting the ACF

In [148]:
from statsmodels.tsa.stattools import adfuller
def adf_test(series):
    """Run the Augmented Dickey-Fuller test on ``series`` and print the outcome.

    Prints the first two entries of the adfuller result (labelled as the ADF
    statistic and the p-value), followed by a verdict based on a 0.05
    threshold for the p-value. Nothing is returned.
    """
    outcome = adfuller(series)
    statistic, p_value = outcome[0], outcome[1]
    print(f'ADF Statistics: {statistic}')
    print(f'p- value: {p_value}')
    rejects_unit_root = p_value <= 0.05
    verdict = (
        "strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root and is stationary"
        if rejects_unit_root
        else "weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary "
    )
    print(verdict)

In [149]:
adf_test(df2020_mum['modal_price'])

In [150]:
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
acf = plot_acf(df2020_mum["modal_price"].dropna())

In [151]:
pacf = plot_pacf(df2020_mum["modal_price"].dropna())

## ACF & PACF for log_modal_price

In [152]:
adf_test(df2020_mum['log_modal_price'])

In [153]:
acf = plot_acf(df2020_mum["log_modal_price"].dropna())

In [154]:
pacf = plot_pacf(df2020_mum["log_modal_price"].dropna())

# performing exponential smoothing using Holt-Winter’s technique

In [155]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [156]:
# Set the value of Alpha and define x as the time period
# NOTE(review): neither `x` nor `alpha` is passed to any ExponentialSmoothing call visible in
# this notebook, so they have no effect on the fits below. This also rebinds `x`, which
# earlier held a copy of the date index.
x = 12
alpha = 1/(2*x)

In [157]:
# Fit trend-only exponential smoothing to the raw modal price, once with an additive
# and once with a multiplicative trend, and keep the in-sample fitted values for plotting.
price_series = df2020_mum['modal_price']
for fitted_column, trend_kind in (('HWES2_ADD', 'add'), ('HWES2_MUL', 'mul')):
    df2020_mum[fitted_column] = ExponentialSmoothing(price_series, trend=trend_kind).fit().fittedvalues
df2020_mum[['modal_price','HWES2_ADD','HWES2_MUL']].plot(
    title='Holt Winters graph: Additive Trend and Multiplicative Trend',
    figsize=[30,5],
)

In [158]:
# Repeat the trend-only exponential smoothing on the log-transformed price.
# NOTE: this overwrites the HWES2_ADD / HWES2_MUL columns that the previous cell
# fitted on the raw price, so after this cell they hold log-scale fitted values.
df2020_mum['HWES2_ADD'] = ExponentialSmoothing(df2020_mum['log_modal_price'],trend='add').fit().fittedvalues
df2020_mum['HWES2_MUL'] = ExponentialSmoothing(df2020_mum['log_modal_price'],trend='mul').fit().fittedvalues
df2020_mum[['log_modal_price','HWES2_ADD','HWES2_MUL']].plot(title='Holt Winters graph: Additive Trend and Multiplicative Trend',figsize=[30,5])

# Forecasting

In [159]:
df2020_mum.head(2)

In [160]:
df2020_mum=df2020_mum.drop(['log_modal_price','timeindex','linear_price','HWES2_ADD','HWES2_MUL'],axis=1)

In [161]:
df2020_mum

In [183]:
# Hold out the final 25 rows as the test set; every row before them is training data.
train_df2020_mum = df2020_mum.iloc[:-25]
test_df2020_mum = df2020_mum.iloc[-25:]

In [184]:
# Holt-Winters model with an additive trend and multiplicative seasonality, fitted on the training rows only.
# NOTE(review): seasonal_periods=25 equals the hold-out length; presumably it approximates
# one month of market days — confirm that this is the intended seasonal cycle.
model = ExponentialSmoothing(train_df2020_mum, seasonal_periods=25, trend='add', seasonal='mul')
fitted = model.fit()

In [185]:
sales_forecast = fitted.forecast(steps=25)

In [186]:
# Plot the training history, the held-out actual prices and the Holt-Winters forecast together.
# The series are onion modal prices, so the legend says "Price" (the previous labels said "Sales"),
# and both axes are labelled so the figure can be read on its own.
fig = plt.figure()
fig.suptitle('Modal price of Onion')
past, = plt.plot(train_df2020_mum.index, train_df2020_mum, 'b.-', label='Price History')
future, = plt.plot(test_df2020_mum.index, test_df2020_mum, 'r.-', label='Actual Price')
predicted_future, = plt.plot(test_df2020_mum.index, sales_forecast, 'g.-', label='Price Forecast')
plt.xlabel('Arrival date')
plt.ylabel('Modal price')
plt.legend(handles=[past, future, predicted_future])
plt.show()