<a href="https://colab.research.google.com/github/bhuvaneshkj/Finops-Cloud-Tool/blob/Dev_Bhuvanesh/CapstoneResourcePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.display import clear_output
try:
  !pip install pystan
  !pip install --upgrade git+https://github.com/jroakes/google-analytics.git
  !pip install fbprophet
  !pip install xgboost
except:
  pass
finally:
  clear_output()
  print('All Loaded')

In [None]:
import pandas as pd
import datetime

In [None]:
from fbprophet import Prophet
import seaborn as sns


In [None]:
# Location of the preprocessed usage data (Colab's /content upload directory).
DATA_PATH = '/content/ProcessedData.csv'

# Load the CSV into the working frame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summary statistics of the loaded frame, as a quick sanity check of value ranges.
df.describe()

In [None]:
# Index the frame by its "Time" column. The guard makes the cell safe to
# re-run: once "Time" has become the index it is no longer a column, and an
# unconditional set_index("Time") would raise KeyError on the second run.
if "Time" in df.columns:
    df = df.set_index("Time")

# Line plot of CPU utilisation against the index; the trailing ';' hides the
# repr of the list returned by ax.set().
ax = df['CPU usage [%]'].plot(figsize=(16, 5), title="CPU % Utilization")
ax.set(xlabel='Dates', ylabel='CPU usage [%]');

In [None]:
# Move the index back into a regular column so it can be selected by name
# when building the Prophet input frame.
df_pr_index = df.reset_index()

# Preview only the first rows. Previously .head() sat mid-cell (where its
# result is never displayed) and the cell ended by echoing the whole frame.
df_pr_index.head()

In [None]:
# Constant 'cap' column added to df.
# NOTE(review): the Prophet model below is built with default arguments and is
# fit on df_pr, which does not contain 'cap' — confirm whether it is needed.
df['cap'] = 8.5

# Prophet requires the timestamp column to be named 'ds' and the target 'y'.
df_pr = df_pr_index[['Time', 'CPU usage [%]']].rename(
    columns={'Time': 'ds', 'CPU usage [%]': 'y'}
)

# Hold out the last 20000 rows as the test split; everything before is train.
split_at = len(df) - 20000
train_data_pr = df_pr.iloc[:split_at]
test_data_pr = df_pr.iloc[split_at:]

# Fit on the training rows only.
m = Prophet()
m.fit(train_data_pr)

# Extend the training timeline by 3 month-start ('MS') periods and predict.
# NOTE(review): this horizon is not aligned row-for-row with test_data_pr —
# confirm before scoring the forecast against the held-out rows.
future = m.make_future_dataframe(periods=3, freq='MS')
prophet_pred = m.predict(future)
prophet_pred.tail()

In [None]:
# First rows of the frame returned by m.predict, to inspect its columns.
prophet_pred.head()

In [None]:
# Plot the forecast with the fitted model; the returned figure is kept in fig1.
fig1 = m.plot(prophet_pred)

In [None]:
# Plotly-based plotting helpers shipped with fbprophet.
# NOTE(review): plot_components_plotly is imported but not called in the cells visible here.
from fbprophet.plot import plot_plotly, plot_components_plotly

# Interactive version of the forecast plot.
plot_plotly(m, prophet_pred)


In [None]:
# Plot the forecast components. plot_components returns the figure; the
# trailing ';' suppresses the cell's return value so the figure is not
# rendered a second time.
m.plot_components(prophet_pred);

In [None]:
from sklearn.metrics import mean_absolute_error
#mean_absolute_error(float(test_data_pr.iloc[len(test_data_pr)-5223:]), prophet_pred.yhat)

In [None]:

# xgboost
import xgboost as xgb
import numpy as np
# Print the installed xgboost version so it is recorded in the cell output.
print("xgboost", xgb.__version__)

In [None]:
# Default-configured XGBoost regressor.
# NOTE(review): this instance is not referenced by the cells visible here;
# xgboost_forecast constructs its own regressor on every call.
model = xgb.XGBRegressor()

In [None]:

# transform a time series dataset into a supervised learning dataset
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of lagged values.

    Each output row holds, left to right, the observations at
    t-n_in, ..., t-1 (inputs) followed by t, ..., t+n_out-1 (outputs).
    For multivariate input every lag contributes one column per variable.

    Args:
        data: Sequence of observations; anything ``pd.DataFrame`` accepts
            (list, 1-D or 2-D array, Series, DataFrame).
        n_in: Number of lagged time steps used as inputs.
        n_out: Number of time steps (starting at t) used as outputs.
        dropnan: Drop the rows made incomplete by shifting.

    Returns:
        2-D numpy array with ``(n_in + n_out) * n_variables`` columns.
    """
    # The original computed an unused ``n_vars`` via ``data.shape[1]``, which
    # raised IndexError on 1-D arrays/Series; DataFrame() handles those fine.
    frame = pd.DataFrame(data)

    # input sequence (t-n, ... t-1), oldest lag first
    shifted = [frame.shift(lag) for lag in range(n_in, 0, -1)]
    # forecast sequence (t, t+1, ... t+n)
    shifted.extend(frame.shift(-step) for step in range(n_out))

    # put it all together, side by side
    agg = pd.concat(shifted, axis=1)
    # shifting leaves NaNs at the edges; drop those incomplete rows
    if dropnan:
        agg = agg.dropna()
    return agg.values



In [None]:

# walk-forward validation for univariate data
def walk_forward_validation(data, n_test):
    """Evaluate one-step forecasts with walk-forward validation.

    The last ``n_test`` rows of ``data`` are held out. For each held-out row
    a model is fit on all rows seen so far, the row's target is predicted,
    and the true row is then appended to the history.

    Args:
        data: 2-D array whose last column is the target and whose other
            columns are the inputs.
        n_test: Number of trailing rows to hold out.

    Returns:
        Tuple ``(mae, expected, predictions)``: the mean absolute error, the
        true target values of the held-out rows, and the list of forecasts.
    """
    predictions = list()
    # split dataset
    train, test = train_test_split(data, n_test)
    # seed history with training dataset
    history = list(train)
    # step over each time-step in the test set
    for i in range(len(test)):
        # split test row into input and output columns
        testX, testy = test[i, :-1], test[i, -1]
        # fit model on history and make a prediction
        yhat = xgboost_forecast(history, testX)
        # store forecast in list of predictions
        predictions.append(yhat)
        # add actual observation to history for the next loop
        history.append(test[i])
        # summarize progress
        print('>expected=%.1f, predicted=%.1f' % (testy, yhat))
    # estimate prediction error
    expected = test[:, -1]
    error = mean_absolute_error(expected, predictions)
    # Return the target column (-1), i.e. the same values the error was
    # computed on; index 1 is one of the input columns, not the target.
    return error, expected, predictions

In [None]:

# split a univariate dataset into train/test sets
def train_test_split(data, n_test):
    """Split rows chronologically: the last ``n_test`` rows become the test set.

    Args:
        data: 2-D array of rows in time order.
        n_test: Number of trailing rows to hold out (non-negative).

    Returns:
        Tuple ``(train, test)`` of 2-D slices of ``data``.

    Raises:
        ValueError: If ``n_test`` is negative.
    """
    if n_test < 0:
        raise ValueError("n_test must be non-negative")
    # Use an explicit split position instead of negative slicing: with
    # n_test == 0, data[:-0] is empty and data[-0:] is everything, which
    # silently swapped the roles of the two sets.
    split_at = max(len(data) - n_test, 0)
    return data[:split_at, :], data[split_at:, :]


In [None]:

# fit an xgboost model and make a one step prediction
def xgboost_forecast(train, testX):
    """Fit an XGBoost regressor on ``train`` and predict a single step.

    Args:
        train: Sequence of rows (or 2-D array); the last column of each row
            is the target, the preceding columns are the inputs.
        testX: 1-D sequence of input values for the step to predict.

    Returns:
        The predicted value for ``testX`` (first element of the model output).
    """
    # transform list of rows into a 2-D array
    train = np.asarray(train)
    # split into input and output columns
    trainX, trainy = train[:, :-1], train[:, -1]
    # fit model
    model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000)
    model.fit(trainX, trainy)
    # make a one-step prediction; hand predict() an explicit (1, n_features)
    # array rather than a Python list wrapping an array, so it does not
    # depend on how the installed xgboost version converts list inputs
    yhat = model.predict(np.asarray(testX).reshape(1, -1))
    return yhat[0]

In [None]:

# Build the modelling frame in this cell: df1 is otherwise only assigned in a
# later cell, so on a fresh kernel (Restart & Run All) this cell would fail
# with NameError.
df1 = df[['Timestamp [ms]', 'CPU usage [%]']]
values = df1.values
# transform the time series data into supervised learning
# (6 lagged steps as inputs; the last column is CPU usage at time t)
data = series_to_supervised(values, n_in=6)
# evaluate with walk-forward validation on the last 12 rows
mae, y, yhat = walk_forward_validation(data, 12)
print('MAE: %.3f' % mae)
# plot expected vs predicted


In [None]:
import matplotlib.pyplot as plt

# Overlay the held-out target values and the walk-forward forecasts on one
# set of axes, one labelled line per series.
for _series, _label in ((y, 'Expected'), (yhat, 'Predicted')):
    plt.plot(_series, label=_label)
plt.legend()
plt.show()

 **Stationarity of a Time Series**

A time series (TS) is said to be stationary if its statistical properties, such as mean and variance, remain constant over time. But why is it important? Most TS models work on the assumption that the TS is stationary. Intuitively, we can say that if a TS has a particular behaviour over time, there is a very high probability that it will follow the same behaviour in the future. Also, the theories related to stationary series are more mature and easier to implement compared to those for non-stationary series.

In [None]:
# Keep only the timestamp and CPU-usage columns for the stationarity analysis.
df1=df[['Timestamp [ms]','CPU usage [%]']]

In [None]:
# Display df1 indexed by timestamp.
# NOTE(review): set_index returns a new frame and the result is not assigned,
# so df1 itself still has 'Timestamp [ms]' as a regular column after this cell.
df1.set_index('Timestamp [ms]')

Dickey-Fuller Test: This is one of the statistical tests for checking stationarity. Here the null hypothesis is that the TS is non-stationary. The test results comprise a Test Statistic and some Critical Values for different confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    rolmean = timeseries.rolling(window=12).mean()
    rolstd = timeseries.rolling(window=12).std()

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print ('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)

In [None]:
# The Dickey-Fuller test needs a single one-dimensional series, but df1 holds
# two columns. Pass only the CPU-usage column, indexed by timestamp so the
# plot's x-axis is time.
test_stationarity(df1.set_index('Timestamp [ms]')['CPU usage [%]'])