#IMPORTS


In [None]:
%%capture
!pip install statsmodels==0.13.0

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller


In [None]:
from datetime import datetime
import pandas as pd

In [None]:
import numpy as np

In [None]:
from sklearn.preprocessing import StandardScaler

#Helper Functions

In [None]:

def split_dataset_for_time_series(dataframe_to_split,frac):
  """Cut a time-ordered dataset into a leading and a trailing part.

  The order of rows is preserved (no shuffling), so the first part can be
  used for training and the remainder for testing.

  Args:
    dataframe_to_split: object supporting ``len()`` and ``.iloc`` slicing.
    frac: fraction of rows that goes to the first (train) part; the row
      count is truncated with ``int()``.

  Returns:
    Tuple ``(train_data, test_data)``.
  """
  cut = int(frac * len(dataframe_to_split))
  rows = dataframe_to_split.iloc
  return rows[:cut], rows[cut:]


In [None]:
%%capture
def fetch_data_from_yfinance(ticker , start_date = datetime(2016, 10, 1), end_date = datetime(2021, 10, 1) ): # dates are datetime objects: datetime(year, month, day)
  """Download price history for ``ticker`` through ``yfinance``.

  Args:
    ticker: symbol passed straight to ``yf.download``.
    start_date: start of the requested window (default 1 Oct 2016).
    end_date: end of the requested window (default 1 Oct 2021).

  Returns:
    Whatever ``yf.download`` returns (presumably a pandas DataFrame of price
    columns indexed by date -- confirm against the installed yfinance version).
  """
  # IPython shell escape: only valid inside a notebook / IPython session.
  # NOTE(review): it sits in the function body, so pip appears to be invoked
  # on every call -- confirm this is intended.
  !pip install yfinance
  import yfinance as yf
  
  dataFrame = yf.download(ticker, start = start_date, end = end_date)

  return dataFrame

In [None]:
def fetch_close_price_for_time_range(ticker,start_year,end_year):
  """Fetch only the 'Close' column for ``ticker`` between two years.

  The requested window runs from 1 January of ``start_year`` to 1 January of
  ``end_year`` and is downloaded with ``fetch_data_from_yfinance``.

  Returns:
    A one-column selection (``[['Close']]``) of the downloaded data.
  """
  window_start = datetime(start_year, 1, 1)
  window_end = datetime(end_year, 1, 1)
  prices = fetch_data_from_yfinance(ticker, start_date=window_start, end_date=window_end)
  return prices[['Close']]

In [None]:
# Close prices for ticker 'SBIN.NS', requested window 2018-01-01 to 2019-01-01.
# Running this cell triggers the download performed by the fetch helper.
sbi_close_price = fetch_close_price_for_time_range('SBIN.NS',2018,2019)

In [None]:
def plot_acf_custom(time_series , lags=25 , zero =False , ax = None):
  """Draw the autocorrelation plot of ``time_series``.

  Thin wrapper around statsmodels' ``plot_acf`` with notebook-friendly
  defaults.

  Args:
    time_series: the observations to analyse.
    lags: forwarded to ``plot_acf`` as ``lags``.
    zero: forwarded to ``plot_acf`` as ``zero`` (whether to include the
      lag-0 autocorrelation). Defaults to False.
    ax: optional matplotlib axes to draw on, forwarded as ``ax``.

  Returns:
    None; the plot is produced as a side effect.
  """
  # Forward the caller's ``zero`` flag. It used to be hard-coded to False,
  # which silently ignored ``zero=True``.
  plot_acf(time_series, lags=lags, zero=zero, ax=ax)



In [None]:
def plot_pacf_custom(time_series , lags=25 , zero =False , ax = None):
  """Draw the partial autocorrelation plot of ``time_series``.

  Thin wrapper around statsmodels' ``plot_pacf`` with notebook-friendly
  defaults.

  Args:
    time_series: the observations to analyse.
    lags: forwarded to ``plot_pacf`` as ``lags``.
    zero: forwarded to ``plot_pacf`` as ``zero`` (whether to include the
      lag-0 value). Defaults to False.
    ax: optional matplotlib axes to draw on, forwarded as ``ax``.

  Returns:
    None; the plot is produced as a side effect.
  """
  # Forward the caller's ``zero`` flag. It used to be hard-coded to False,
  # which silently ignored ``zero=True``.
  plot_pacf(time_series, lags=lags, zero=zero, ax=ax)

In [None]:
def decompose_time_series(time_series_data ,period):
  """Decompose a series with statsmodels' ``seasonal_decompose``.

  Args:
    time_series_data: the series to decompose.
    period: forwarded to ``seasonal_decompose`` as ``period``.

  Returns:
    The result object returned by ``seasonal_decompose``.
  """
  return seasonal_decompose(time_series_data, period=period)


In [None]:
def difference_transform(time_series_data, lags):
  """Difference a series and discard the undefined leading values.

  Args:
    time_series_data: pandas object exposing ``.diff`` and ``.dropna``.
    lags: distance between the two observations being subtracted,
      passed to ``diff`` as ``periods``.

  Returns:
    The differenced data with NaN entries dropped.
  """
  return time_series_data.diff(periods=lags).dropna()


In [None]:
def fetch_data_within_a_time_range(dataset_with_datetime_index:pd.DataFrame , start_date:str , end_date:str):
  """Return the rows whose index lies between ``start_date`` and ``end_date``.

  Args:
    dataset_with_datetime_index: DataFrame indexed by dates.
    start_date: first date to keep, as a date string (e.g. '2018-03-01').
    end_date: last date to keep, as a date string.

  Returns:
    The selected rows; both endpoints are included.
  """
  # Label-based slicing with .loc already includes the end label, so no
  # "+1" is needed (and adding an int to a str raised TypeError).
  return dataset_with_datetime_index.loc[start_date:end_date]


In [None]:
def is_stationary(attribute_values:pd.Series):
  """Run the Augmented Dickey-Fuller test and report whether H0 is rejected.

  Prints the raw ``adfuller`` result, the test statistic, the p-value and the
  critical values, followed by a verdict at the 5% significance level.

  Args:
    attribute_values: the series to test, passed unchanged to ``adfuller``.

  Returns:
    True when the p-value is below 0.05 (H0 rejected, series treated as
    stationary), otherwise False.
  """
  print("Null Hypothesis H0: The series is non-stationary in trends!\n")
  adf_result = adfuller(attribute_values)
  print(adf_result)
  # adfuller returns (statistic, p-value, used lags, n obs, critical values, ...).
  test_stat = adf_result[0]
  p_val = adf_result[1]
  print("Test statistics: ", test_stat)
  print("\nP- value of our series: ",p_val)
  print('\nThe Critical values at 1%, 5% and 10% level of significance are:\n ')
  print(adf_result[4])

  stationary = bool(p_val < 0.05)
  if stationary:
    print("\nWe can REJECT the null hypothesis H0. Our series is stationary")
  else:
    # A hypothesis test never "accepts" H0; it only fails to reject it.
    # Non-stationarity is also not proof of a random walk, so the message
    # no longer states that as a fact.
    print("\nWe FAIL TO REJECT the null hypothesis H0. Our series is likely non-stationary (possibly, but not necessarily, a random walk)")
  return stationary

In [None]:
def _chunk_mean_and_variance(dataset, attribute, n_groups=100):
  """Return ({first_index: mean}, {first_index: variance}) per consecutive chunk."""
  # Never ask for more chunks than there are rows: empty chunks have no
  # first index label and used to raise IndexError.
  n_groups = min(n_groups, len(dataset))
  means = {}
  variances = {}
  if n_groups < 1:
    return means, variances

  # Split row positions rather than the DataFrame itself, then slice by position.
  for positions in np.array_split(np.arange(len(dataset)), n_groups):
    chunk = dataset.iloc[positions][attribute]
    first_label = chunk.index[0]
    means[first_label] = chunk.mean()
    variances[first_label] = chunk.var()  # NaN for a single-row chunk
  return means, variances


def plot_mean_and_variance(dataset:pd.DataFrame , attribute:str, n_groups:int = 100):
  """Plot the chunk-wise mean and variance of one column.

  The rows are split, in order, into ``n_groups`` nearly equal consecutive
  chunks (fewer when the dataset has fewer rows). Each chunk contributes one
  point, placed at the index label of its first row.

  Args:
    dataset: DataFrame holding the data.
    attribute: name of the column to summarise.
    n_groups: number of chunks to split the rows into (default 100).

  Returns:
    None; the figure is shown as a side effect.

  NOTE(review): this function uses a module-level ``plt`` (matplotlib.pyplot),
  but no import of it is visible in this notebook -- confirm it is imported
  before this cell is used.
  """
  dict_of_means, dict_of_variance = _chunk_mean_and_variance(dataset, attribute, n_groups)

  #plotting
  plt.xlabel('Date')
  plt.ylabel('Value')

  plt.plot(list(dict_of_means.keys()), list(dict_of_means.values()),color = "Blue" ,label = 'Mean')
  plt.plot(list(dict_of_variance.keys()), list(dict_of_variance.values()),color="Green" ,label = 'Variance')

  plt.legend(['Mean','Variance'])

  plt.title("Mean an variance")
  plt.show()

In [None]:
def customize_dataset(data, window_size = 10):
  """Turn a sequence into sliding-window samples for a recurrent model.

  Sample ``i`` consists of the ``window_size`` consecutive values starting at
  position ``i``; its target is the value immediately after that window.

  Args:
    data: sequence of observations (list, NumPy array or pandas Series).
      It is converted to a NumPy array first, so windows and targets are
      always taken by position, never by index label.
    window_size: number of past values in each sample (must be >= 1).

  Returns:
    Tuple ``(N, windows, targets)`` where ``N`` is the number of samples,
    ``windows`` has shape ``(N, window_size, 1)`` (the 3-D layout fed to the
    RNN) and ``targets`` holds the ``N`` next-step values.

  Raises:
    ValueError: if ``window_size`` is smaller than 1.
  """
  W = window_size
  if W < 1:
    raise ValueError("window_size must be a positive integer")

  # Positional access only: ``series[end]`` on a pandas Series is a label
  # lookup and breaks for any index that is not 0..n-1 (e.g. dates).
  values = np.asarray(data)
  n_samples = max(len(values) - W, 0)

  windows = np.array([values[start:start + W] for start in range(n_samples)])
  windows = windows.reshape(-1, W, 1)  # (samples, time steps, features)

  # The target of window i is values[i + W], i.e. everything from position W on.
  # Copy so the result does not alias the caller's array.
  targets = np.array(values[W:])

  N = len(windows)
  print("Dataset has been formulated, sending 3 values")
  return N, windows, targets

In [None]:
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.models import Model as mdl

In [None]:
from tensorflow.keras.layers import LSTM, SimpleRNN, Dense
from tensorflow.keras.layers import Input

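### `compile_lstm(window_size, no_of_layers=5, no_of_dense_layers=1, loss='mse', optimizer=Adam, learning_rate=0.1, sample_weight_mode='temporal')`
###  Returns a compiled Keras model: an LSTM layer followed by a Dense layer, taking input of shape (window_size, 1)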

In [None]:
def compile_lstm(window_size,no_of_layers = 5,no_of_dense_layers = 1,loss = 'mse', optimizer = Adam , learning_rate = 0.1,sample_weight_mode= 'temporal'):
  """Build and compile a small LSTM -> Dense Keras model.

  Args:
    window_size: number of time steps per sample; the model input has
      shape ``(window_size, 1)``.
    no_of_layers: passed as the first argument of ``LSTM`` (despite the
      name, only one LSTM layer is created).
    no_of_dense_layers: passed as the first argument of the single
      ``Dense`` output layer.
    loss: loss passed to ``compile``.
    optimizer: optimizer class; it is instantiated with ``learning_rate``.
    learning_rate: learning rate handed to the optimizer.
    sample_weight_mode: forwarded to ``compile``.
      NOTE(review): confirm the installed Keras version still accepts it.

  Returns:
    The compiled model.
  """
  optimizer_instance = optimizer(learning_rate=learning_rate)

  sequence_input = Input(shape=(window_size, 1))
  hidden = LSTM(no_of_layers)(sequence_input)
  prediction = Dense(no_of_dense_layers)(hidden)

  network = mdl(sequence_input, prediction)
  network.compile(
    loss=loss,
    optimizer=optimizer_instance,
    sample_weight_mode=sample_weight_mode,
  )
  return network

### `scaled_return(train_data,test_data,verbose=False) ---> returns **D1**`
###  Defining a generic function that standardizes the train and test set of the time series

In [None]:
def scaled_return(train_data,test_data,verbose=False):
  """Standardise the train and test parts of a series.

  A ``StandardScaler`` is fitted on the training values only and then
  applied to both parts, so no information from the test set enters the
  scaling statistics.

  Args:
    train_data: pandas object whose ``.values`` hold the training data.
    test_data: pandas object whose ``.values`` hold the test data.
    verbose: when truthy, print the array shapes before and after
      flattening.

  Returns:
    Tuple ``(train_scaled, test_scaled)`` of flattened 1-D arrays.
  """
  # StandardScaler expects 2-D input, hence the single-column reshape.
  train_column = train_data.values.reshape(-1,1)
  test_column = test_data.values.reshape(-1,1)

  scaling_instance = StandardScaler().fit(train_column)

  train_scaled = scaling_instance.transform(train_column)
  test_scaled = scaling_instance.transform(test_column)

  if verbose:
    # These prints used to reference an undefined ``scaled_data`` and
    # raised NameError whenever verbose was enabled.
    print(f'Shape after scaling and before flattening: train {train_scaled.shape}, test {test_scaled.shape}')
    print(f'Shape after scaling and flattening: train {train_scaled.flatten().shape}, test {test_scaled.flatten().shape}')

  return (train_scaled.flatten(),test_scaled.flatten())

### Cross validate

### TSCV (time-series cross-validation)
Why? Because ordinary train/test splits assume i.i.d. samples, but in a time series there are temporal dependencies to be accounted for: the target and the attributes are not independent of each other.

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html


In [None]:
def train_test_split_time_series(X,n_splits=5,attr ='Close',gap=0):
  """Create time-ordered train/test index splits for ``X``.

  Args:
    X: the samples to split; passed to ``TimeSeriesSplit.split``.
    n_splits: number of splits, forwarded to ``TimeSeriesSplit``.
    attr: currently unused; kept so existing callers keep working.
    gap: forwarded to ``TimeSeriesSplit`` as ``gap``.

  Returns:
    A generator yielding ``(train_indices, test_indices)`` pairs.
  """
  # Imported here because the notebook only imports TimeSeriesSplit in a
  # later scratch cell, so running the cells top to bottom would otherwise
  # hit a NameError.
  from sklearn.model_selection import TimeSeriesSplit

  tscv = TimeSeriesSplit(gap = gap,n_splits=n_splits)
  return tscv.split(X)

In [None]:
def cross_validate_lstm(X, Y,attr='Close',n_splits=5,epochs=100,verbose=0):
  """Cross-validate a freshly compiled LSTM on time-ordered splits.

  For every split a new model is compiled, fitted on the training indices
  (without shuffling, to keep temporal order) and evaluated on the test
  indices.

  Args:
    X: windowed inputs, indexable by an integer index array and exposing
      ``.shape``; the second dimension is taken as the window size.
    Y: targets aligned with ``X``.
    attr: forwarded to ``train_test_split_time_series``.
    n_splits: number of splits.
    epochs: training epochs per split.
    verbose: verbosity passed to ``fit``.

  Returns:
    List with one ``evaluate`` result per split. The model is compiled with
    a loss only, so these are loss values, not accuracies.
  """
  # Derive the window size from the data instead of hard-coding 10, so
  # datasets built with another window size do not cause a shape mismatch.
  window_size = X.shape[1]

  scores = []
  for train_index, test_index in train_test_split_time_series(X,n_splits=n_splits,attr=attr):
    lstm_model = compile_lstm(window_size=window_size)
    lstm_model.fit(X[train_index], Y[train_index], epochs=epochs, verbose=verbose, shuffle=False)
    scores.append(lstm_model.evaluate(X[test_index],Y[test_index],verbose=0))

  return scores
        





##Rough work (delete later)

In [1]:
import pandas as pd

In [10]:
# Toy 15-element series used to inspect how TimeSeriesSplit partitions indices.
X = pd.Series([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15])

In [11]:
from sklearn.model_selection import TimeSeriesSplit

In [33]:
# Two splits over the toy series, with no gap between train and test indices.
tscv = TimeSeriesSplit(gap =0,n_splits=2)
# split() hands back a generator of (train_indices, test_indices) pairs.
generator_of_split = tscv.split(X)

In [34]:
# One printed line per split: the train indices followed by the test indices.
for train_index, test_index in generator_of_split:
  print("%s %s" % (train_index,test_index)) # each iteration is one train/test split, so the line count equals n_splits



[0 1 2 3 4] [5 6 7 8 9]
[0 1 2 3 4 5 6 7 8 9] [10 11 12 13 14]


In [23]:
# Print the last training index of every split.
# NOTE(review): the recorded output below has five lines, so it appears to come
# from an earlier run with a different splitter configuration; a generator that
# has already been iterated would also yield nothing here -- confirm before relying on it.
for train_index, test_index in generator_of_split:
  print("%s" % (train_index[-1])) # train_index[-1] is the final index of the training part


4
6
8
10
12
