In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# on Drive can be reached through ordinary filesystem paths under that folder.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import glob

def standardize_data(path='/content/drive/Shareddrives/MATH456/FRED CSVs'):
    """Merge a folder of time-series CSVs into quarterly tables and save them.

    Every ``*.csv`` file in ``path`` must have a ``DATE`` column and keep its
    values in the second column. Each file is reduced to one value per
    calendar quarter (the mean, rounded to 2 decimals) using only rows dated
    2013-01-01 through 2022-07-01 inclusive. The per-file series are joined
    side by side on the quarter; quarters missing from a file are filled
    with 0.

    The series whose CSV header is ``NASDAQCOM`` is stored as ``NASDAQX``; a
    copy shifted one quarter earlier is appended as the last column,
    ``NASDAQY`` (i.e. the following quarter's value), and the final quarter,
    which has no following value, is dropped. Every other column is named
    after its file (file name without ``.csv``).

    Two files are written to the parent directory of ``path``:

    * ``Cleaned.csv`` -- the merged quarterly table.
    * ``NormalizeClean.csv`` -- the same table with ``NASDAQX`` and the
      columns at positions 4 and 7 replaced by their z-scores
      ``(x - mean) / std``.

    Args:
        path: Directory containing the raw CSV files.

    Returns:
        The string ``"Done"``.

    Raises:
        ValueError: If ``path`` contains no ``*.csv`` files.
        KeyError: If no file has ``NASDAQCOM`` as its value column.
        IndexError: If the merged table has fewer than 8 columns.
    """
    import os  # function-scope import: only needed for path handling here

    files = glob.glob(os.path.join(path, "*.csv"))
    if not files:
        raise ValueError(f"No CSV files found in {path!r}")

    quarterly_series = []
    for single_file in files:
        df = pd.read_csv(single_file)

        # The value column is whatever sits right after DATE.
        variable = df.columns[1]
        df['DATE'] = pd.to_datetime(df['DATE'])

        # Files come at different frequencies, so everything is reduced to
        # quarterly averages to put them on a common time grid.
        df['quarter'] = df['DATE'].dt.to_period('Q')

        # Drop rows whose value is the literal '.' (presumably the source's
        # missing-value marker -- confirm). Copy so the assignment below
        # writes to an independent frame rather than a slice of the original.
        df = df[df[variable] != '.'].copy()

        # Values are read as strings whenever a '.' was present; make them
        # numeric before averaging.
        df[variable] = pd.to_numeric(df[variable])

        # Keep only the window shared by all datasets. `inclusive="both"`
        # replaces the boolean form, which newer pandas no longer accepts.
        in_window = df['DATE'].between('2013-01-01', '2022-07-01',
                                       inclusive='both')
        df = df.loc[in_window]

        # Label the series now, by content, instead of relying on glob()
        # returning the NASDAQ file first (glob order is arbitrary).
        if variable == 'NASDAQCOM':
            label = 'NASDAQX'
        else:
            label = os.path.basename(single_file).split('.csv')[0].lstrip()

        quarterly = df.groupby('quarter')[variable].mean().round(2)
        quarterly_series.append(quarterly.rename(label))

    # Sort by quarter so the shift below really means "next quarter" even
    # when the files do not all cover the same quarters.
    combined = pd.concat(quarterly_series, axis=1).sort_index().fillna(0)

    if 'NASDAQX' not in combined.columns:
        raise KeyError('NASDAQCOM')

    # NASDAQ is used both as a predictor (this quarter, NASDAQX) and as the
    # response (next quarter, NASDAQY).
    nasdaq_next = combined['NASDAQX'].shift(periods=-1).rename('NASDAQY')

    # The last quarter has no "next" value, so it is dropped.
    complete_set = pd.concat([combined, nasdaq_next], axis=1).iloc[:-1].copy()

    # Write next to (not inside) the raw-data folder so reruns never pick up
    # the outputs as inputs. str.strip() removes a set of characters, not a
    # suffix, so the parent directory is computed explicitly.
    output_dir = os.path.dirname(os.path.normpath(path))
    complete_set.to_csv(os.path.join(output_dir, 'Cleaned.csv'))

    # The variables differ in magnitude, so selected columns are z-scored.
    # NOTE(review): positions 4 and 7 depend on the order glob() returned the
    # files in -- confirm which variables are meant and select them by name.
    columns_to_scale = list(dict.fromkeys([
        'NASDAQX',
        complete_set.columns[4],
        complete_set.columns[7],
    ]))
    for column in columns_to_scale:
        values = complete_set[column]
        complete_set[column] = (values - values.mean()) / values.std()

    complete_set.to_csv(os.path.join(output_dir, 'NormalizeClean.csv'))

    return "Done"

# Run the pipeline with the default input folder; the returned status string
# is what the notebook displays as this cell's result.
standardize_data()

  df = df.loc[df['DATE'].between('2013-01-01','2022-07-01', inclusive=True)]


'Done'