<a href="https://colab.research.google.com/github/atlantiquesun/Stock_ML/blob/main/Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn import preprocessing
import datetime
import numpy as np
import os

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
os.listdir("/content/drive/MyDrive/StockML /Data/")

['papers.csv',
 'companyInfo',
 'starHistory',
 '.ipynb_checkpoints',
 'forkHistory',
 'issueHistory',
 'commitHistory',
 'issueClosedHistory',
 'pullRequestClosedHistory',
 'pullRequestMergedHistory',
 'pullRequestHistory',
 'processedData',
 'trainData',
 'financialData']

In [5]:
#calculate the daily cumulative for each company
def calculate_cumulative(start=0, end=82):
  '''
  start: start company (index 0 to 81)
  end: end company (the last company to be processed)
  last end: 3 (elastic is not processed)
  '''
  companies = pd.read_csv("/content/drive/MyDrive/StockML /Data/companyInfo/companies_final.csv")
  dataCategories = []
  for name in os.listdir("/content/drive/MyDrive/StockML /Data/"):
    if "History" in name: #if is a part of the raw data
      dataCategories.append(name)

  for i in range(companies.shape[0]):
    if (i < start):continue
    company = companies.at[i, 'githubUser']
    normalizedName = companies.at[i, "shortName"]
    print(company, normalizedName)
    cumulativeData = {}
    normalizedCmltData = {} #normalized cumulative data (each time series is normalized)
    cumulativeData["date"] = list(pd.date_range(start="1/01/1999", end='9/01/2021').tz_localize(None)) #need to check the timezone
    normalizedCmltData["date"] = list(pd.date_range(start="1/01/1999", end='9/01/2021').tz_localize(None))
    #calculate the cumulative data for each category
    for category in dataCategories:
      print(category[:-7])
      df = pd.read_csv("/content/drive/MyDrive/StockML /Data/"+category+"/"+company+".csv")
      df["sum"] = df.sum(axis=1) #sum over the repositories
      cumulativeData[category[:-7]] = list(df["sum"])
      #normalize data
      sumDf = pd.DataFrame(df["sum"])
      nSumDf = (sumDf-sumDf.min())/(sumDf.max()-sumDf.min())
      normalizedCmltData[category[:-7]] = list(nSumDf["sum"])

    cumulativeData = pd.DataFrame(cumulativeData)
    cumulativeData.to_csv("/content/drive/MyDrive/StockML /Data/processedData/cumulativeData/"+company+".csv")
    normalizedCmltData = pd.DataFrame(normalizedCmltData)
    normalizedCmltData.to_csv("/content/drive/MyDrive/StockML /Data/processedData/normalizedCumulativeData/"+company+".csv")
    if(i==end): break

# Prepare Train Data

In [25]:
#1999/01/01 is a friday, so actually start from 1999/01/02 to 2021/04/30
#currently a problem: the largest value in an entry (e.g. [1999-01-02, 'commit']) is not 1 because of the summation over a week

def prepare_company_train_data(start=0, end=82, lag=1):
  '''
  cluster into weeks, concatenate github and stock data

  lag: number of weeks the github data is lagged behind the stock data
  '''
  df_complete = None
  companies = pd.read_csv("/content/drive/MyDrive/StockML /Data/companyInfo/companies_final.csv")

  for i in range(companies.shape[0]):
    if (i < start): continue
    company = companies.at[i, "githubUser"]
    print(company)
    ticker = companies.at[i, "symbol"].upper()
    raw1 = pd.read_csv("/content/drive/MyDrive/StockML /Data/processedData/cumulativeData/"+company+".csv")
    raw2 = pd.read_csv("/content/drive/MyDrive/StockML /Data/processedData/normalizedCumulativeData/"+company+".csv")

    start_date = pd.to_datetime("1999-01-02").tz_localize(None)
    end_date = pd.to_datetime("2021-04-30").tz_localize(None)
    raw1['date'] = pd.to_datetime(raw1['date'])
    raw2['date'] = pd.to_datetime(raw2['date'])
    mask = (raw1['date'] >= start_date) & (raw1['date'] <= end_date) # ensure full weeks
    raw1 = raw1.loc[mask].reset_index()
    mask = (raw2['date'] >= start_date) & (raw2['date'] <= end_date)
    raw2 = raw2.loc[mask].reset_index()

    weekStarts = raw1[raw1.index%7==0].reset_index()
    weekStarts = weekStarts['date'] #the dates on which a week starts (a Saturday)
    weekEnds = raw1[raw1.index%7==6].reset_index()
    weekEnds = weekEnds['date'] #the dates on which a week ends (a Friday)

    raw1 = raw1.drop(["index", "Unnamed: 0"], axis = 1)
    raw2 = raw2.drop(["index", "Unnamed: 0"], axis = 1)
    raw2 = raw2.rename(columns = lambda s: 'n'+s)
    raw = pd.concat([raw1, raw2], axis=1)

    df = raw.groupby(np.floor(raw.index/7)).sum() #group into weeks
    df['weekStarts'] = weekStarts
    df['weekEnds'] = weekEnds

    stock_data = pd.read_csv("/content/drive/MyDrive/StockML /Data/financialData/"+ticker+".csv")
    stock_data['Date'] = pd.to_datetime(stock_data['Date'])

    stock_data = stock_data[stock_data['Date'].isin(list(weekEnds))].reset_index() #select a week's close price
    df = df[df['weekEnds'].isin(list(stock_data['Date']))].reset_index() #in some weeks there are no stock trade, or the stock was not yet on the market
    
    closePrices = list(stock_data['Close'])
    weeklyReturn = [closePrices[0]]
    for j in range(1, len(closePrices)):
      wr = (closePrices[j]/closePrices[j-1])-1
      weeklyReturn.append(wr)

    df['weeklyReturn'] = pd.Series(weeklyReturn)
    df['weeklyReturn'] = df['weeklyReturn'].shift(-lag)
    print(df.shape[0])

    df.to_csv("/content/drive/MyDrive/StockML /Data/trainData/companies/"+company+".csv")
    df['ticker'] = ticker
    if(df_complete is None):
      df_complete = df
    else:
      df_complete = pd.concat([df_complete, df])

    if (i>=end):
      break

  df_complete.to_csv("/content/drive/MyDrive/StockML /Data/trainData/data.csv") 
  return df_complete
  

In [26]:
#calculate_cumulative(start=0, end=8)
df = prepare_company_train_data(start=0, end=8, lag=1)

amzn
1125
slackapi
93
cisco
1125
elastic
129
netflix
955
pinterest
101
shutterstock
431
intuit
1125
okta
205


In [27]:
display(df)

Unnamed: 0,index,star,fork,issue,commit,issueClosed,pullRequestClosed,pullRequestMerged,pullRequest,nstar,nfork,nissue,ncommit,nissueClosed,npullRequestClosed,npullRequestMerged,npullRequest,weekStarts,weekEnds,weeklyReturn,ticker
0,0.0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1999-01-02,1999-01-08,-0.124025,AMZN
1,1.0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1999-01-09,1999-01-15,-0.123776,AMZN
2,2.0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1999-01-16,1999-01-22,-0.049289,AMZN
3,3.0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1999-01-23,1999-01-29,-0.009086,AMZN
4,4.0,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1999-01-30,1999-02-05,-0.098166,AMZN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,1159.0,32,12,19,275,18,123,84,117,1.684211,0.250000,0.246753,0.838415,0.191489,1.782609,1.909091,2.294118,2021-03-20,2021-03-26,0.149729,OKTA
201,1161.0,27,25,23,130,22,82,40,77,1.421053,0.520833,0.298701,0.396341,0.234043,1.188406,0.909091,1.509804,2021-04-03,2021-04-09,0.105298,OKTA
202,1162.0,24,11,18,106,12,68,35,58,1.263158,0.229167,0.233766,0.323171,0.127660,0.985507,0.795455,1.137255,2021-04-10,2021-04-16,0.034521,OKTA
203,1163.0,27,25,24,101,19,77,36,115,1.421053,0.520833,0.311688,0.307927,0.202128,1.115942,0.818182,2.254902,2021-04-17,2021-04-23,-0.034372,OKTA


# Model

In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import Ridge

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV,RandomizedSearchCV

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

def train_linear_regression(X_train,y_train):

    lr_regressor = LinearRegression()
    model = lr_regressor.fit(X_train, y_train)
    return model


def train_lasso(X_train, y_train):
    # lasso_regressor = Lasso()
    # model = lasso_regressor.fit(X_train, y_train)

    lasso = Lasso()
    # scoring_method = 'r2'
    # scoring_method = 'explained_variance'
    scoring_method = 'neg_mean_absolute_error'
    # scoring_method = 'neg_mean_squared_error'
    #scoring_method = 'neg_mean_squared_log_error'
    parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
    # my_cv_lasso = TimeSeriesSplit(n_splits=3).split(X_train_advanced)
    lasso_regressor = GridSearchCV(lasso, parameters, scoring=scoring_method, cv=3)
    lasso_regressor.fit(X_train, y_train)

    model = lasso_regressor.best_estimator_
    return model

def train_ridge(X_train, y_train):
    # lasso_regressor = Lasso()
    # model = lasso_regressor.fit(X_train, y_train)

    ridge = Ridge()
    # scoring_method = 'r2'
    # scoring_method = 'explained_variance'
    scoring_method = 'neg_mean_absolute_error'
    # scoring_method = 'neg_mean_squared_error'
    #scoring_method = 'neg_mean_squared_log_error'
    parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
    # my_cv_lasso = TimeSeriesSplit(n_splits=3).split(X_train_advanced)
    ridge_regressor = GridSearchCV(ridge, parameters, scoring=scoring_method, cv=3)
    ridge_regressor.fit(X_train, y_train)

    model = ridge_regressor.best_estimator_
    return model

def train_random_forest(X_train, y_train):
    '''
    random_grid = {'bootstrap': [True, False],
                   'max_depth': [10, 20, 40, 80, 100, None],
                   'max_features': ['auto', 'sqrt'],
                   'min_samples_leaf': [1, 2, 5, 10],
                   'min_samples_split': [2, 5, 10],
                   'n_estimators': [50, 200, 400, 600, 800, 1000, 1500]}
    # my_cv_rf = TimeSeriesSplit(n_splits=5).split(X_train_rf)
    rf = RandomForestRegressor(random_state=42)
    randomforest_regressor = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                                cv=3, n_jobs=-1, scoring='neg_mean_absolute_error', verbose=0)
    randomforest_regressor.fit(X_train, y_train)
    model = randomforest_regressor.best_estimator_
    '''
    randomforest_regressor = RandomForestRegressor(n_estimators = 500, max_features=6)
    #randomforest_regressor = RandomForestRegressor(random_state = 42,n_estimators = 300)

    model = randomforest_regressor.fit(X_train, y_train)
    
    return model


def train_svm(X_train, y_train):
    svr = SVR(kernel = 'rbf')

    param_grid_svm = {'C':[0.001, 0.01, 0.1, 1, 10],'gamma': [1e-7, 1e-4,0.001,0.1]}
    #param_grid_svm = {'kernel': ('linear', 'rbf','poly'), 'C':[0.001, 0.01, 0.1, 1, 10],'gamma': [1e-7, 1e-4,0.001,0.1],'epsilon':[0.1,0.2,0.5,0.3]}

    # scoring_method = 'r2'
    # scoring_method = 'explained_variance'
    scoring_method = 'neg_mean_absolute_error'
    # scoring_method = 'neg_mean_squared_error'
    #scoring_method = 'neg_mean_squared_log_error'
    
    svm_regressor = GridSearchCV(estimator=svr, param_grid=param_grid_svm,
                                       cv=3, n_jobs=-1, scoring=scoring_method, verbose=0)

    svm_regressor.fit(X_train, y_train)
    model = svm_regressor.best_estimator_

    return model


def train_gbm(X_train, y_train):
    '''gbm = GradientBoostingRegressor(random_state=42)
    # model = gbm.fit(X_train, y_train)
    param_grid_gbm = {'learning_rate': [0.1, 0.05, 0.01, 0.001], 'n_estimators': [100, 250, 500, 1000]}
    # scoring_method = 'r2'
    # scoring_method = 'explained_variance'
    scoring_method = 'neg_mean_absolute_error'
    # scoring_method = 'neg_mean_squared_error'
    #scoring_method = 'neg_mean_squared_log_error'
    gbm_regressor = RandomizedSearchCV(estimator=gbm, param_distributions=param_grid_gbm,
                                       cv=3, n_jobs=-1, scoring=scoring_method, verbose=0)
    gbm_regressor.fit(X_train, y_train)
    model = gbm_regressor.best_estimator_'''
    
    gbm_regressor = GradientBoostingRegressor()
    model = gbm_regressor.fit(X_train, y_train)

    return model

def train_ada(X_train, y_train):
    ada = AdaBoostRegressor(random_state=1)

    # model = ada.fit(X_train, y_train)

    param_grid_ada = {'n_estimators': [20, 50, 100],
                      'learning_rate': [0.01, 0.05, 0.1, 0.3, 1],
                      'loss' : ['linear', 'square', 'exponential']
                     
                     }
    # scoring_method = 'r2'
    # scoring_method = 'explained_variance'
    scoring_method = 'neg_mean_absolute_error'
    # scoring_method = 'neg_mean_squared_error'
    #scoring_method = 'neg_mean_squared_log_error'

    ada_regressor = GridSearchCV(estimator=ada, param_grid=param_grid_ada, cv=3, n_jobs=-1, scoring=scoring_method, verbose=0)

    ada_regressor.fit(X_train, y_train)
    model = ada_regressor.best_estimator_
    '''
    ada_regressor = AdaBoostRegressor()
    model = ada_regressor.fit(X_train, y_train)
    '''
    return model

def train_lstm(X_train, y_train, n_features=1):
    
    # Initialising the RNN
    regressor = Sequential()
    # Adding the first LSTM layer and some Dropout regularisation
    regressor.add(LSTM(units = 80, return_sequences = True, input_shape = (X_train.shape[1], n_features)))
    regressor.add(Dropout(0.2))

    # Adding a second LSTM layer and some Dropout regularisation
    regressor.add(LSTM(units = 40, return_sequences = True))
    regressor.add(Dropout(0.2))

    # Adding a third LSTM layer and some Dropout regularisation
    regressor.add(LSTM(units = 20, return_sequences = False))
    regressor.add(Dropout(0.2))

    # Adding a fourth LSTM layer and some Dropout regularisation
    #regressor.add(LSTM(units = 20,return_sequences = False))
    #regressor.add(Dropout(0.2))

    # Adding the output layer
    regressor.add(Dense(units = 1, activation='linear'))
    
    #scoring_method = 'neg_mean_absolute_error'
    # scoring_method = 'neg_mean_squared_error'
    #scoring_method = 'neg_mean_squared_log_error'
    # Compiling the RNN
    regressor.compile(optimizer = 'adam', loss = 'mean_absolute_error')

    # Fitting the RNN to the Training set
    regressor.fit(X_train, y_train, epochs = 4, batch_size = 64)
    print(regressor.summary())
    return regressor

# Training
X_train = (samples * timesteps, features, 1) for lstm
same as the original Big Data paper

In [51]:
def prepare_train_data(df, features_column, start_date, train_windows):
  start_date = pd.to_datetime(start_date)
  end_date = start_date + datetime.timedelta(weeks = train_windows)
  
  df["weekStarts"] = pd.to_datetime(df["weekStarts"])
  df["weekEnds"] = pd.to_datetime(df["weekEnds"])
  mask = (df["weekStarts"] >= start_date) & (df["weekEnds"] < end_date)
  train = df.loc[mask]
  X_train=train[features_column]
  y_train=train["weeklyReturn"]
  return (X_train, y_train)

def prepare_test_data(df, features_column, train_start_date, train_windows, test_windows):
  start_date = pd.to_datetime(train_start_date)+datetime.timedelta(weeks = train_windows)
  end_date = start_date+datetime.timedelta(weeks = test_windows)

  df["weekStarts"] = pd.to_datetime(df["weekStarts"])
  df["weekEnds"] = pd.to_datetime(df["weekEnds"])
  mask = (df["weekStarts"] >= start_date) & (df["weekEnds"] < end_date)
  test = df.loc[mask]
  X_test=test[features_column]
  y_test=test["weeklyReturn"]
  return (X_test, y_test)

def prepare_trade_data(df, features_column, trade_date):
  #not the actual start date of the week been predicted, but the start date of the week before the week been predicted, since GitHub data is lagged for one week
  
  trade = df.loc[df["weekStarts"].isin([trade_date])]
  #print(trade.shape)
  X_trade = trade[features_column]
  y_trade = trade['weeklyReturn']
  trade_tic = trade['ticker']
  return (X_trade, y_trade, trade_tic)

In [46]:
def evaluate_model(model, X_test, y_test):
    from sklearn.metrics import mean_squared_error
    #from sklearn.metrics import mean_squared_log_error

    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import explained_variance_score
    from sklearn.metrics import r2_score
    y_predict = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_predict)
    

    mse = mean_squared_error(y_test, y_predict)
    #msle = mean_squared_log_error(y_test, y_predict)

    explained_variance = explained_variance_score(y_test, y_predict)
    r2 = r2_score(y_test, y_predict)

    return mae

In [47]:
features_column = ['nstar', 'nfork', 'nissue', 'ncommit', 'nissueClosed', 'npullRequest', 'npullRequestClosed', 'npullRequestMerged']
#train_start_date = "2012-01-07" #first Saturday in 2012
train_windows = 48
test_windows = 12

def date_range(start_date, end_date):
  dates = []
  for n in range(0, int((end_date - start_date).days) + 1, 7):
    dates.append(start_date + datetime.timedelta(n))
  return dates

start_date = datetime.date(2012, 1, 7) #Saturday
end_date = datetime.date(2020, 2, 22) #Saturday
train_start_dates = date_range(start_date, end_date)
trade_dates = [x+datetime.timedelta(weeks=train_windows+test_windows) for x in train_start_dates]

print(len(train_start_dates))
print(trade_dates[-1])

425
2021-04-17


In [None]:
evaluation_record = []

for i in range(len(trade_dates)):
  train_start_date = train_start_dates[i]
  trade_date = trade_dates[i]

  X_train, y_train = prepare_train_data(df, features_column, train_start_date, train_windows)
  X_train_lstm = np.reshape(X_train.values, (X_train.values.shape[0], X_train.values.shape[1], 1))

  X_test, y_test = prepare_test_data(df, features_column, train_start_date, train_windows, test_windows)
  X_test_lstm = np.reshape(X_test.values, (X_test.values.shape[0], X_test.values.shape[1], 1))

  print("Trading for week", trade_date+datetime.timedelta(weeks=1), "to", trade_date+datetime.timedelta(13))
  X_trade, y_trade, trade_tic = prepare_trade_data(df, features_column, trade_date)
  X_trade_lstm = np.reshape(X_trade.values, (X_trade.values.shape[0], X_trade.values.shape[1], 1))

   # Train
  lr_model = train_linear_regression(X_train, y_train)
  lasso_model = train_lasso(X_train, y_train)
  ridge_model = train_ridge(X_train, y_train)

  rf_model = train_random_forest(X_train, y_train)
  svm_model = train_svm(X_train,y_train)
  
  #gbm_model = train_gbm(X_train, y_train)
  #ada_model = train_ada(X_train, y_train)
  lstm_model = train_lstm(X_train_lstm, y_train)

    # Validation 
  lr_eval = evaluate_model(lr_model, X_test, y_test)
  lasso_eval = evaluate_model(lasso_model, X_test, y_test)
  ridge_eval = evaluate_model(ridge_model, X_test, y_test)
  
  rf_eval = evaluate_model(rf_model, X_test, y_test)
  
  svm_eval = evaluate_model(svm_model, X_test, y_test)

  #gbm_eval = evaluate_model(gbm_model, X_test, y_test)
  #ada_eval = evaluate_model(ada_model, X_test, y_test)
  
  lstm_eval = evaluate_model(lstm_model, X_test_lstm, y_test)

        
  # Trade
  y_trade_lr = lr_model.predict(X_trade)
  y_trade_lasso = lasso_model.predict(X_trade)
  y_trade_ridge = ridge_model.predict(X_trade)
  
  y_trade_rf = rf_model.predict(X_trade)
  
  y_trade_svm = svm_model.predict(X_trade)

  #y_trade_gbm = gbm_model.predict(X_trade)
  #y_trade_ada = ada_model.predict(X_trade)
  y_trade_lstm = lstm_model.predict(X_trade_lstm).flatten()

  eval_data = [[lr_eval, y_trade_lr], 
                    [lasso_eval, y_trade_lasso],
                     [ridge_eval, y_trade_ridge],
                     [rf_eval, y_trade_rf], 
                     [svm_eval,y_trade_svm],
               #      [gbm_eval,y_trade_gbm],                     
               #      [ada_eval,y_trade_ada],
                    [lstm_eval,y_trade_lstm]

                    ]

  eval_table = pd.DataFrame(eval_data, columns=['model_eval', 'model_predict_return'],
                                  index=['lr', 'lasso','ridge','rf', 'svm','lstm'])  
  
  evaluation_record.append((trade_date+datetime.timedelta(weeks=1), eval_table))



# Prepare Train Data 
X_train = (samples, timesteps, features) for lstm

In [60]:
def prepare_train_data_2(df, start_date="2015-01-02", train_size=48, window_size=20,
                       features_column = ['nstar', 'nfork', 'nissue', 'ncommit', 'nissueClosed', 'npullRequest', 'npullRequestClosed', 'npullRequestMerged']):
  '''
  input
  start_date: always a Saturday (the start day of a "stock" week)
  train_size: number of weeks included in the training set
  window_size: number of weeks in each training sample

  return
  X_train: (samples, timesteps, features)
  '''
  
  start_date = pd.to_datetime(start_date)
  end_date = start_date + datetime.timedelta(weeks = train_size)
  print(start_date, end_date)
  
  companies = pd.read_csv("/content/drive/MyDrive/StockML /Data/companyInfo/companies_final.csv")
  X_train = []
  y_train = []
  for i in range(companies.shape[0]):
    if (i<start_company):
      continue
    if(i>end_company):
      break

    company = companies.at[i, "githubUser"]
    df = pd.read_csv("/content/drive/MyDrive/StockML /Data/trainData/companies/"+company+".csv")
    df["weekStarts"] = pd.to_datetime(df["weekStarts"])
    df["weekEnds"] = pd.to_datetime(df["weekEnds"])
    mask = (df["weekStarts"] >= start_date) & (df["weekEnds"] < end_date)
    df_temp = df.loc[mask].reset_index()
    print(company, df_temp.shape)
    df_x = df_temp[features_column]
    df_y = df_temp["weeklyReturn"]

    if (df_temp.shape[0]<window_size): 
      continue
    for j in range(df_temp.shape[0]-window_size):
      X_train.append(df_x.values[j:j+window_size, :])
      y_train.append(df_y.values[j:j+window_size])
  
  X_train = np.asarray(X_train)
  y_train = np.asarray(y_train)
  y_train = np.reshape(y_train[:, -1], (y_train.shape[0], 1))
  return (X_train, y_train)
    
    
  


    

  

In [61]:
def prepare_test_data_2(start_company=0, end_company=82, train_start_date="2021-04-30", train_size=48, test_size=24, window_size=20,
                       features_column = ['nstar', 'nfork', 'nissue', 'ncommit', 'nissueClosed', 'npullRequest', 'npullRequestClosed', 'npullRequestMerged']):
  '''
  input 
  test_size: number of weeks included in the testing set

  return
  X_test: (samples, timesteps, features)
  y_test: (samples, timesteps) 
  '''
  
  start_date = pd.to_datetime(train_start_date)+datetime.timedelta(weeks = train_size)
  end_date = start_date+datetime.timedelta(weeks = test_size)
  
  companies = pd.read_csv("/content/drive/MyDrive/StockML /Data/companyInfo/companies_final.csv")
  X_test = []
  y_test = []
  for i in range(companies.shape[0]):
    if (i<start_company):
      continue
    if(i>end_company):
      break

    company = companies.at[i, "githubUser"]
    df = pd.read_csv("/content/drive/MyDrive/StockML /Data/trainData/companies/"+company+".csv")
    df["weekStarts"] = pd.to_datetime(df["weekStarts"])
    df["weekEnds"] = pd.to_datetime(df["weekEnds"])

    mask = (df["weekStarts"] >= start_date) & (df["weekEnds"]< end_date)
    df_temp = df.loc[mask].reset_index()
    df_x = df_temp[features_column]
    df_y = df_temp["weeklyReturn"]

    if (df_temp.shape[0]<window_size): 
      continue
    for j in range(df_temp.shape[0]-window_size):
      X_test.append(df_x.values[j:j+window_size, :])
      y_test.append(df_y.values[j:j+window_size])
  
  X_test = np.asarray(X_test)
  y_test = np.asarray(y_test)
  y_test = np.reshape(y_test[:, -1], (y_test.shape[0], 1)) #only select the last data
  return (X_test, y_test)
    

In [64]:
train_start_date = "2015-02-28"
train_size = 48 #number of weeks for training
test_size = 24 #number of weeks for testing
window_size = 20 #number of weeks in a (training/testing) sample
features_column = ['nstar', 'nfork', 'nissue', 'ncommit', 'nissueClosed', 'npullRequest', 'npullRequestClosed', 'npullRequestMerged']

X_train, y_train = prepare_train_data(start_company=0, end_company=8, train_size=train_size, window_size=window_size, start_date=train_start_date)
print("train:", X_train.shape, y_train.shape)
X_test, y_test = prepare_test_data(start_company=0, end_company=8, train_start_date=train_start_date, test_size=test_size, window_size=window_size)
print("test:", X_test.shape, y_test.shape)



TypeError: ignored

In [None]:
lstm_model = train_lstm(X_train, y_train, n_features = len(features_column))
lstm_eval = evaluate_model(lstm_model, X_test, y_test)

## Simple Visualisation

In [None]:
import requests
company = "amzn"
response = requests.get("https://api.github.com/users/"+company)
data = response.json()
created_at = data['created_at']

In [None]:
from pandas_datareader import data as pdr
from datetime import date

#natural date range (creation date of the company's github page --- 2021-04-30)
start_date = pd.to_datetime(created_at[:10]).tz_localize(None)
end_date = pd.to_datetime("2021-04-30").tz_localize(None)
#for amzn, the first week start on 2014-09-02, the last week end on 2021-04-30
start_date = pd.to_datetime("2016-09-02").tz_localize(None)

#select the stock data within the date range
stockPrice = pd.read_csv("/content/drive/MyDrive/StockML /Data/financialData/AMZN.csv")
stockPrice['Date'] = pd.to_datetime(stockPrice['Date'])
mask = (pd.to_datetime(stockPrice['Date'])>= start_date) & (pd.to_datetime(stockPrice['Date']) <= end_date)
stockDataSelected = stockPrice.loc[mask].reset_index()
display(stockDataSelected)

#select the cumulative data within the date range
cumulativeData = pd.read_csv("/content/drive/MyDrive/StockML /Data/processedData/normalizedCumulativeData/amzn.csv")
cumulativeData["date"] = pd.to_datetime(cumulativeData["date"])
mask = (cumulativeData['date'] >= start_date) & (cumulativeData['date'] <= end_date)
cumulativeDataSelected = cumulativeData.loc[mask].reset_index()
display(cumulativeDataSelected)

In [None]:
import scipy.stats
import numpy as np

x = np.array(list(stockDataSelected['Open']))
y = []

#remove the dates not in x
stockDates = list(stockDataSelected['Date'])
for i in range(cumulativeDataSelected.shape[0]):
  if(cumulativeDataSelected.at[i, 'date'] in stockDates):
    y.append(cumulativeDataSelected.at[i, 'issue'])
y = np.array(y)

print(x.shape, y.shape)

print("Pearson:", scipy.stats.pearsonr(x, y))
print("Spearman:", scipy.stats.spearmanr(x, y))
print("Kendall:", scipy.stats.kendalltau(x, y))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
#sns.lineplot(x = "Date", y = "Close", data = stockDataSelected)
#sns.lineplot(x = "date", y = "star", data = cumulativeDataSelected)

fig,ax =  plt.subplots( 2, 2, figsize = ( 18, 8))

sns.lineplot(x = "Date", y = "Close", color = "r", data = stockDataSelected, ax = ax[0][0])
ax[0][0].tick_params(labelrotation = 25)

sns.lineplot(x = "date", y = "star", color = "g", data = cumulativeDataSelected, ax = ax[0][1])
ax[0][1].tick_params(labelrotation = 25)

sns.lineplot(x = "date", y = "commit", color = "y", data = cumulativeDataSelected, ax = ax[1][0])
ax[1][0].tick_params(labelrotation = 25)

sns.lineplot(x = "date", y = "issue", color = "b", data = cumulativeDataSelected, ax = ax[1][1])
ax[1][1].tick_params(labelrotation = 25)
