In [1]:
import pandas as pd
import numpy as np
from ipynb.fs.defs.add_features import add_features
import os
import pandas_datareader as pdr
import time
from datetime import datetime, timedelta

# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, normalize
import random 

In [2]:
def Rand(start, end, num): 
    """ Return: List of random integers
    
    Parameters:
    start(int): Minimum list point
    end(int): Maximum list point
    num(int): Length of list"""
    
    res = [] 
    for j in range(num): 
        res.append(random.randint(start, end)) 
    return res 

# Number of trees in random forest
n_estimators = Rand(5, 1000, 10)
n_estimators.append(10)

# Maximum number of levels in tree
max_depth = Rand(1, 10, 3)
max_depth.append(None)

# Minimum number of samples required to split a node
learning_rate = [.01, .1, .2]

# Minimum number of samples required to split a node
min_samples_split = Rand(2, 10, 3)

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'learning_rate': learning_rate,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
              }

print(random_grid)

{'n_estimators': [632, 953, 591, 730, 441, 588, 179, 13, 783, 39, 10], 'max_depth': [5, 6, 5, None], 'learning_rate': [0.01, 0.1, 0.2], 'min_samples_split': [5, 5, 4], 'min_samples_leaf': [1, 2, 3, 4]}


In [3]:
def split_train(df):
    """Map training data to X and Y variables
    
    Parameters:
    df(dataframe): stock historical data
    
    Returns mapped training data"""
    
    # Map data to training features and target variable
    train_features = df.drop('close_diff', axis=1).iloc[:-1]
    X_train = StandardScaler().fit_transform(train_features)
    Y_train = df['close_diff'].iloc[1:]
    
    return X_train, Y_train

In [4]:
def gbr_hyperparams(random_grid, X_train, Y_train):
    """Perform a grid search based on the inputted
    parameters.
    
    Parameters:
    random_grid(dict): Dictionary of parameters for the grid
    to iterate over
    X_train(dataframe): features for predicting target variable
    Y_train(array): Target variables
    
    Returns ideal parameters for Gradient Boosting Regressor"""
    
    # Use the random grid to search for best hyperparameters
    gbr = GradientBoostingRegressor()

    # Random search of parameters, using 2 fold cross validation, 
    # Search across all combinations, and use all available cores
    gbr_random = GridSearchCV(estimator=gbr, param_grid=random_grid, 
                             cv=2, verbose=0, n_jobs=-1)

    # Fit the random search model
    gbr_random.fit(X_train, Y_train)

    params = gbr_random.best_params_
    
    return params

In [5]:
def gbr_forecast(df, random_grid):
    """Returns stock price forecast for next day
    
    Parameters:
    df(dataframe): stock historical data
    random_grid(dict): Dictionary of parameters for the grid
    to iterate over"""

    # Fit best parameters to GBR and predict stock price change
    X_train, Y_train = split_train(df)
    params = gbr_hyperparams(random_grid, X_train, Y_train)
    
    gbr = GradientBoostingRegressor(**params)

    gbr.fit(X_train, Y_train)
    
    # Create test set from last day in dataframe
    X_test = df.drop('close_diff', axis=1).iloc[-1]
    X_test = X_test.values.reshape(1, -1)
    Y_pred = gbr.predict(X_test)
    Y_pred = np.array(Y_pred).ravel()
    
    return Y_pred

In [6]:
def roi_calc(stock_list, api, random_grid):
    """Iterates through stock list and calculates ROI based
    on the ARIMA forecast.
    
    Parameters:
    stock_list(list): list of stocks to iterate
    api(str): api for Tiingo
    random_grid(dict): Dictionary of parameters for the grid
    to iterate over
    
    Returns list of top three stocks, forecast, and predicted ROI
    """

    start_time = time.time()
    roi = {}
    forecasts = {}
    
    for x in stock_list:
        # Create dataframe for each ticker
        df = pdr.get_data_tiingo(x, api_key=api)
        df = df.tail(1000)
        df.reset_index(inplace=True)
        df.index = df['date']
        stock_close = df['close']
        
        # Drop unneeded columns
        df.drop(['symbol', 'date', 'adjClose', 'adjHigh', 'adjLow', 
         'adjOpen', 'adjVolume', 'splitFactor'], axis=1, inplace=True)
        
        # Create new variable for stationary transformation of closing price
        df['close_diff'] = df['close'] - df['close'].shift()
        
        # Get the forecast for each ticker
        Y_pred = gbr_forecast(df, random_grid)
        
        # Calculate ROI
        last_close = df['close'][-1]
        forecast = Y_pred + last_close
        calc_roi = ((forecast - last_close) / last_close) * 100
        
        # Append to forecast and ROI dictionaries
        roi.update({x:calc_roi})
        forecasts.update({x:forecast})
        
    # Print ROI rates for top 3
    best_roi = sorted(roi, key=roi.get, reverse=True)[:3]  
    print('Top 3 Forecasted ROI:')
    
    for x in best_roi:
        print('{}: {}% Forecasted gain'.format(x, roi[x]))
        print('Forecasted Price: {}'.format(x, forecasts[x]))
    
    print("\n--- %s seconds ---" % (time.time() - start_time))  
    return roi, forecasts