In [5]:
from sklearn import  linear_model
import matplotlib.pyplot as pl

import yfinance as yf
import pandas as pd
import numpy as np
from fbprophet import Prophet
from pathlib import Path
import pickle

#!pip install yfinance

from datetime import date

# CSV file that receives the per-stock trend coefficients
stock_coefficient_file_name = './../resources/Stock_Coefficients.csv'

# Price-history window: from the start of 2018 up to today,
# both expressed as "YYYY-MM-DD" strings
start = '2018-01-01'
end = date.today().isoformat()


In [6]:
# function1 to get stock list from text file
def get_list(filename):
    """Read the stock symbols stored in ``filename``.

    The file is parsed with ``pd.read_csv`` without a header row, the first
    column is upper-cased, and the parsed table is handed back as a NumPy
    array (one row per line of the file).
    """
    symbol_frame = pd.read_csv(filename, header=None)
    # normalise the symbol column to upper case
    symbol_frame[0] = symbol_frame[0].str.upper()
    return symbol_frame.values

In [7]:

from pandas_datareader._utils import RemoteDataError

# Considering data from 2018 to 2019 for latest trend
# start = '2018-01-01'
# end = '2022-02-01' #date.today().strftime("%Y-%m-%d")

# function2: get historical data for a specific stock, using symbol as the parameter
def load_stock_data(symbol):
    """Download the closing prices of ``symbol`` and rescale them to a base of 100.

    Prices are fetched with ``yf.download`` for the module-level ``start`` /
    ``end`` window.

    Parameters
    ----------
    symbol : str
        Ticker symbol handed to ``yf.download``.

    Returns
    -------
    pandas.DataFrame or float
        A one-column (``Close``) frame. When the first close is above 10 the
        column is divided by that first close and multiplied by 100; otherwise
        the raw closing prices are returned unchanged.
        ``float('NaN')`` is returned when the download raises
        ``RemoteDataError`` / ``KeyError`` or yields no rows.
    """
    try:
        df = yf.download(symbol, start, end)
    except RemoteDataError:
        print("No search result : '" + symbol + "'")
        return float('NaN')
    except KeyError:
        print("Date range not supported : '" + symbol + "'")
        return float('NaN')

    # A download can come back with no rows instead of raising (presumably the
    # usual yfinance behaviour for unknown symbols - confirm against the
    # installed version). Without this guard the iloc[0] lookup below raises
    # IndexError instead of producing the NaN sentinel callers check for.
    if df.empty:
        print("No search result : '" + symbol + "'")
        return float('NaN')

    close = df[['Close']]

    # Normalize by the close price of the earliest day so different stocks are
    # comparable: first day == 100.
    base = close.iloc[0]['Close']
    # Only stocks whose first close is above 10 are normalized; the others are
    # returned with their raw prices.
    # NOTE(review): the original comment spoke of neglecting stocks below
    # 25 USD - confirm whether low-priced stocks should be excluded instead.
    if base > 10:
        close = close.assign(Close=close['Close'] / base * 100)
    return close



from sklearn.model_selection import train_test_split

# function3: build linear regression model for a specific stock
# parameter
# show_statistics: indicate if the statistics are printed
# show_dots: indicate if dots data needed
# show_plot: indicate if the plot is shown

#The coefficient estimates for Ordinary Least Squares rely on the independence of the features.
#When features are correlated and the columns of the design matrix  have an approximately linear dependence,
#the design matrix becomes close to singular and as a result, the least-squares estimate becomes highly sensitive to random errors in the observed target,
#producing a large variance.

def build_linear_regression(symbol, show_statistics = True, show_dots = True, show_plot = True,
                            random_state = None):
    """Fit a straight line to the closing prices of ``symbol`` and return its slope.

    The closing prices come from ``load_stock_data``. Dates are converted to
    "days since 2019-01-01, divided by 100" and used as the single feature of
    an ordinary least squares fit on a random train split; the remaining test
    split is used for the printed statistics and the plot.

    Parameters
    ----------
    symbol : str
        Ticker symbol to model.
    show_statistics : bool
        Print the coefficient, mean squared error and score on the test split.
    show_dots : bool
        When plotting, also draw the test samples as dots.
    show_plot : bool
        Draw the fitted line on the current matplotlib figure.
    random_state : int or None
        Forwarded to ``train_test_split``. ``None`` (the default) keeps the
        unseeded shuffle; pass an integer for a reproducible split.

    Returns
    -------
    float
        The fitted slope, or ``float('NaN')`` when no usable price data is
        available.
    """
    close = load_stock_data(symbol)
    # load_stock_data signals failure with a NaN float instead of a frame
    if not isinstance(close, pd.DataFrame):
        return float('NaN')

    # LinearRegression rejects NaN targets, and train_test_split needs at least
    # two rows to produce a non-empty train and test part.
    close = close.dropna()
    if len(close) < 2:
        print("Not enough data : '" + symbol + "'")
        return float('NaN')

    # Turn the datetime index into a plain number:
    # days since 2019-01-01, divided by 100
    close.index = (close.index - pd.to_datetime('2019-01-01')).days / 100
    close = close.reset_index()
    train, test = train_test_split(close, random_state=random_state)

    # Everything except 'Close' (i.e. the numeric date column) is the feature
    train_x = train.drop('Close', axis=1)
    train_y = train[['Close']]
    test_x = test.drop('Close', axis=1)
    test_y = test[['Close']]

    regr = linear_model.LinearRegression()
    regr.fit(train_x, train_y)

    if show_statistics:
        # The coefficients
        print('Coefficients         ->', regr.coef_[0])
        # Subtract a plain ndarray so np.mean yields one scalar for '%.2f'
        # rather than a pandas object whose shape depends on the pandas version.
        squared_error = (regr.predict(test_x) - test_y.to_numpy()) ** 2
        print("Mean squared error   -> %.2f" % float(np.mean(squared_error)))
        # Score of the fit on the test split: 1 is perfect prediction
        print('Variance score       -> %.2f' % regr.score(test_x, test_y))
        print('features             ->', 'Date', 'Stock Close Price')

    if show_plot:
        pl.plot(test_x, regr.predict(test_x), label=symbol)
        pl.legend()

        pl.xlabel('Date')
        pl.ylabel('Close Price')
        pl.title('Overall Linear Regression Model')

        if show_dots:
            # Use the feature column itself so the plot does not depend on the
            # downloaded index being named 'Date'.
            pl.plot(test_x.iloc[:, 0], test['Close'], linestyle='none', marker='o')

        # Hide the tick marks on both axes
        pl.xticks(())
        pl.yticks(())

    # y is two-dimensional (n, 1), so coef_ is (1, 1); return the scalar slope
    return regr.coef_[0][0]

# function4: for each symbol in the text file, calculate the coefficient and record the results in the coefficients CSV file
def get_coefficient_dataset(filename, show_statistics = False, show_dots = False, show_plot = False) :
    """Compute the trend coefficient of every stock listed in ``filename``.

    Each symbol returned by ``get_list`` is passed to
    ``build_linear_regression``; the resulting table (columns ``Stock`` and
    ``Coefficient``) is written to ``stock_coefficient_file_name`` as CSV and
    also returned.
    """
    # N x 1 symbol array widened to N x 2; the new second column starts at 0
    # and is filled with the coefficients below
    table = np.insert(get_list(filename), 1, values=0, axis=1)

    for row in table:
        print(row)
        row[1] = build_linear_regression(row[0], show_statistics, show_dots, show_plot)

    # wrap the filled array in a labelled frame and persist it for later use
    coefficient_data = pd.DataFrame(table, columns=['Stock', 'Coefficient'])
    coefficient_data.to_csv(stock_coefficient_file_name)
    return coefficient_data

In [8]:
# Example use of function4: compute and store the coefficients for every symbol in stock_list.txt
coefficient_data_test = get_coefficient_dataset('./../resources/stock_list.txt')
# Bare expression so the notebook displays the resulting table
coefficient_data_test

['AARTIIND.NS' 0]
[*********************100%***********************]  1 of 1 completed
['ABBOTINDIA.NS' 0]
[*********************100%***********************]  1 of 1 completed
['ACC.NS' 0]
[*********************100%***********************]  1 of 1 completed
['ADANIENT.NS' 0]
[*********************100%***********************]  1 of 1 completed
['ADANIPORTS.NS' 0]
[*********************100%***********************]  1 of 1 completed
['ABFRL.NS' 0]
[*********************100%***********************]  1 of 1 completed
['APLLTD.NS' 0]
[*********************100%***********************]  1 of 1 completed
['ALKEM.NS' 0]
[*********************100%***********************]  1 of 1 completed
['AMARAJABAT.NS' 0]
[*********************100%***********************]  1 of 1 completed
['AMBUJACEM.NS' 0]
[*********************100%***********************]  1 of 1 completed
['APOLLOHOSP.NS' 0]
[*********************100%***********************]  1 of 1 completed
['APOLLOTYRE.NS' 0]
[*********************100%*

Unnamed: 0,Stock,Coefficient
0,AARTIIND.NS,17.329384
1,ABBOTINDIA.NS,18.657327
2,ACC.NS,2.717577
3,ADANIENT.NS,116.067437
4,ADANIPORTS.NS,6.515773
...,...,...
173,VEDL.NS,0.394223
174,IDEA.NS,-3.621158
175,VOLTAS.NS,6.554686
176,WIPRO.NS,12.101708


In [9]:
from datetime import date

import datetime



# # Considering data from 2000 to 2022
# start = datetime.datetime(2018, 1, 1)
# end = date.today().strftime("%Y-%m-%d")

# Train one Prophet model per listed symbol on the log of its closing prices
# and pickle each fitted model under resources/fbprophet/model/.

# splitlines() + strip() drops the empty entry that split("\n") produced for a
# trailing newline (and any blank line), which was otherwise downloaded as an
# empty ticker.
stocks = [line.strip()
          for line in Path('./../resources/stock_list.txt').read_text().splitlines()
          if line.strip()]

# open(..., "wb") does not create missing directories
Path("./../resources/fbprophet/model/").mkdir(parents=True, exist_ok=True)

index=1
for s in stocks:
    print(s)
    df = yf.download( s, start , end)
    # Nothing to fit when the download produced no rows; skip this symbol
    # instead of letting the fit below abort the whole loop.
    if df.empty:
        print("No search result : '" + s + "'")
        continue
    close = df['Close']

    # Prophet expects the columns 'ds' (date) and 'y' (value); the model is
    # fitted on log prices.
    close_df = close.reset_index().rename(columns={'Date':'ds', 'Close':'y'})
    close_df['y'] = np.log(close_df['y'])
    model = Prophet(daily_seasonality=True)
    model.fit(close_df)
    pkl_path = "./../resources/fbprophet/model/"+s+".model"
    with open(pkl_path, "wb") as f:
        pickle.dump(model, f)

    #     # save the dataframe
    #     forecast.to_pickle("./../../resources/fbprophet/forecast/"+s+".forecast")
    print(index,"*** Data Saved for -> ",s)
    index = index+1


AARTIIND.NS
[*********************100%***********************]  1 of 1 completed
Initial log joint probability = -2.23477
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
      99       4345.96   0.000629705       576.598           1           1      121   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     199       4374.72   0.000268121       833.207           1           1      234   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     299       4393.01    0.00138763       1133.71           1           1      354   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     399       4401.74   0.000945255       1753.78           1           1      477   
    Iter      log prob        ||dx||      ||grad||       alpha      alpha0  # evals  Notes 
     499       4415.77   0.000318844          1804      0.9174      0.