# Logistic Regression Part 2

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import yfinance as yf
yf.pdr_override()

In [13]:
# input
symbol = 'ACC.NS'
start = '2014-01-01'
end = '2018-08-27'

# Read data 
dataset = yf.download(symbol,start,end)

# Only keep close columns 
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-01,1110.0,1115.0,1101.300049,1108.449951,980.100525,66461
2014-01-02,1110.099976,1122.150024,1093.0,1097.599976,970.506714,310453
2014-01-03,1086.050049,1094.5,1081.150024,1088.949951,962.858521,186172
2014-01-06,1088.0,1094.599976,1071.099976,1085.0,959.365845,144416
2014-01-07,1087.599976,1094.199951,1071.199951,1085.199951,959.542542,297558


In [14]:
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()

In [15]:
dataset.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-08-17,1562.199951,1601.650024,1553.5,1593.949951,1522.762573,664502,1,1,1,0.022385
2018-08-20,1601.900024,1627.550049,1586.949951,1599.300049,1527.873779,921054,1,1,1,0.003357
2018-08-21,1604.0,1650.949951,1604.0,1639.800049,1566.565063,1068497,0,1,0,0.025324
2018-08-23,1640.199951,1647.949951,1603.0,1635.25,1562.21814,755388,0,0,1,-0.002775
2018-08-24,1640.0,1657.0,1629.199951,1640.849976,1567.568115,597273,0,0,0,0.003425


In [16]:
# Define X
X = np.asarray(dataset[['Open', 'High', 'Low', 'Adj Close', 'Volume']])
X[0:5]

array([[  1110.09997559,   1122.15002441,   1093.        ,
           970.50671387, 310453.        ],
       [  1086.05004883,   1094.5       ,   1081.15002441,
           962.85852051, 186172.        ],
       [  1088.        ,   1094.59997559,   1071.09997559,
           959.36584473, 144416.        ],
       [  1087.59997559,   1094.19995117,   1071.19995117,
           959.5425415 , 297558.        ],
       [  1080.09997559,   1098.        ,   1077.09997559,
           957.90686035, 309014.        ]])

In [17]:
# Define y
y = np.asarray(dataset['Buy_Sell'])
y[0:5]

array([0, 0, 1, 0, 0])

In [18]:
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[-2.12461871, -2.14078382, -2.14303927, -2.22186131, -0.12040042],
       [-2.26255539, -2.29859208, -2.21169894, -2.26603152, -0.49461284],
       [-2.25137158, -2.29802148, -2.2699297 , -2.28620258, -0.62034093],
       [-2.25366589, -2.30030456, -2.26935044, -2.28518212, -0.15922751],
       [-2.29668162, -2.27861637, -2.23516524, -2.29462858, -0.12473328]])

In [19]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (858, 5) (858,)
Test set: (287, 5) (287,)


In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
LR

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [22]:
LR.predict(X[:2, :])

array([1, 1])

In [23]:
LR.score(X_train, y_train)

0.5384615384615384