In [1]:
import pandas as pd
import numpy as np
from numpy import genfromtxt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix

In [2]:
# Read every CSV input from the local data/ directory.

# Feature matrices for the train and test splits, loaded as DataFrames.
X_train_std = pd.read_csv("data/X_train_std.csv")
X_test_std = pd.read_csv("data/X_test_std.csv")

# Label vectors, parsed from comma-delimited text into NumPy arrays.
y_train_encoded = np.genfromtxt('data/y_train_encoded.csv', delimiter=',')
y_test_encoded = np.genfromtxt('data/y_test_encoded.csv', delimiter=',')

# Judgement set: the model-ready version first, then the raw version that
# later receives the predicted probabilities.
judge_data_std = pd.read_csv("data/judge_data_std.csv")
judge_data = pd.read_csv("data/stock_judge_data.csv")

In [3]:
# Preview the first five rows of the training features.
X_train_std.head(n=5)

Unnamed: 0,roa,ocf,roa_change,accruals,change_leverage,change_curr_ratio,change_shares,change_gross_margin,change_asset_turnover
0,0.054219,-0.030252,-0.076924,-0.03038,-0.016244,-0.005994929,-0.021087,-0.006206,-0.13763
1,0.066049,-0.030197,0.168624,-0.030325,-0.016381,-0.00900443,-0.0194,-0.00066,2.307142
2,0.055584,-0.030052,-0.009956,-0.03018,-0.016324,2.641278e-18,-0.019268,-0.000352,0.049789
3,0.050124,-0.025236,-0.013756,-0.025364,-0.033483,2.641278e-18,-0.021109,-0.000352,0.037294
4,0.119741,-0.030018,-0.02088,-0.030146,-0.016244,0.0300715,-0.021068,-0.0022,-0.200102


In [4]:
# Drop the 'symbol' column from the judgement features before they are
# passed to the model.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it once 'symbol'
# is already gone is a no-op rather than a KeyError.
judge_data_std = judge_data_std.drop(columns='symbol', errors='ignore')

In [5]:
# Train a logistic regression classifier, using the library's default
# settings, on the training features and labels.
model = LogisticRegression().fit(X_train_std, y_train_encoded)
# Echo the fitted estimator as the cell output.
model

LogisticRegression()

In [6]:
# Pair every training feature with its fitted coefficient.
# The names are taken from the columns of the frame the model was fitted on
# rather than from a hand-typed list, so a label can never drift out of step
# with the column order the coefficients follow.
features = list(X_train_std.columns)
coef = model.coef_
# Fail loudly if the coefficient row and the feature names disagree in length.
assert coef.shape[1] == len(features), "coefficient count does not match feature count"
# coef[0] is the first (here: only) row of the coefficient matrix.
feat_coef = list(zip(features, coef[0]))

In [7]:
# Rank the (feature, coefficient) pairs by coefficient magnitude, largest
# first, then print one "name: value" line per feature.
feat_coef_sorted = list(feat_coef)
feat_coef_sorted.sort(key=lambda pair: np.abs(pair[1]), reverse=True)
print("Feature coefficients in descending order:\n")
for name, value in feat_coef_sorted:
    print(": ".join((name, str(value))))

Feature coefficients in descending order:

roa: 0.2553430333440398
change_shares: -0.09505043595395439
change_asset_turnover: -0.050392548797710546
change_curr_ratio: -0.03661885016248814
accruals: -0.014281451069728102
ocf: -0.013359691558674147
roa_change: 0.0048016457767721
change_leverage: -0.0011928556142102918
change_gross_margin: 0.0007779274915240857


In [9]:
# Class-label predictions of the fitted model for the test features.
test_pred = model.predict(X=X_test_std)

In [10]:
# Score the test-set predictions against the true labels and print each metric.

# accuracy
accuracy = accuracy_score(y_test_encoded, test_pred)
print("Accuracy: " + str(accuracy))

# precision
precision = precision_score(y_test_encoded, test_pred)
print("Precision: " + str(precision))

# npv (negative predictive value) = TN / (TN + FN)
# labels=[0, 1] pins the confusion matrix to 2x2, so the four-way unpack
# below cannot fail when only one class occurs in the labels and predictions.
# Assumes the labels are encoded as 0 (negative) and 1 (positive).
tn, fp, fn, tp = confusion_matrix(y_test_encoded, test_pred, labels=[0, 1]).ravel()
predicted_negative = tn + fn
# NPV is undefined when nothing was predicted negative; report NaN explicitly
# instead of relying on a 0/0 division.
npv = tn / predicted_negative if predicted_negative else float("nan")
print("NPV: " + str(npv))

# recall
recall = recall_score(y_test_encoded, test_pred)
print("Recall: " + str(recall))

# F1
f1 = f1_score(y_test_encoded, test_pred)
print("F1: " + str(f1))

# MCC (Matthews correlation coefficient)
mcc = matthews_corrcoef(y_test_encoded, test_pred)
print("MCC: " + str(mcc))

Accuracy: 0.5580379430853719
Precision: 0.5575749872902898
NPV: 0.5833333333333334
Recall: 0.9865077580391275
F1: 0.7124644742184327
MCC: 0.037669228779321474


In [11]:
# Per-class probabilities for the test features; the column at index 1 is
# kept as the positive-class probability.
class_proba = model.predict_proba(X_test_std)
test_prob = class_proba[:, 1]

In [12]:
# Line up, for each test row, the true label, the predicted label and the
# positive-class probability, then display the resulting frame.
test_columns = {
    'actual': y_test_encoded,
    'predicted': test_pred,
    'probability': test_prob,
}
test_pred_prob = pd.DataFrame(data=test_columns)
test_pred_prob

Unnamed: 0,actual,predicted,probability
0,1.0,1.0,0.553848
1,1.0,1.0,0.543151
2,0.0,1.0,0.553169
3,0.0,1.0,0.534829
4,1.0,1.0,0.553479
...,...,...,...
8007,0.0,1.0,0.529193
8008,0.0,1.0,0.552917
8009,1.0,1.0,0.624332
8010,0.0,1.0,0.558054


In [13]:
# Per-class probabilities from the fitted model for the judgement features.
judge_pred = model.predict_proba(X=judge_data_std)

In [14]:
# Attach the positive-class probability (the column at index 1 of judge_pred)
# to the raw judgement rows.
# NOTE: the values are assigned by row position, so this cell must run while
# judge_data is still in its loaded order. Re-running it after the frame has
# been sorted by probability would pair rows with the wrong values.
positive_proba = judge_pred[:, 1]
judge_data['probability'] = positive_proba

In [15]:
# Order the judgement rows from highest to lowest predicted probability and
# renumber the index from 0.
# The sorted copy is rebound to the same name instead of using inplace=True,
# which pandas discourages: it gives no speed benefit and prevents chaining.
judge_data = judge_data.sort_values(by=['probability'], ascending=False, ignore_index=True)

In [16]:
# Ten highest-probability rows: the stocks the model rates most likely to
# increase in value over the next year.
judge_data.head(n=10)

Unnamed: 0,symbol,roa,ocf,roa_change,accruals,change_leverage,change_curr_ratio,change_shares,change_gross_margin,change_asset_turnover,probability
0,AMRK,0.003,-544000.0,0.008,-544000.0,32198000,0.174,0.0,0.003,-5.304,0.789952
1,HQI,-0.007,2784031.0,-0.049,2784031.0,0,-1.454,8907141.0,0.744,-3.828,0.73347
2,HFFG,0.02,8277366.0,-0.077,8277366.0,72927014,-0.387,29977610.0,-0.007,-3.053,0.699934
3,SIGA,-0.036,-2322143.0,-2.808,-2322143.0,4497269,-8.735,330994.0,0.133,-3.003,0.697323
4,AM,-0.054,-398493000.0,-1.477,-398493000.0,2892249000,0.275,297848678.0,-0.247,-3.194,0.689928
5,FVE,-0.017,-20274000.0,0.163,-20274000.0,-29280000,0.272,70377.0,-0.014,-2.174,0.660536
6,SKY,-0.086,-37890000.0,-0.086,-37890000.0,83339000,-0.486,48265947.0,0.087,-2.198,0.657203
7,INTZ,0.705,4507000.0,-0.116,4506999.0,-274000,1.459,237794.0,-0.018,-1.706,0.657044
8,TA,0.01,65284000.0,0.086,65284000.0,1993051000,-1.154,227069.0,0.164,-2.054,0.655674
9,IDEX,-0.665,-63478589.0,-0.468,-63478590.0,-3167905,-1.303,27964086.0,0.959,-2.416,0.653381


In [17]:
# Ten lowest-probability rows: the stocks the model rates most likely to
# decrease in value over the next year.
judge_data.tail(n=10)

Unnamed: 0,symbol,roa,ocf,roa_change,accruals,change_leverage,change_curr_ratio,change_shares,change_gross_margin,change_asset_turnover,probability
4423,TALK,-3.725,-333817.0,0.9,-333813.3,442640,-0.042,1673534000.0,-2.055,0.586,0.321465
4424,KULR,-4.044,-2081588.0,-2.826,-2081584.0,0,-1.745,1526105.0,0.401,2.385,0.317514
4425,GMBL,-8.405,-3305193.0,-5.463,-3305185.0,290720,,66235.0,,0.0,0.31473
4426,VERB,-10.378,-7792000.0,0.0,-7791990.0,0,0.0,0.0,0.0,0.0,0.27
4427,TLK,0.132,38462000000000.0,-0.041,38462000000000.0,8610000000000,-0.123,-1737780000.0,-0.051,-0.032,0.268866
4428,HGEN,-11.991,-8944000.0,-5.585,-8943988.0,2992000,-0.124,4191900.0,,0.0,0.232237
4429,BTCY,-20.582,-8592065.0,-14.208,-8592044.0,867699,-0.983,5364952.0,,0.358,0.092073
4430,ENIA,0.056,3917446000.0,0.005,3917446000.0,116587000,0.315,18633670000.0,0.021,-0.053,0.013792
4431,ENIC,0.039,710198000000.0,-0.016,710198000000.0,-186397316000,0.159,20073780000.0,0.013,-0.012,0.009403
4432,ZIVO,-70.261,-8352350.0,-56.378,-8352280.0,-12874278,0.024,222162300.0,,0.0,0.000293


In [18]:
# Write the judgement table, including the probability column, to CSV
# without the index column.
judge_data.to_csv(path_or_buf='data/judge_data_pred.csv', index=False)