In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw gap-event table from disk and preview its first rows.
raw_data_path = "./data/raw_data.csv"
df = pd.read_csv(raw_data_path)
df.head()

Unnamed: 0,stock,gap_date,gap_width,gap_day_pl,revert_time,u_curve_effect,u_curve_profit,lowest_day,market_change_20d,Sector
0,MMM,2019-04-25,-0.098324,-0.034525,31,False,,,0.040415,Industrials
1,AOS,2001-10-01,-0.147274,-0.004,1,True,0.07848,0.0,-0.102177,Industrials
2,AOS,2008-12-01,-0.030463,-0.089015,2,True,0.058008,1.0,-0.154459,Industrials
3,AYI,2008-07-02,-0.094141,-0.060634,26,True,0.156382,6.0,-0.099274,Industrials
4,AYI,2018-01-09,-0.116989,-0.03689,8,True,0.055007,1.0,0.036177,Industrials


In [3]:
# Expand the categorical "Sector" column into one indicator column per sector
# value, append those columns to df, and remove the original text column.
sector = pd.get_dummies(df["Sector"])
df = pd.concat([df, sector], axis=1).drop(columns=["Sector"])

In [4]:
# Parse gap_date into datetimes once, then store both the parsed dates and the
# calendar month derived from them.
parsed_gap_dates = pd.to_datetime(df["gap_date"])
df = df.assign(gap_date=parsed_gap_dates, month=parsed_gap_dates.dt.month)
df.head()

Unnamed: 0,stock,gap_date,gap_width,gap_day_pl,revert_time,u_curve_effect,u_curve_profit,lowest_day,market_change_20d,Consumer Discretionary,...,Energy,Financials,Health Care,Industrials,Information Technology,Materials,Real Estate,Telecommunication Services,Utilities,month
0,MMM,2019-04-25,-0.098324,-0.034525,31,False,,,0.040415,0,...,0,0,0,1,0,0,0,0,0,4
1,AOS,2001-10-01,-0.147274,-0.004,1,True,0.07848,0.0,-0.102177,0,...,0,0,0,1,0,0,0,0,0,10
2,AOS,2008-12-01,-0.030463,-0.089015,2,True,0.058008,1.0,-0.154459,0,...,0,0,0,1,0,0,0,0,0,12
3,AYI,2008-07-02,-0.094141,-0.060634,26,True,0.156382,6.0,-0.099274,0,...,0,0,0,1,0,0,0,0,0,7
4,AYI,2018-01-09,-0.116989,-0.03689,8,True,0.055007,1.0,0.036177,0,...,0,0,0,1,0,0,0,0,0,1


In [5]:
# Chronological split: rows dated before the cutoff form the training set,
# rows on or after the cutoff form the test set.
split_date = "2017-01-01"
event_dates = df["gap_date"]
train = df[event_dates < split_date]
test = df[event_dates >= split_date]
print("train size: ", train.shape)
print("test size: ", test.shape)

train size:  (318, 21)
test size:  (44, 21)


In [6]:
# Training targets: the boolean u_curve_effect flag (used as the
# classification label below) and revert_time.
train_effect_label = train["u_curve_effect"]
train_time_label = train["revert_time"]
# train_lowest_label = train["lowest_day"]
print(train_effect_label.value_counts())

# Keep only the model inputs. The columns are dropped by name instead of being
# selected by position, so the feature set does not silently shift if the
# number of sector dummy columns or the CSV column order changes. For the
# current data this yields the same columns, in the same order, as the former
# positional selection [2, 3] + range(8, 21).
non_feature_columns = [
    "stock",           # ticker identifier
    "gap_date",        # raw date; its month is kept as the "month" column
    "revert_time",     # extracted above as train_time_label
    "u_curve_effect",  # extracted above as train_effect_label
    "u_curve_profit",  # not used as a model input
    "lowest_day",      # not used as a model input
]
train = train.drop(columns=non_feature_columns)
train.head()

True     224
False     94
Name: u_curve_effect, dtype: int64


Unnamed: 0,gap_width,gap_day_pl,market_change_20d,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology,Materials,Real Estate,Telecommunication Services,Utilities,month
1,-0.147274,-0.004,-0.102177,0,0,0,0,0,1,0,0,0,0,0,10
2,-0.030463,-0.089015,-0.154459,0,0,0,0,0,1,0,0,0,0,0,12
3,-0.094141,-0.060634,-0.099274,0,0,0,0,0,1,0,0,0,0,0,7
5,-0.09838,-0.038603,-0.08558,0,0,0,0,0,0,1,0,0,0,0,6
6,-0.179469,-0.114894,-0.060016,0,0,0,0,0,0,1,0,0,0,0,7


In [7]:
# Test targets: the boolean u_curve_effect flag (used as the classification
# label below) and revert_time.
test_effect_label = test["u_curve_effect"]
test_time_label = test["revert_time"]
# test_lowest_label = test["lowest_day"]
print(test_effect_label.value_counts())

# Keep only the model inputs. The columns are dropped by name instead of being
# selected by position, so the feature set does not silently shift if the
# number of sector dummy columns or the CSV column order changes. For the
# current data this yields the same columns, in the same order, as the former
# positional selection [2, 3] + range(8, 21).
non_feature_columns = [
    "stock",           # ticker identifier
    "gap_date",        # raw date; its month is kept as the "month" column
    "revert_time",     # extracted above as test_time_label
    "u_curve_effect",  # extracted above as test_effect_label
    "u_curve_profit",  # not used as a model input
    "lowest_day",      # not used as a model input
]
test = test.drop(columns=non_feature_columns)
test.head()

True     29
False    15
Name: u_curve_effect, dtype: int64


Unnamed: 0,gap_width,gap_day_pl,market_change_20d,Consumer Discretionary,Consumer Staples,Energy,Financials,Health Care,Industrials,Information Technology,Materials,Real Estate,Telecommunication Services,Utilities,month
0,-0.098324,-0.034525,0.040415,0,0,0,0,0,1,0,0,0,0,0,4
4,-0.116989,-0.03689,0.036177,0,0,0,0,0,1,0,0,0,0,0,1
10,-0.113784,0.004157,-0.015171,0,0,0,0,1,0,0,0,0,0,0,5
16,-0.1376,-0.020408,0.015785,0,0,0,0,0,0,1,0,0,0,0,5
26,-0.078453,-0.028534,0.019995,0,0,0,0,1,0,0,0,0,0,0,8


In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [40]:
# L1-penalised logistic regression predicting the u_curve_effect label.
# random_state is pinned because scikit-learn documents that the liblinear
# solver uses it when shuffling the data; fixing it keeps the fitted
# coefficients reproducible across kernel restarts.
m1 = LogisticRegression(solver="liblinear", penalty="l1", random_state=42)

m1.fit(train, train_effect_label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [43]:
# Score the hold-out rows. Column 1 of predict_proba corresponds to
# m1.classes_[1] (expected to be True for this boolean label -- verify).
pred_effect_exist = m1.predict_proba(test)
positive_class_proba = pred_effect_exist[:, 1]
actual_as_int = test_effect_label.astype(int)
test_result = pd.DataFrame({"actual": actual_as_int, "pred": positive_class_proba})
test_result.head()

Unnamed: 0,actual,pred
0,0,0.591137
4,1,0.561285
10,1,0.692872
16,0,0.721639
26,1,0.644799


In [42]:
# Area under the ROC curve for the predicted probabilities on the test set.
auc = roc_auc_score(y_true=test_result["actual"], y_score=test_result["pred"])
print(auc)

# AUC = 0.67. I still don't think this is a reliable result, since the test set is very small (44 rows).
# Maybe we should create more variables...

0.6666666666666667


In [61]:
# Pair each feature column name with the weight the fitted model assigned to it.
feature_weights = m1.coef_[0]
coef = pd.DataFrame(dict(variables=train.columns, coef=feature_weights))
print(coef)
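# From the coefficients, gap-day profit/loss (gap_day_pl) appears to be the most important variable,
# while gap_width and market_change_20d appear useless (the L1 penalty shrank their coefficients to exactly zero).
# Caveat: the features were not standardized, so coefficient magnitudes are not directly comparable across variables.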

                     variables      coef
0                    gap_width  0.000000
1                   gap_day_pl  9.687690
2            market_change_20d  0.000000
3       Consumer Discretionary  0.000000
4             Consumer Staples  0.078453
5                       Energy  0.000000
6                   Financials  0.180399
7                  Health Care -0.377003
8                  Industrials -0.414058
9       Information Technology  0.000000
10                   Materials  0.756443
11                 Real Estate  0.674115
12  Telecommunication Services  0.000000
13                   Utilities  0.000000
14                       month  0.033124


In [None]:
# To be continued