In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from gurobipy import Model, GRB

In [2]:
preds = pd.read_csv('predict_result.csv',index_col = 0)

In [3]:
## weights 

preds['w-avg'] = (0.3*preds['1_day']+0.3*preds['3_day']+0.4*preds['5_day'])

In [4]:
preds.fillna(0,inplace=True)
preds

Unnamed: 0,day,stock,1_day,3_day,5_day,w-avg
0,18,600000,1,0,1,0.7
1,18,600010,1,1,1,1.0
2,18,600015,0,1,1,0.7
3,18,600016,0,0,0,0.0
4,18,600018,1,1,1,1.0
...,...,...,...,...,...,...
36945,756,601857,1,0,1,0.7
36946,756,601901,1,0,0,0.3
36947,756,601988,1,1,1,1.0
36948,756,601989,1,0,1,0.7


### Get daily returns and closing prices

In [5]:
files = [600000,600010,600015,600016,600018,600028,600030,600036,600048,600050,600089,600104,600109,600111,600150,600256,600406,
        600518,600519,600583,600585,600637,600690,600837,600887,600893,600958,600999,601006,601088,601166,601169,601186,601288,
        601318,601328,601390,601398,601601,601628,601668,601688,601766,601800,601818,601857,601901,601988,601989,601998]

In [6]:
days=[]
stocks=[]
returns=[]
closes=[]

for d in range(18,757):
    for file in files:
        df=pd.read_csv('Regression responses/SH'+str(file)+'_lr_1.csv')
        r=df.loc[df.day==d,'return_1'].values[0]
        c=df.loc[df.day==d,'Close'].values[0]
        
        days.append(d)
        stocks.append(file)
        returns.append(r)
        closes.append(c)

returns_df=pd.DataFrame(dict(zip(['day','stock','close','return_1'],[days,stocks,closes,returns])))

In [7]:
returns_df

Unnamed: 0,day,stock,close,return_1
0,18,600000,154.4560,-0.002395
1,18,600010,17.6114,-0.023103
2,18,600015,37.7721,-0.010821
3,18,600016,181.0807,-0.018927
4,18,600018,6.6625,-0.005820
...,...,...,...,...
36945,756,601857,7.6917,0.001745
36946,756,601901,8.5336,0.048405
36947,756,601988,6.6641,0.005462
36948,756,601989,8.7825,0.001928


### Covariance matrix

In [9]:
pivot_returns = returns_df.pivot(index='day', columns='stock', values='return_1')
pivot_returns

stock,600000,600010,600015,600016,600018,600028,600030,600036,600048,600050,...,601668,601688,601766,601800,601818,601857,601901,601988,601989,601998
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
18,-0.002395,-0.023103,-0.010821,-0.018927,-0.005820,0.011004,0.004507,-0.003289,0.000000,0.009313,...,-0.005613,0.000554,0.002013,0.004249,-0.007521,0.001318,-0.027908,-0.005738,-0.014020,0.009014
19,-0.008404,-0.003378,-0.003648,-0.007503,-0.011722,0.001818,-0.026360,-0.009351,-0.004273,0.016169,...,-0.027118,-0.008852,-0.013058,-0.016211,-0.002539,0.011825,-0.032293,0.000000,0.006315,-0.013400
20,-0.014527,0.023730,-0.010977,-0.001080,0.000000,-0.010871,-0.029378,-0.012215,-0.013949,-0.031824,...,0.016260,-0.000558,-0.002035,0.001432,-0.010143,-0.009085,0.000000,-0.011561,0.043963,-0.025640
21,0.002457,-0.019868,0.002775,-0.001081,0.007913,0.018320,-0.004154,0.001125,0.005441,-0.016420,...,0.014852,0.016197,0.018357,0.038627,0.007675,0.039320,0.002469,0.011697,0.054133,0.015481
22,-0.007965,0.003378,-0.006460,-0.006493,-0.001959,0.014388,0.006556,-0.005054,-0.005411,0.011926,...,-0.011261,-0.006595,0.002996,0.032370,-0.007617,0.029001,-0.008633,-0.002904,-0.017115,-0.001518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,0.000806,-0.007515,-0.003910,0.000000,-0.006823,-0.001975,-0.007158,-0.006090,0.008827,0.000000,...,0.000000,-0.001029,-0.005596,-0.019624,-0.011154,-0.003430,0.100001,0.002709,-0.003779,-0.009639
753,-0.017713,-0.022735,-0.010472,-0.007936,-0.024007,-0.011880,-0.022476,-0.003729,-0.022500,-0.013600,...,-0.022012,-0.024769,-0.015500,-0.026696,-0.009024,-0.015528,0.037132,-0.008104,-0.015185,-0.004877
754,0.006558,0.015506,0.003967,0.001600,0.001760,0.006014,0.005639,0.016310,-0.014706,0.000000,...,0.001869,0.017990,0.001429,0.001142,-0.002276,0.003510,-0.020991,0.000000,0.005784,0.000000
755,-0.003257,-0.007630,-0.005270,-0.004792,-0.005270,-0.001997,-0.002157,-0.005788,-0.007138,0.003444,...,0.001876,-0.001559,-0.004281,-0.010276,-0.002298,-0.001742,-0.010082,-0.002723,-0.003830,-0.004890


In [36]:
pivot_returns.cov().head()

stock,600000,600010,600015,600016,600018,600028,600030,600036,600048,600050,...,601668,601688,601766,601800,601818,601857,601901,601988,601989,601998
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
600000,0.000159,6.5e-05,0.000108,0.0001,6.9e-05,7.8e-05,1.2e-05,0.000139,0.000121,8.1e-05,...,9.4e-05,0.000145,8.4e-05,9.2e-05,0.000107,6.5e-05,0.000102,7.8e-05,6.2e-05,0.000113
600010,6.5e-05,0.000354,7e-05,5.8e-05,8.7e-05,5.8e-05,4.1e-05,7e-05,9.9e-05,9.7e-05,...,8.3e-05,0.000134,8.9e-05,0.000113,5.8e-05,6.3e-05,0.000115,5.2e-05,9.8e-05,6.2e-05
600015,0.000108,7e-05,0.00013,9.7e-05,7.8e-05,8.1e-05,1.3e-05,0.000129,0.000129,9.4e-05,...,0.000103,0.000149,8.8e-05,0.000107,0.000107,7e-05,0.000112,8.3e-05,7.3e-05,0.000121
600016,0.0001,5.8e-05,9.7e-05,0.000104,6.4e-05,7.1e-05,1.1e-05,0.000113,0.00012,7.9e-05,...,9.1e-05,0.000131,8.6e-05,9.3e-05,9.5e-05,6.4e-05,9.8e-05,7.4e-05,6.4e-05,0.000104
600018,6.9e-05,8.7e-05,7.8e-05,6.4e-05,0.000416,7.7e-05,2.2e-05,9.5e-05,0.000101,0.000126,...,9.7e-05,0.000151,9.5e-05,0.000125,7.9e-05,6.7e-05,0.000124,6.1e-05,9.9e-05,8.3e-05


### Demo on day 505 morning

In [8]:
## say we are at 505 morning, trying to decide which stocks to invest before close comes out

d_505 = preds[preds.day == 505].sort_values('w-avg', ascending=False)
d_505.head()

Unnamed: 0,day,stock,1_day,3_day,5_day,w-avg
24375,505,600893,1,1,1,1.0
24365,505,600256,1,1,1,1.0
24397,505,601988,1,1,1,1.0
24394,505,601818,1,1,1,1.0
24389,505,601628,1,1,1,1.0


In [9]:
top10=d_505.head(10)['stock'].values
top10

array([600893, 600256, 601988, 601818, 601628, 601398, 601288, 601088,
       601006, 600887])

In [10]:
bottom10=d_505.tail(10)['stock'].values
bottom10

array([600109, 600015, 601186, 601766, 600030, 601668, 600089, 600837,
       600999, 600637])

In [11]:
mean=[]
std=[]
for s in top10:
    temp=returns_df[(returns_df.stock==s)&(returns_df.day<=504)]
    mean.append(np.mean(temp['return_1'].values))
    std.append(np.std(temp['return_1'].values))
    
mean=pd.Series(mean,index=top10)
std=pd.Series(std,index=top10)

In [14]:
cov = pivot_returns.loc[:504].cov()
cov.head()

stock,600000,600010,600015,600016,600018,600028,600030,600036,600048,600050,...,601668,601688,601766,601800,601818,601857,601901,601988,601989,601998
stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
600000,0.00015,5.2e-05,0.000112,9.9e-05,4.2e-05,8.2e-05,-9e-06,0.000134,0.000111,6.4e-05,...,8.1e-05,0.000132,7.9e-05,7.8e-05,9.2e-05,6.9e-05,7.7e-05,7.8e-05,4.2e-05,0.000104
600010,5.2e-05,0.000337,5.2e-05,4.3e-05,4.9e-05,3.9e-05,1e-05,6.3e-05,8.4e-05,7.2e-05,...,5.2e-05,7.7e-05,6.9e-05,6.8e-05,4.1e-05,5.2e-05,4.9e-05,4.1e-05,5.5e-05,3.8e-05
600015,0.000112,5.2e-05,0.000139,0.000103,5.8e-05,8.8e-05,-2e-06,0.000137,0.000134,7.8e-05,...,0.000102,0.000139,8.7e-05,9.9e-05,0.000104,7.8e-05,9.2e-05,8.9e-05,4.9e-05,0.000121
600016,9.9e-05,4.3e-05,0.000103,0.000114,4.6e-05,7.7e-05,-3e-06,0.000119,0.000124,6.6e-05,...,8.9e-05,0.000127,8.8e-05,8.5e-05,9.3e-05,7.2e-05,8.4e-05,7.9e-05,4.6e-05,0.000104
600018,4.2e-05,4.9e-05,5.8e-05,4.6e-05,0.000313,6.7e-05,-1e-06,6.8e-05,9.5e-05,8.3e-05,...,7e-05,8.3e-05,6.5e-05,7.7e-05,6.2e-05,5.7e-05,7e-05,5e-05,6.2e-05,6.3e-05


In [12]:
## the covariance miss 2 for different stocks.

mod=Model()
S=top10
u=10
l=3
e=0.01
x=mod.addVars(S,lb=0)
v=mod.addVar(lb=0)
z=mod.addVars(S,vtype=GRB.BINARY)
# R=mod.addVar(lb=0) Don't do this, this will always return the lower bound!
mod.setObjective(sum(mean[s]*x[s] for s in S)-5*v, sense=GRB.MAXIMIZE)
mod.addConstr(sum(x[s] for s in S)==1)
for s in S:
    mod.addConstr(x[s]<=z[s])
    mod.addConstr(x[s]>=z[s]*e)
mod.addConstr(sum(z[s] for s in S)<=u)
mod.addConstr(sum(z[s] for s in S)>=l)
# mod.addConstr(sum(cov.loc[s,k]*x[s]*x[k] for s in S for k in S)<=v)
mod.addConstr(sum(std[s]*std[s]*x[s]*x[s] for s in S)<=v)

mod.setParam('OutputFlag',False)
mod.optimize()   

Using license file /Users/aslanshi/gurobi.lic
Academic license - for non-commercial use only


In [13]:
## we can set a maximum invest ratio say 0.8

init_cap = [1000000] ## money held on morning 505
shares=Counter()

for file in files:
    shares[file]+=0.0

In [14]:
## we first sell low-predicted-performance stocks:
m=0
for s in bottom10:
    temp = returns_df[(returns_df.stock==s)&(returns_df.day==504)]
    m+=shares[s]*temp.close.values[0]
    shares[s]=0
init_cap.append(m+init_cap[-1])

In [15]:
init_cap

[1000000, 1000000.0]

In [16]:
for s in top10:
    temp = returns_df[(returns_df.stock==s)&(returns_df.day==504)]
    shares[s] += x[s].x*0.8*init_cap[-1]/temp.close.values[0]
init_cap[-1]=0.2*init_cap[-1]

In [17]:
init_cap

[1000000, 200000.0]

In [18]:
shares

Counter({600000: 0.0,
         600010: 0.0,
         600015: 0,
         600016: 0.0,
         600018: 0.0,
         600028: 0.0,
         600030: 0,
         600036: 0.0,
         600048: 0.0,
         600050: 0.0,
         600089: 0,
         600104: 0.0,
         600109: 0,
         600111: 0.0,
         600150: 0.0,
         600256: 46.115251235888735,
         600406: 0.0,
         600518: 0.0,
         600519: 0.0,
         600583: 0.0,
         600585: 0.0,
         600637: 0,
         600690: 0.0,
         600837: 0,
         600887: 54.32374810830395,
         600893: 0.0,
         600958: 0.0,
         600999: 0,
         601006: 10716.834715026413,
         601088: 5570.225944119489,
         601166: 0.0,
         601169: 0.0,
         601186: 0,
         601288: 27152.932080318365,
         601318: 0.0,
         601328: 0.0,
         601390: 0.0,
         601398: 19515.71822986735,
         601601: 0.0,
         601628: 0.0,
         601668: 0,
         601688: 0.0,
       

### All test days

In [8]:
## note: this is how we do when stocks are on the same timeline, but in this project they may not
## so we should not do cov

cap = [1000000]
shares=Counter()

for file in files:
    shares[file]+=0

for d in range(505,757):
    
    rank = preds[preds.day == d].sort_values('w-avg', ascending=False)
    top10=rank.head(10)['stock'].values
    bottom40=rank.tail(40)['stock'].values
    
    mean=[]
    std=[]
    for s in top10:
        temp=returns_df[(returns_df.stock==s)&(returns_df.day<=d-1)]
        mean.append(np.mean(temp['return_1'].values))
        std.append(np.std(temp['return_1'].values))

    mean=pd.Series(mean,index=top10)
    std=pd.Series(std,index=top10)
    
#     cov = pivot_returns.loc[:d-1].cov()
    
    mod=Model()
    S=top10
    u=10
    l=3
    e=0.01
    x=mod.addVars(S,lb=0)
    v=mod.addVar(lb=0)
    z=mod.addVars(S,vtype=GRB.BINARY)
    mod.setObjective(sum(mean[s]*x[s] for s in S)-0.1*v, sense=GRB.MAXIMIZE)
    mod.addConstr(sum(x[s] for s in S)==1)
    for s in S:
        mod.addConstr(x[s]<=z[s])
        mod.addConstr(x[s]>=z[s]*e)
    mod.addConstr(sum(z[s] for s in S)<=u)
    mod.addConstr(sum(z[s] for s in S)>=l)
    mod.addConstr(sum(std[s]*std[s]*x[s]*x[s] for s in S)<=v)
#     mod.addConstr(sum(cov.loc[s,k]*x[s]*x[k] for s in S for k in S)<=v)

    mod.setParam('OutputFlag',False)
    mod.optimize()  
    
    ## need to incoporate transaction costs
    
    m=0
    for s in bottom40:
        temp = returns_df[(returns_df.stock==s)&(returns_df.day==d-1)]
        m+=shares[s]*temp.close.values[0]*(1 - 0.00065)
        shares[s]=0
    cap.append(m+cap[-1])
    
    for s in top10:
        temp = returns_df[(returns_df.stock==s)&(returns_df.day==d-1)]
        shares[s] += x[s].x*0.8*(1 - 0.00065)*cap[-1]/temp.close.values[0]
    cap[-1]=0.2*cap[-1]

Using license file /Users/aslanshi/gurobi.lic
Academic license - for non-commercial use only


In [9]:
cap

[1000000,
 200000.0,
 41591.089023145374,
 8318.217804629076,
 1729.6472882582011,
 2270.885943205048,
 157238.02434569693,
 31447.604869139388,
 7796.089943173785,
 1559.2179886347571,
 311.84359772695143,
 62.36871954539029,
 194356.15046719025,
 41942.75637590178,
 8388.551275180356,
 1677.7102550360714,
 335.5420510072143,
 67.10841020144287,
 13.421682040288573,
 8574.046747436154,
 204058.5394869554,
 44337.69858575784,
 45549.558694677355,
 11884.385586335506,
 2834.693698233547,
 214194.62352651983,
 47843.956475397266,
 10065.286023504885,
 2094.827705843479,
 47269.08746380716,
 213172.24381744696,
 210170.62541588626,
 47190.06068472436,
 9830.109306756434,
 1966.021861351287,
 393.2043722702574,
 180.6028724422655,
 36.1205744884531,
 7.224114897690621,
 1.4448229795381242,
 13.770370748334836,
 2.7540741496669674,
 13.952540171087254,
 2.790508034217451,
 0.5581016068434902,
 2.710805815717833,
 2.9356019161619136,
 189738.60839204665,
 39604.852471389444,
 39260.853842445

In [10]:
shares

Counter({600000: 0,
         600010: 0,
         600015: 0,
         600016: 0,
         600018: 0,
         600028: 0.0,
         600030: 0,
         600036: 0,
         600048: 31.741906232860792,
         600050: 0,
         600089: 0,
         600104: 45.636714827114616,
         600109: 0,
         600111: 0,
         600150: 0,
         600256: 0.0,
         600406: 0,
         600518: 0,
         600519: 0,
         600583: 0,
         600585: 0,
         600637: 0,
         600690: 0,
         600837: 0,
         600887: 0,
         600893: 0.0,
         600958: 0,
         600999: 0,
         601006: 0,
         601088: 0,
         601166: 0,
         601169: 0,
         601186: 0.0,
         601288: 0.0,
         601318: 4835.0202411938835,
         601328: 0,
         601390: 0,
         601398: 0,
         601601: 0,
         601628: 245.96060506974626,
         601668: 0,
         601688: 0,
         601766: 0,
         601800: 0,
         601818: 0,
         601857: 0,
  

In [11]:
f=0
for s in files:
    temp = returns_df[(returns_df.stock==s)&(returns_df.day==756)]
    f+=shares[s]*temp.close.values[0]
f+cap[-1]

1261680.0373428594

In [12]:
abs(cap[0]-(f+cap[-1]))

261680.03734285943

### Baseline (equally distributed investing)

In [37]:
cap = 1000000
shares=Counter()

for file in files:
    shares[file]+=0

for s in files:
    temp = returns_df[(returns_df.stock==s)&(returns_df.day==504)]
    shares[s] += (1 - 0.00065)*cap/(temp.close.values[0]*len(files))

f=0
for s in files:
    temp = returns_df[(returns_df.stock==s)&(returns_df.day==756)]
    f+=shares[s]*temp.close.values[0]*(1-0.00065)
f

1134010.02634384

In [38]:
f - cap

134010.02634383994