<a href="https://colab.research.google.com/github/bsun1220/UCBTradingComp2022/blob/main/UCBerkeley2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
import glob
import zipfile
import functools

# Upload the API token.
def get_kaggle_credentials():
  token_dir = os.path.join(os.path.expanduser("~"),".kaggle")
  token_file = os.path.join(token_dir, "kaggle.json")
  if not os.path.isdir(token_dir):
    os.mkdir(token_dir)
  try:
    with open(token_file,'r') as f:
      pass
  except IOError as no_file:
    try:
      from google.colab import files
    except ImportError:
      raise no_file
  uploaded = files.upload()
  if "kaggle.json" not in uploaded:
      raise ValueError("You need an API key! see: "
          "https://github.com/Kaggle/kaggle-api#api-credentials")
  with open(token_file, "wb") as f:
    f.write(uploaded["kaggle.json"])
  os.chmod(token_file, 600)

get_kaggle_credentials()

Saving kaggle.json to kaggle.json


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

  import pandas.util.testing as tm


In [4]:
!kaggle competitions download -c berkeleytradingcompetition2022

Downloading sample.csv to /content
  0% 0.00/770k [00:00<?, ?B/s]
100% 770k/770k [00:00<00:00, 52.7MB/s]
Downloading test.csv.zip to /content
  0% 0.00/3.31M [00:00<?, ?B/s]
100% 3.31M/3.31M [00:00<00:00, 30.3MB/s]
Downloading train.csv.zip to /content
 44% 5.00M/11.4M [00:00<00:00, 50.2MB/s]
100% 11.4M/11.4M [00:00<00:00, 55.4MB/s]


In [5]:
!unzip test.csv.zip
!unzip train.csv.zip

Archive:  test.csv.zip
  inflating: test.csv                
Archive:  train.csv.zip
  inflating: train.csv               


In [29]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [32]:
def create_cols(train_df, test_df):
  train_df["adj sent"] = np.abs(train_df["sm sentiment"] * train_df["sm sentiment"])
  test_df["adj sent"] = np.abs(test_df["sm sentiment (naive)"] * train_df["sm sentiment (naive)"])

  train_df["sqrtvol"] = np.sqrt(train_df["price"] * train_df["size"])
  test_df["sqrtvol"] = np.sqrt(test_df["price"] * test_df["size"])

  train_df = pd.get_dummies(train_df, columns=["location", "stock", "counterparty"])
  test_df = pd.get_dummies(test_df, columns = ["location", "stock", "counterparty"])

  train_df = train_df.drop(["price", "size", "dir", "counterparty (naive)", "sm sentiment", "sm sentiment (naive)", "pnl", "trade id", "realized edge"], axis = 1)
  test_df = test_df.drop(["price", "size", "dir", "counterparty (naive)", "sm sentiment (naive)", "trade id"], axis = 1)
  return train_df, test_df

In [33]:
df1, df2 = create_cols(train, test)

In [39]:
def create_splits(train_df, test_df):
  df_1 = train_df[train_df["counterparty_R"] == 1]
  df_1 = df_1.drop(["counterparty_HF", "counterparty_MF", "counterparty_R"], axis = 1)
  df_2 = train_df[train_df["counterparty_R"] == 0]
  df_2 = df_2.drop(["counterparty_R", "adj sent"], axis = 1)

  df_3 = test_df[test_df["counterparty_R"] == 1]
  df_3 = df_3.drop(["counterparty_HF", "counterparty_MF", "counterparty_R"], axis = 1)
  df_4 = test_df[test_df["counterparty_R"] == 0]
  df_4 = df_4.drop(["counterparty_R", "adj sent"], axis = 1)
  return df_1, df_2, df_3, df_4

In [40]:
R_train, not_R_train, R_test, not_R_test = create_splits(df1, df2)


In [45]:
R_key = train[train["counterparty"] == "R"]
NR_key = train[train["counterparty"] != "R"]

In [52]:
test_r_key = test[test["counterparty"] == "R"]
test_nr_key = test[test["counterparty"] != "R"]

In [44]:
R_train

Unnamed: 0,pred edge,adj sent,sqrtvol,location_CSE,location_NASDAQ,location_NYSE,stock_A,stock_B,stock_C,stock_D
2,12.306395,760.220970,26.433492,0,0,1,0,1,0,0
8,2.159811,248.416971,72.632042,0,1,0,1,0,0,0
13,14.221260,5491.414903,102.461396,1,0,0,0,0,1,0
18,2.140908,172.575585,148.747539,0,0,1,1,0,0,0
23,1.969855,1184.422049,90.006718,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
199983,12.401104,90.305019,102.478813,0,0,1,0,0,1,0
199993,5.957446,246.956470,94.285322,0,0,1,0,1,0,0
199995,1.035089,2235.659501,120.611226,0,0,1,0,1,0,0
199997,8.910650,9222.857242,63.561536,0,0,1,0,0,0,1


In [46]:
X = R_train
X = sm.add_constant(X)
R_model = sm.OLS(R_key['realized edge'], X).fit()

X = not_R_train
X = sm.add_constant(X)
NR_model = sm.OLS(NR_key['realized edge'], X).fit()


  x = pd.concat(x[::order], 1)


In [47]:
Y = R_test
Y = sm.add_constant(Y)
Rpredictions = R_model.predict(Y)

Y = not_R_test
Y = sm.add_constant(Y)
NRpredictions = NR_model.predict(Y)


  x = pd.concat(x[::order], 1)


In [62]:
test_r_key["realized edge"] = Rpredictions
test_r_key = test_r_key[["trade id", "realized edge"]]

test_nr_key["realized edge"] = NRpredictions
test_nr_key = test_nr_key[["trade id", "realized edge"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [70]:
combined = test_r_key.append(test_nr_key)
combined = combined.sort_values(["trade id"])
combined = combined.drop(["trade id"], axis = 1)
combined.index.rename("trade id", inplace = True)
combined.to_csv("submission.csv")

In [71]:
!kaggle competitions submit -c berkeleytradingcompetition2022 -f submission.csv -m "Message"

100% 2.36M/2.36M [00:00<00:00, 5.95MB/s]
Successfully submitted to Berkeley Trading Competition Yosemite

In [72]:
class linearRegression(nn.Module):
    def __init__(self, inputSize, outputSize):
        super(linearRegression, self).__init__()
        self.fc1 = torch.nn.Linear(inputSize, inputSize)
        self.sigmoid = nn.Sigmoid()
        self.fc2 = torch.nn.Linear(inputSize, outputSize)

    def forward(self, x):
      out = self.fc1(x)
      out = self.sigmoid(out)
      out = self.fc2(out)
      return out

In [73]:
def get_device():
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu') # don't have GPU 
    return device
    
def df_to_tensor(df):
    device = get_device()
    return torch.from_numpy(df.values).float().to(device)

In [74]:
x = df_to_tensor(df1)
y = df_to_tensor(train["realized edge"])

In [75]:
model = linearRegression(13, 1)

In [76]:
learning_rate = 0.01
l = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [None]:
num_epochs = 25
for epoch in range(num_epochs):
  y_pred = model(x)
  loss = l(y_pred, y)
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()

  return F.mse_loss(input, target, reduction=self.reduction)


In [30]:
counter_party = pd.read_csv("test_counterparty.csv")

In [31]:
test["counterparty"] = counter_party["counterparty"]

In [16]:
test

Unnamed: 0,trade id,pred edge,stock,location,price,size,dir,counterparty (naive),sm sentiment (naive),counterparty
0,0,11.785579,C,NYSE,14.727382,1346.0,B,MF,9.119066,MF
1,1,8.490506,C,CSE,15.016002,882.0,S,MF,-48.652384,R
2,2,0.687709,C,NYSE,15.169420,423.0,B,MF,3.536848,MF
3,3,7.352720,D,CSE,2026.572586,8.0,B,MF,48.257293,MF
4,4,2.995660,C,NYSE,15.027723,523.0,B,MF,-66.590908,MF
...,...,...,...,...,...,...,...,...,...,...
99995,99995,18.867160,D,NYSE,1995.388892,1.0,B,HF,-85.018229,R
99996,99996,11.175501,B,NASDAQ,99.420082,51.0,S,MF,43.210479,MF
99997,99997,2.351071,A,NASDAQ,40.410974,270.0,S,HF,29.156088,HF
99998,99998,0.043775,C,NYSE,15.090556,787.0,B,MF,-91.163595,MF


In [27]:
test

Unnamed: 0,trade id,pred edge,stock,location,price,size,dir,counterparty (naive),sm sentiment (naive),counterparty,adj sent,sqrtvol
0,0,11.785579,C,NYSE,14.727382,1346.0,B,MF,9.119066,MF,-813.126840,140.794376
1,1,8.490506,C,CSE,15.016002,882.0,S,MF,-48.652384,R,-4056.617908,115.083072
2,2,0.687709,C,NYSE,15.169420,423.0,B,MF,3.536848,MF,97.518335,80.104086
3,3,7.352720,D,CSE,2026.572586,8.0,B,MF,48.257293,MF,-1932.269816,127.328633
4,4,2.995660,C,NYSE,15.027723,523.0,B,MF,-66.590908,MF,-1503.332306,88.653816
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,18.867160,D,NYSE,1995.388892,1.0,B,HF,-85.018229,R,-1864.107398,44.669776
99996,99996,11.175501,B,NASDAQ,99.420082,51.0,S,MF,43.210479,MF,1271.946738,71.206911
99997,99997,2.351071,A,NASDAQ,40.410974,270.0,S,HF,29.156088,HF,365.067415,104.455555
99998,99998,0.043775,C,NYSE,15.090556,787.0,B,MF,-91.163595,MF,5202.894102,108.978288
