In [1]:
import os
import numpy as np
import pandas as pd
import argparse

import torch
from torch_geometric.utils import dense_to_sparse
from torch_geometric.data import Data

from torch import nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from sklearn import metrics

# (change this)

In [2]:
# change these variables
DATA_ARCHIVE_DIR = "./data_archive"
news_or_twitter = "twitter"

# convert each .npy file to torch trainable

You need basically four things: 
1. **X** (the nodes' attr: right now is current month avg stock prices). You could modify it at cell 5
2. **y** (the next period of time's stock prices: right now is next month avg stock prices). You could modify it at cell 5
3. **edge_index** (graph edges: right now is based on co-mentions). You could modify it at cell 6
4. **edge_attr** (edges' attr: right now is # of co-mentions during that month). You could modify it at cell 6


**dataset** would be a list of Data object that could be directly used for training.

modify cell 8&9 to update **model architecture** (right now is simple) and **training process** (e.g. threshold, etc.)



# sort the files

In [3]:
# order the .npy files by time 
fps = {}
for fp in os.listdir(os.path.join(DATA_ARCHIVE_DIR,news_or_twitter)):
    if fp.split(".")[-1] == "npy":
        new_name = fp.split(".")[0]
        if len(new_name.split("_")[1]) == 1:
            new_name = new_name.split("_")[0] + "_" + "0" + new_name.split("_")[1]
        fps[fp] = new_name
sorted_fps = [k for k, v in sorted(fps.items(), key=lambda item: item[1])]

# get stock prices

In [4]:
# get the stock prices
stock_df = pd.read_csv(
            os.path.join(DATA_ARCHIVE_DIR,"stock","raw.csv"),
            usecols=["ticker_symbol", "Date", "Close"],
            parse_dates=["Date"],
        )

# Prepare X and y (change this)

In [5]:
X_y = []
for fp in sorted_fps:
    full_fp = os.path.join(DATA_ARCHIVE_DIR,news_or_twitter,fp)
    yr_month = fp.split(".")[0]
    yr = int(yr_month.split("_")[0])
    month = int(yr_month.split("_")[1])
    # no further data available
    if yr == 2021 and month == 10: 
        continue
    
    ######################################################## 
    # prepare X (change this if you want to add SEC emb, etc.)
    ########################################################
    curr = stock_df[(stock_df.Date.dt.year == yr) & (stock_df.Date.dt.month == month)]
    X = curr.pivot_table(
            index="Date", columns="ticker_symbol", values="Close"
        ).values.T
    X_tensor = torch.tensor(X)
    
    ########################################################
    # prepare y (change this if you want to change labels)
    ########################################################
    if month == 12:
        y_yr = yr+1
        y_month = 1
    else:
        y_yr = yr
        y_month = month + 1
    
    nxt = stock_df[(stock_df.Date.dt.year == y_yr) & (stock_df.Date.dt.month == y_month)]
    y = nxt.pivot_table(
            index="Date", columns="ticker_symbol", values="Close"
        ).values.T
    y = (y.mean(1) - X.mean(1)) / X.mean(1)
    y_tensor = torch.tensor(y)
    X_y.append((X_tensor,y_tensor))

# prepare edges index and attr (change this)

In [6]:
edges_lst = []
for fp in sorted_fps:
    full_fp = os.path.join(DATA_ARCHIVE_DIR,news_or_twitter,fp)
    yr_month = fp.split(".")[0]
    yr = int(yr_month.split("_")[0])
    month = int(yr_month.split("_")[1])
    # no further data available
    if yr == 2021 and month == 10: 
        continue
        
        
    edges = np.load(full_fp)
    edge_index, edge_attr = dense_to_sparse(torch.from_numpy(edges))
    ######################################################## 
    # modify edge index and edge attr (change this if you want to change graph structure)
    ########################################################
    edges_lst.append((edge_index,edge_attr))
    

## create dataset

In [7]:
dataset = []
for i in range(len(X_y)):
    X = X_y[i][0]
    y = X_y[i][1]
    edge_index = edges_lst[i][0]
    edge_attr = edges_lst[i][1]
    data = Data(x=X, y=y, edge_index=edge_index, edge_attr=edge_attr)
    dataset.append(data)

## model (change this for different NN structure)

In [8]:
class GNN(torch.nn.Module):
    def __init__(self, input_size, feature_size, output_size):
        super().__init__()
        self.conv = GCNConv(in_channels=input_size, out_channels=feature_size)
        self.activation = nn.ReLU()
        self.fc = nn.Linear(in_features=feature_size, out_features=output_size)
        
    
    def forward(self, data):
        node_attr = F.normalize(data.x.float(), dim=0)
        num_pad = self.conv.in_channels - node_attr.shape[1]
        node_attr = torch.cat(
            (
                node_attr,
                torch.zeros((node_attr.shape[0], num_pad))
            ),
            -1
        )
        edge_index = data.edge_index.long()
        edge_weight = F.normalize(data.edge_attr.float().reshape(-1, 1), dim=0)
        x = self.conv(
                    x=node_attr, 
                    edge_index=edge_index, 
                    edge_weight=edge_weight
                    )
        x = self.activation(x)
        x = self.fc(x)
        return torch.sigmoid(x)


# training starts below

In [9]:
from sklearn import metrics
class Trainer:
    def __init__(self, model, dataset, args):
        self.device = args.device
        self.model = model.to(self.device)
        self.dataset = dataset
        self.epochs = args.num_epochs
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr = args.learning_rate)
        self.criterion = nn.CrossEntropyLoss().to(self.device)
        self.val_idx = int(len(self.dataset) * (1-args.val_size))
        self.best_model_weights = self.model.state_dict()
        self.best_epoch = 0
        self.best_val_loss = float('inf')
        self.threshold = args.threshold
    def train(self):
        for epoch in range(1, self.epochs + 1):
            train_loss = 0.0
            val_loss = 0.0
            for i, data in enumerate(self.dataset):
                if i < self.val_idx:
                    loss = self._train_step(self.model, data)
                    train_loss += loss / self.val_idx
                else:
                    loss = self._val_step(self.model, data)
                    val_loss += loss / (len(self.dataset) - self.val_idx)
            # if epoch % 20 == 0 or epoch == self.epochs:
            #     print(f"""
            #         epoch {epoch}:
            #             train loss: {train_loss},
            #             val loss: {val_loss}
            #     """)
            if self.best_val_loss > val_loss:
                self.best_val_loss = val_loss
                self.best_epoch = epoch
                self.best_model_weights = model.state_dict()
        self.model.load_state_dict(self.best_model_weights)
        print(
            f"""
            best model loss is:
                val loss: {self.best_val_loss} @ epoch: {self.best_epoch}
            """
        )
        self._benchmark()
        return self.model

    def _train_step(self, model, data):
        self.optimizer.zero_grad()
        logits, target = self._shared_step(model, data)
        loss = self.criterion(logits, target)
        loss.backward()
        self.optimizer.step()
        return loss.item()
    def _val_step(self, model, data):
        with torch.no_grad():
            logits, target = self._shared_step(model, data)
            loss = self.criterion(logits, target)
            return loss.item()
    def _shared_step(self, model, data):
        data.x = data.x.to(self.device)
        data.edge_index = data.edge_index.to(self.device)
        data.edge_attr = data.edge_attr.to(self.device)
        target = data.y.long().to(self.device)
        logits = model(data)
        return logits, target
    def _benchmark(self):
        train_preds = []
        train_trues = []
        val_preds = []
        val_trues = []
        for i, data in enumerate(self.dataset):
            logits, target = self._shared_step(self.model, data)
            pred = logits.argmax(-1).cpu().numpy()
            target = target.cpu().numpy()
            if i < self.val_idx:
                train_preds.append(pred)
                train_trues.append(target)
            else:
                val_preds.append(pred)
                val_trues.append(target)
        train_preds = np.hstack(train_preds)
        train_trues = np.hstack(train_trues)
        val_preds = np.hstack(val_preds)
        val_trues = np.hstack(val_trues)
        print(
            f"""
                best model performance is:
                    train acc: {metrics.accuracy_score(train_trues, train_preds)}
                    val acc: {metrics.accuracy_score(val_trues, val_preds)}

                    train f1 score {metrics.f1_score(train_trues, train_preds)}
                    val f1 score {metrics.f1_score(val_trues, val_preds)}

                    train precision score {metrics.precision_score(train_trues, train_preds)}
                    val precision score {metrics.precision_score(val_trues, val_preds)}

                    train recall score {metrics.recall_score(train_trues, train_preds)}
                    val recall score {metrics.recall_score(val_trues, val_preds)}

                    num of pos prediction in training set {train_preds[train_preds == 1].shape[0]}
                    num of neg prediction in training set {train_preds[train_preds == 0].shape[0]}
                    num of pos prediction in val set {val_preds[val_preds == 1].shape[0]}
                    num of neg prediction in val set {val_preds[val_preds == 0].shape[0]}
            """
        )
        print(
            metrics.classification_report(val_trues, val_preds)
            )

# hyperparams (change this e.g. change logit threshold for predictions)

In [10]:
model = GNN(70, 32, 2)
args = dict(
    num_epochs = 100,
    learning_rate = 2e-5,
    device = "cpu",
    val_size = .2,
    ###########################
    # not implemented
    ###########################
    threshold = 0 
)

args = argparse.Namespace(**args)
trainer = Trainer(model, dataset, args)

In [11]:
model = trainer.train()


            best model loss is:
                val loss: 0.48381937046845763 @ epoch: 100
            

                best model performance is:
                    train acc: 1.0
                    val acc: 1.0

                    train f1 score 0.0
                    val f1 score 0.0

                    train precision score 0.0
                    val precision score 0.0

                    train recall score 0.0
                    val recall score 0.0

                    num of pos prediction in training set 0
                    num of neg prediction in training set 1392
                    num of pos prediction in val set 0
                    num of neg prediction in val set 348
            
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       348

    accuracy                           1.00       348
   macro avg       1.00      1.00      1.00       348
weighted avg       1.00      1.00      1.00       348



  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
