# Tokenization

In [None]:
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch



def tokenization(inputpath):

    df = pd.read_csv(inputpath)

    df['sentence'] = df['sentence'].astype(str)

    tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
    model = AutoModel.from_pretrained("yiyanghkust/finbert-tone")

    tokens = tokenizer(df['sentence'].tolist(), padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        embeddings = model(**tokens).last_hidden_state.mean(dim=1) 

    labels = torch.tensor(df['Label'])

    return embeddings, labels




  from .autonotebook import tqdm as notebook_tqdm
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


# Splitting Data

In [5]:
from sklearn.model_selection import train_test_split

def prepare_data(inputpath):
    embeddings, labels = tokenization(inputpath)
    X = embeddings.numpy()
    y = labels.numpy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = prepare_data('../cleaning/concatenated.csv')


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


# Model

In [15]:
class ANN(torch.nn.Module):
    def __init__(self):
        super(ANN,self).__init__()
        self.fc2 = torch.nn.Linear(768, 512)
        self.fc3 = torch.nn.Linear(512, 256)
        self.fc4 = torch.nn.Linear(256, 2)

    def forward(self, x):
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)

        return x
    

In [16]:
ANN = ANN()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(ANN.parameters(), lr=0.001 , weight_decay=0.0001)

def trainclassifier(classifier, X_train, y_train):

    classifier.train()

    for epoch in range(100):
        inputs = torch.Tensor(torch.Tensor(X_train).float())
        labels = torch.Tensor(torch.Tensor(y_train).long())

        optimizer.zero_grad()

        outputs = ANN(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if epoch % 5 == 0:
            print('Epoch {}, Loss {}'.format(epoch,loss.item()))

    return ANN

classifier = trainclassifier(ANN, X_train, y_train)


Epoch 0, Loss 0.7024738192558289
Epoch 5, Loss 0.09602687507867813
Epoch 10, Loss 0.07050268352031708
Epoch 15, Loss 0.05883525311946869
Epoch 20, Loss 0.039825353771448135
Epoch 25, Loss 0.030922414734959602
Epoch 30, Loss 0.024480929598212242
Epoch 35, Loss 0.018110984936356544
Epoch 40, Loss 0.013544993475079536
Epoch 45, Loss 0.009159705601632595
Epoch 50, Loss 0.005838892888277769
Epoch 55, Loss 0.0034598554484546185
Epoch 60, Loss 0.0020118679385632277
Epoch 65, Loss 0.0012570340186357498
Epoch 70, Loss 0.0008497158414684236
Epoch 75, Loss 0.0006181419012136757
Epoch 80, Loss 0.0004804849158972502
Epoch 85, Loss 0.0003948535886593163
Epoch 90, Loss 0.00033835184876807034
Epoch 95, Loss 0.0002999391872435808
