In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

SEED = 42

In [2]:
data = pd.read_csv("stock_data.csv")
data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
data["Sentiment"].value_counts()

 1    3685
-1    2106
Name: Sentiment, dtype: int64

In [4]:
data_train, data_test = train_test_split(data, test_size=0.2, random_state=SEED)

data_train, data_val = train_test_split(data_train, test_size=0.2, random_state=SEED)

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [6]:
def tokenize(data):
    tokenized_data = data["Text"].apply(lambda x: tokenizer(x, padding=True, truncation=True)).apply(pd.Series)

    tokenized_data['text'] = data['Text']
    tokenized_data['label'] = data['Sentiment']
    return tokenized_data[['text', 'label', 'input_ids', 'attention_mask']]

tokenized_train = tokenize(data_train)
tokenized_val = tokenize(data_val)
tokenized_test = tokenize(data_test)

In [7]:
tokenized_data = tokenize(data)
compare = pd.DataFrame()
compare['text'] = tokenized_data['text']
compare['tokenized_text'] = tokenized_data["input_ids"].apply(lambda x: tokenizer.convert_ids_to_tokens(x))
compare

Unnamed: 0,text,tokenized_text
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,"[[CLS], kicker, ##s, on, my, watch, ##list, xi..."
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,"[[CLS], user, :, aa, ##p, movie, ., 55, %, ret..."
2,user I'd be afraid to short AMZN - they are lo...,"[[CLS], user, i, ', d, be, afraid, to, short, ..."
3,MNTA Over 12.00,"[[CLS], mn, ##ta, over, 12, ., 00, [SEP]]"
4,OI Over 21.37,"[[CLS], o, ##i, over, 21, ., 37, [SEP]]"
...,...,...
5786,Industry body CII said #discoms are likely to ...,"[[CLS], industry, body, ci, ##i, said, #, disc..."
5787,"#Gold prices slip below Rs 46,000 as #investor...","[[CLS], #, gold, prices, slip, below, rs, 46, ..."
5788,Workers at Bajaj Auto have agreed to a 10% wag...,"[[CLS], workers, at, baja, ##j, auto, have, ag..."
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...","[[CLS], #, share, ##market, live, :, sense, ##..."


In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [11]:
training_args = TrainingArguments(
    output_dir="sentiment_finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
#    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

KeyError: ignored

In [None]:
trainer.evaluate()

In [None]:
preds_output = trainer.predict(tokenized_test)

In [None]:
preds_output.metrics