In [1]:
import numpy as np
import pandas as pd
from datasets import Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

SEED = 42

In [2]:
data = pd.read_csv("stock_data.csv")
data.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
data["Sentiment"].value_counts()

 1    3685
-1    2106
Name: Sentiment, dtype: int64

In [4]:
# Convert to 0/1
data["Sentiment"] = data["Sentiment"].apply(lambda x: 1 if x == 1 else 0)
data["Sentiment"].value_counts()

1    3685
0    2106
Name: Sentiment, dtype: int64

In [5]:
data_train, data_test = train_test_split(data, test_size=0.2, random_state=SEED)

data_train, data_val = train_test_split(data_train, test_size=0.2, random_state=SEED)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [7]:
def tokenize(data):
    tokenized_data = data["Text"].apply(lambda x: tokenizer(x, padding=True, truncation=True)).apply(pd.Series)

    tokenized_data['text'] = data['Text']
    tokenized_data['label'] = data['Sentiment']
    return tokenized_data[['text', 'label', 'input_ids', 'attention_mask']]

tokenized_train = tokenize(data_train)
tokenized_val = tokenize(data_val)
tokenized_test = tokenize(data_test)

In [8]:
tokenized_data = tokenize(data)
compare = pd.DataFrame()
compare['text'] = tokenized_data['text']
compare['tokenized_text'] = tokenized_data["input_ids"].apply(lambda x: tokenizer.convert_ids_to_tokens(x))
compare

Unnamed: 0,text,tokenized_text
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,"[[CLS], kicker, ##s, on, my, watch, ##list, xi..."
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,"[[CLS], user, :, aa, ##p, movie, ., 55, %, ret..."
2,user I'd be afraid to short AMZN - they are lo...,"[[CLS], user, i, ', d, be, afraid, to, short, ..."
3,MNTA Over 12.00,"[[CLS], mn, ##ta, over, 12, ., 00, [SEP]]"
4,OI Over 21.37,"[[CLS], o, ##i, over, 21, ., 37, [SEP]]"
...,...,...
5786,Industry body CII said #discoms are likely to ...,"[[CLS], industry, body, ci, ##i, said, #, disc..."
5787,"#Gold prices slip below Rs 46,000 as #investor...","[[CLS], #, gold, prices, slip, below, rs, 46, ..."
5788,Workers at Bajaj Auto have agreed to a 10% wag...,"[[CLS], workers, at, baja, ##j, auto, have, ag..."
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...","[[CLS], #, share, ##market, live, :, sense, ##..."


In [9]:
print(Dataset.from_pandas(tokenized_train))
print(tokenized_train.head())

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask', '__index_level_0__'],
    num_rows: 3705
})
                                                   text  label  \
1164  VXY AAP Blowout Huge Dump AAP Miss #FED pump V...      0   
342   my 526.20 bid not hit in aapl so i am expectin...      1   
2997  XCO looking good, user may be onto something h...      1   
104   user: SPX 1464.90 Wave iv would be 20 points l...      0   
455   dec 6 BB&T CM initiated SSYS with a buy  68. w...      0   

                                              input_ids  \
1164  [101, 1058, 18037, 9779, 2361, 6271, 5833, 412...   
342   [101, 2026, 4720, 2575, 1012, 2322, 7226, 2025...   
2997  [101, 1060, 3597, 2559, 2204, 1010, 5310, 2089...   
104   [101, 5310, 1024, 11867, 2595, 16333, 2549, 10...   
455   [101, 11703, 1020, 22861, 1004, 1056, 4642, 75...   

                                         attention_mask  
1164  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
342   [1, 1, 1,

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [13]:
training_args = TrainingArguments(
    output_dir="sentiment_finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
#    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Dataset.from_pandas(tokenized_train),
    eval_dataset=Dataset.from_pandas(tokenized_val),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=464, training_loss=0.4783710940130826, metrics={'train_runtime': 45.0213, 'train_samples_per_second': 164.589, 'train_steps_per_second': 10.306, 'total_flos': 83483955847512.0, 'train_loss': 0.4783710940130826, 'epoch': 2.0})

In [15]:
trainer.evaluate()

{'eval_loss': 0.46223926544189453,
 'eval_accuracy': 0.790722761596548,
 'eval_f1': 0.8355932203389831,
 'eval_runtime': 1.3385,
 'eval_samples_per_second': 692.579,
 'eval_steps_per_second': 43.333,
 'epoch': 2.0}

In [17]:
preds_output = trainer.predict(Dataset.from_pandas(tokenized_test))

In [18]:
preds_output.metrics

{'test_loss': 0.45344939827919006,
 'test_accuracy': 0.7920621225194133,
 'test_f1': 0.8383635144198525,
 'test_runtime': 2.7264,
 'test_samples_per_second': 425.108,
 'test_steps_per_second': 26.776}