In [None]:
pip install transformers


In [None]:
pip install datasets

In [None]:
pip install evaluate

In [None]:
pip install transformers[torch]

In [13]:
from datasets import load_dataset
data = load_dataset("FinGPT/fingpt-sentiment-train")
dataset = data["train"]
X_train =[]
for row in dataset:
  X_train.append(row['input'])


Downloading readme:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/6.42M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/76772 [00:00<?, ? examples/s]

In [14]:
X_train1 = X_train[:100]
print(len(X_train1))
print(X_train[9]) # dataset ready

100
Financial terms were not disclosed .


In [15]:
from transformers import pipeline
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

#create a sentence classification pipeline
model = "distilbert-base-uncased-finetuned-sst-2-english"
sentimentanalysis_model = AutoModelForSequenceClassification.from_pretrained(model) # this will create a copy of the BERT model
sentimentanalysis_tokenizer = AutoTokenizer.from_pretrained(model) #this will take up the predefined tokenizer that resides in the model repo
classify = pipeline("sentiment-analysis", model= sentimentanalysis_model,tokenizer = sentimentanalysis_tokenizer)
sentiment_gen = classify(X_train1) # classifying the sentences
print(sentiment_gen)
for r in sentiment_gen:
  print(r)

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'NEGATIVE', 'score': 0.9480839371681213}, {'label': 'NEGATIVE', 'score': 0.9632577300071716}, {'label': 'NEGATIVE', 'score': 0.9928142428398132}, {'label': 'POSITIVE', 'score': 0.8527814149856567}, {'label': 'NEGATIVE', 'score': 0.9992583394050598}, {'label': 'POSITIVE', 'score': 0.9638770222663879}, {'label': 'NEGATIVE', 'score': 0.8742483854293823}, {'label': 'POSITIVE', 'score': 0.9984328150749207}, {'label': 'POSITIVE', 'score': 0.9949171543121338}, {'label': 'NEGATIVE', 'score': 0.9989768266677856}, {'label': 'NEGATIVE', 'score': 0.9983569979667664}, {'label': 'NEGATIVE', 'score': 0.9995392560958862}, {'label': 'NEGATIVE', 'score': 0.9950854182243347}, {'label': 'POSITIVE', 'score': 0.9902442097663879}, {'label': 'POSITIVE', 'score': 0.9084116220474243}, {'label': 'NEGATIVE', 'score': 0.9906347393989563}, {'label': 'NEGATIVE', 'score': 0.9988527297973633}, {'label': 'NEGATIVE', 'score': 0.9995718598365784}, {'label': 'POSITIVE', 'score': 0.9995423555374146}, {'label': '

In [None]:
# creating tokens and fine tuning the model without using pipeline
batch = sentimentanalysis_tokenizer(X_train1,padding = True,truncation = True, max_length = 500,return_tensors = "pt")# if pytorch tensors required )
#batch_tensor = torch.tensor(batch)
#print(batch_tensor)
with torch.no_grad(): #incremental training, turning off the gradient update
     outputs = sentimentanalysis_model(**batch) # unpack the dictionary using
     print("output",outputs)
     predictions = F.softmax(outputs.logits,dim=1)
     print("predictions",predictions)
     labels = torch.argmax(predictions,dim=1)
     print(labels)
     labels = [sentimentanalysis_model.config.id2label[label_id] for label_id in labels.tolist()]
     print(labels)


In [22]:
# save model and tokenizer for further use
save_dir = "save_new_model"
sentimentanalysis_model.save_pretrained(save_dir)
sentimentanalysis_tokenizer.save_pretrained(save_dir)
# to laad the model again , use model = AutoModelForSequenceClassification.from_pretrained(save_dir)

('save_new_model/tokenizer_config.json',
 'save_new_model/special_tokens_map.json',
 'save_new_model/vocab.txt',
 'save_new_model/added_tokens.json',
 'save_new_model/tokenizer.json')

In [None]:
#Fine Tune Model On Yelp Review Dataset

#1.prepare dataset

from datasets import load_dataset
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

yelp_review_dataset = load_dataset("yelp_review_full")
yelp_review_dataset["train"][100]

#2.Tokenize the dataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(data):
    return tokenizer(data["text"], padding="max_length", truncation=True)
tokenized_datasets = yelp_review_dataset.map(tokenize_function, batched=True) # apply tokenise on the whole dataset

training_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) # split dataset into training and eval
evaluation_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

#4.Load pretrained model and training arguments

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)


In [None]:

training_args = TrainingArguments(output_dir="test_trainer") # arguments that needed to be trained

metric = evaluate.load("accuracy") # define a evaluation metrics to check the performance of the model

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    print("logits",logits)
    print("labels",labels)
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") # define the arguments that you want to fine tune like epoch, learning rate, weight decay and others


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_dataset,
    eval_dataset=evaluation_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.301274,0.457


logits [[-1.6708539  -1.580868   -0.7467042   1.5813575   1.9689378 ]
 [-1.7516464  -1.4258888  -0.61818117  1.6569223   1.6960309 ]
 [ 1.7169452   0.258924   -0.6948955  -0.5178424  -0.26063216]
 ...
 [-1.8355628  -1.2533593  -0.4936489   1.4926646   1.3526435 ]
 [-2.17375    -0.7757919   0.05179653  1.1895063   0.719332  ]
 [ 0.07947701  1.7976655   0.403203   -0.71973807 -2.041103  ]]
labels [2 4 1 4 3 4 2 3 2 3 0 0 3 2 2 1 3 1 2 2 1 2 3 1 1 3 4 0 0 2 2 2 1 3 4 0 0
 1 3 2 0 2 0 0 3 0 3 2 3 0 1 1 3 3 4 4 1 4 1 3 1 0 0 1 4 1 4 3 2 4 1 0 3 3
 4 1 2 1 0 4 4 4 2 3 3 1 4 0 4 2 3 0 0 0 3 4 0 0 1 4 4 0 0 1 1 0 4 2 2 1 1
 4 0 4 0 3 2 0 4 4 4 2 0 0 0 1 3 0 2 0 3 2 2 2 0 3 4 3 0 1 0 1 0 0 4 3 3 1
 3 0 3 4 0 2 1 3 1 3 1 3 1 0 2 0 1 1 0 0 3 3 3 3 3 1 1 0 0 2 3 3 3 1 1 3 4
 0 1 1 2 2 1 3 0 2 1 1 4 0 2 4 1 1 2 3 3 2 0 1 2 4 4 1 4 2 3 0 1 0 0 2 4 3
 3 0 1 1 2 0 4 0 3 0 3 2 3 1 2 4 4 2 1 0 3 1 1 1 3 0 3 0 0 2 0 3 0 3 2 4 2
 2 4 4 1 1 4 4 4 0 4 0 3 1 0 1 2 2 3 0 1 0 4 1 0 4 3 0 0 3 2 1 4 4 3 0 0 0
 0

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.301274,0.457
