In [1]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:
!pip install auto-gptq
!pip install optimum
!pip install bitsandbytes
!pip install peft
!pip install datasets
!pip install trl
!pip install accelerate

Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate>=0.26.0 (from auto-gptq)
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from auto-gptq)
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.1.1-py3-none-any.whl (13.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m
Collecting

In [1]:
import re
import ast
import torch
import pandas as pd
import numpy as np
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from datasets import Dataset
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import transformers
from trl import SFTTrainer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [4]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
crypto_df = pd.read_csv("/content/drive/MyDrive/Crypto_News_Prepped.csv")
crypto_df

Unnamed: 0,date,sentiment,source,subject,text,title,url
0,2023-12-19 06:40:41,"{'class': 'negative', 'polarity': -0.1, 'subje...",CryptoNews,altcoin,Grayscale CEO Michael Sonnenshein believes the...,Grayscale CEO Calls for Simultaneous Approval ...,https://cryptonews.comhttps://cryptonews.com/n...
1,2023-12-19 06:03:24,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,"In an exclusive interview with CryptoNews, Man...",Indian Government is Actively Collaborating Wi...,https://cryptonews.comhttps://cryptonews.com/n...
2,2023-12-19 05:55:14,"{'class': 'positive', 'polarity': 0.05, 'subje...",CryptoNews,blockchain,According to the Federal Court ruling on Decem...,Judge Approves Settlement: Binance to Pay $1.5...,https://cryptonews.comhttps://cryptonews.com/n...
3,2023-12-19 05:35:26,"{'class': 'positive', 'polarity': 0.5, 'subjec...",CoinTelegraph,blockchain,Some suggest EVM inscriptions are the latest w...,Why a gold rush for inscriptions has broken ha...,https://cointelegraph.com/news/inscriptions-ev...
4,2023-12-19 05:31:08,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CoinTelegraph,ethereum,A decision by bloXroute Labs to start censorin...,‘Concerning precedent’ — bloXroute Labs' MEV r...,https://cointelegraph.com/news/concerning-prec...
...,...,...,...,...,...,...,...
31032,2021-10-27 15:17:00,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,defi,Cream Finance (CREAM) suffered another flash l...,Cream Finance Suffers Another Exploit as Attac...,https://cryptonews.com/news/cream-finance-suff...
31033,2021-10-19 13:39:00,"{'class': 'positive', 'polarity': 0.1, 'subjec...",CryptoNews,blockchain,Banque de France disclosed the results of its ...,French Central Bank's Blockchain Bond Trial Br...,https://cryptonews.com/news/french-central-ban...
31034,2021-10-18 13:58:00,"{'class': 'positive', 'polarity': 0.14, 'subje...",CryptoNews,blockchain,Advancing its project to become \x9caÂ\xa0meta...,"Facebook To Add 10,000 Jobs In EU For Metavers...",https://cryptonews.com/news/facebook-to-add-10...
31035,2021-10-15 00:00:00,"{'class': 'neutral', 'polarity': 0.0, 'subject...",CryptoNews,blockchain,Chinese companies are still topping the blockc...,Tech Crackdown Hasn't Halted Chinese Firms' Bl...,https://cryptonews.com/news/tech-crackdown-has...


In [7]:
# Split the data into training and testing sets for each sentiment category
# ("positive", "neutral", "negative"), allocating number of samples, ensuring
# that each sentiment category has 6000 samples for training and 2000 samples for testing

X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(crypto_df[crypto_df.sentiment==sentiment],
                                    train_size=4700,
                                    test_size=1800,
                                    random_state=42)

    # Append the training and testing sets for each sentiment category to separate lists
    X_train.append(train)
    X_test.append(test)

In [8]:
# Concatenate the training sets for all sentiment categories into a single DataFrame and shuffle it
X_train = pd.concat(X_train).sample(frac=1, random_state=10)

# Concatenate the test sets for all sentiment categories into a single DataFrame
X_test = pd.concat(X_test)

In [9]:
X_train["sentiment"].value_counts()

sentiment
neutral     4700
negative    4700
positive    4700
Name: count, dtype: int64

In [10]:
# Select 50 random samples from each sentiment category that were not included in the training or
# testing sets, ensuring that each sentiment category has  samples for evaluation

eval_idx = [idx for idx in crypto_df.index if idx not in list(X_train.index) + list(X_test.index)]
X_eval = crypto_df[crypto_df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))

# Reset the index of the training set and drop the old index.
X_train = X_train.reset_index(drop=True)

In [11]:
X_eval["sentiment"].value_counts()

sentiment
negative    50
neutral     50
positive    50
Name: count, dtype: int64

In [12]:
X_train.shape, X_test.shape, X_eval.shape

((14100, 9), (5400, 9), (150, 9))

In [13]:
dataset = {
    "train" : Dataset.from_pandas(X_train),
    "test" : Dataset.from_pandas(X_test),
    "val" : Dataset.from_pandas(X_eval)
}
dataset

{'train': Dataset({
     features: ['date', 'source', 'subject', 'text', 'title', 'url', 'sentiment', 'polarity', 'subjectivity'],
     num_rows: 14100
 }),
 'test': Dataset({
     features: ['date', 'source', 'subject', 'text', 'title', 'url', 'sentiment', 'polarity', 'subjectivity', '__index_level_0__'],
     num_rows: 5400
 }),
 'val': Dataset({
     features: ['date', 'source', 'subject', 'text', 'title', 'url', 'sentiment', 'polarity', 'subjectivity', '__index_level_0__'],
     num_rows: 150
 })}

In [14]:
#import accelerate

# Load Tinyllama model

model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
pad_token = "<pad>"

# Load Tokeniser
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.add_special_tokens({"pad_token" : pad_token})
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="auto", # automatically figures out how to best use CPU + GPU for loading model
                                             trust_remote_code=False, # prevents running custom model files on device
                                             revision="main") # which version of model to use

model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embedding(32004, 2048)

In [20]:
tokenizer.bos_token, tokenizer.bos_token_id


('<s>', 1)

In [21]:
tokenizer.eos_token, tokenizer.eos_token_id


('</s>', 2)

In [22]:
tokenizer.pad_token, tokenizer.pad_token_id

('<pad>', 32000)

In [23]:
pad_token in tokenizer.get_vocab()

True

In [15]:
import inspect

def format_example(example):
  return inspect.cleandoc(f"""
  ### Title:
  {example["title"]}
  ### Text:
  {example["text"]}
  ### Prediction:
  subject : {example["subject"]}
  sentiment : {example["sentiment"]}
  """
  )

In [25]:
print(format_example(dataset['train'][0]))

### Title:
Crypto’s Rapid Growth Pushed Canada to Speed up Regulations
### Text:
The Canadian authorities have started consultations with shareholders to determine what regulations to impose on the crypto industry.
### Prediction:
subject : altcoin
sentiment : neutral


In [16]:
# Training

model.pad_toke_id = tokenizer.pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [27]:
model.config

LlamaConfig {
  "_name_or_path": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 22,
  "num_key_value_heads": 4,
  "pad_token_id": 32000,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 32008
}

In [28]:
# check model architecture to identify where to apply LORA
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32008, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [17]:
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.1,
        r=128,
        bias="none",
        target_modules=[
            "self_attn.k_proj",
            "self_attn.o_proj",
            "self_attn.q_proj",
            "self_attn.v_proj",
            "mlp.down_proj",
            "mlp.gate_proj",
            "mlp.up_proj"
            ],
        task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 100,925,440 || all params: 1,200,990,208 || trainable%: 8.403518973570183


In [18]:
# Training on completions

response_template = "\n### Prediction:"
response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

In [19]:
#Training Arguements
training_arguments = TrainingArguments(
    output_dir="./results",                   # directory to save and repository id
    num_train_epochs=1,                       # number of training epochs
    per_device_train_batch_size=4,            # batch size per device during training
    gradient_accumulation_steps=4,            # number of steps before performing a backward/update pass
    #gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="adamw_torch",
    #save_steps=0,
    logging_steps=10,                         # log every 10 steps
    learning_rate=1e-4,                       # learning rate, based on QLoRA paper
    #weight_decay=0.001,
    fp16=True,
    #bf16=False,
    max_grad_norm=1.0,                        # max gradient norm based on QLoRA paper
    #max_steps=-1,
    eval_steps=0.2,
    warmup_ratio=0.1,                         # warmup ratio based on QLoRA paper
    #group_by_length=True,
    lr_scheduler_type="constant",             # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    save_strategy="epoch",                    # save checkpoint every epoch
    evaluation_strategy='steps',
    save_safetensors=True,
    seed=0
)

In [20]:
def format_prompt(example):
  output = []
  for i in range(len(example['title'])):
    text = inspect.cleandoc(f"""
  ### Title:
  {example["title"]}
  ### Text:
  {example["text"]}
  ### Prediction:
  subject : {example["subject"]}
  sentiment : {example["sentiment"]}
  """
  )
    output.append(text)
  return output

In [21]:
for param in model.parameters():
    param.requires_grad = True

In [22]:
# Free up GPU memory
torch.cuda.empty_cache()

In [23]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    #peft_config=peft_config,
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=format_prompt,
    data_collator=collator,
    packing=False,
)

# Train model
trainer.train()

Map:   0%|          | 0/14100 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

['Binance CEO: Using Crypto to Avoid Sanctions Is A Myth', 'Kraken receives virtual asset service provider authorization in Ireland ahead of MiCA vote', '236K BTC Sold by Large Institutions Since Terra’s Implosion in May', 'Study: Insider trading occurs in 10% to 25% of cryptocurrency listings', 'Nifty News: Total BAYC thefts crack $18.5 million, “Ape Now, Pay Later” loans come for NFTs and more', 'Bitcoin and Ethereum Target Fresh Lows, Altcoins Decline', "Voyager's $1B sale to Binance.US put on hold by US court", "China's Fujian Province Has Processed $22bn Worth of Digital Yuan \\x93 Are CBDCs Taking Over?", "Bitcoin hodlers will 'soon see why' $21.6K BTC price pump is fake — trader", 'Mintlayer: Shaping the Future of DeFi on Bitcoin', 'Litecoin Foundation’s managing director shares his thoughts on decentralized money', 'Ethereum Mining Difficulty Delayed Again Ahead of Big Transition', 'Cambodia’s Bakong Digital Currency Teams Up with Alipay for Expanded Reach', 'Ukraine DAO raises

OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 31.06 MiB is free. Process 137714 has 14.71 GiB memory in use. Of the allocated memory 14.43 GiB is allocated by PyTorch, and 161.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)