

Initializing and training the LLAMA model for use

This notebook was put together from the following resources, adapted for our purpose of classifying geolocation data.


https://huggingface.co/docs/transformers/en/training


https://github.com/artidoro/qlora


https://github.com/adidror005/youtube-videos/blob/main/LLAMA_3_Fine_Tuning_for_Sequence_Classification_Actual_Video.ipynb



In [None]:
# Clear pip's download cache before the pinned packages are installed in the next cell.
!pip cache purge

Files removed: 16


In [None]:
# !pip install transformers huggingface-hub datasets -qqq
# Install exact, pinned versions of the Hugging Face / QLoRA packages this notebook uses.
%pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"



Prepare the data for training and evaluation. We load the dataset into a pandas DataFrame and then define the training and evaluation splits that the LLAMA model will use.

In [None]:
import pandas as pd
import tarfile
import re
import json

def create_dataframe_from_tar(file_path):
    """Build a (Country, Description) DataFrame from a gzipped tar archive.

    Every regular file whose name ends in ``output.txt`` is read as UTF-8 text.
    The country is the part of the member path that precedes ``_canvas_`` in a
    ``<country>_canvas_<suffix>/`` path component; members whose path has no
    such component are skipped. A member that fails to read or decode is
    reported with a printed message and skipped.

    Args:
        file_path: Path to the ``.tar.gz`` archive.

    Returns:
        A DataFrame with the columns ``Country`` and ``Description`` (the
        columns are present even when no member matched), or ``None`` when
        ``file_path`` does not exist.
    """
    country_pattern = re.compile(r'([^/]+)_canvas_([^/]+)/')
    data = []
    try:
        with tarfile.open(file_path, 'r:gz') as tar:
            for member in tar.getmembers():
                if not (member.isfile() and member.name.endswith('output.txt')):
                    continue
                try:
                    extracted = tar.extractfile(member)
                    if extracted is None:
                        # Nothing readable behind this member; skip it.
                        continue
                    # Close the per-member file object as soon as it has been read.
                    with extracted as f:
                        content = f.read().decode('utf-8')
                    match = country_pattern.search(member.name)
                    if match:
                        data.append({
                            'Country': match.group(1),
                            'Description': content.strip(),
                        })
                except Exception as e:
                    print(f"Error processing file {member.name}: {e}")
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    # Name the columns explicitly so an archive without matching members still
    # yields a frame that later cells can group by 'Country'.
    return pd.DataFrame(data, columns=['Country', 'Description'])

file_path = '/content/drive/MyDrive/llava_flickr_geoguessr_data.tar.gz'

# The archive lives on Google Drive. On a fresh Colab runtime Drive is not yet
# mounted when this cell runs, so mount it here if the file is not visible.
import os

if not os.path.exists(file_path):
    try:
        from google.colab import drive
        drive.mount('/content/drive')
    except ImportError:
        # Not running on Colab: fall through and let the loader report the missing file.
        pass

df = create_dataframe_from_tar(file_path)

if df is not None:
    print(df.head())

  Country                                        Description
0  Mexico  The image shows a rural area with a dirt road ...
1   Japan  The image shows a street scene in a city, with...
2   India  The image depicts a busy street in a foreign c...
3   Japan  The image depicts a parking lot filled with va...
4   Spain  The image shows a large, open field with a roa...


In [None]:
# Display the loaded DataFrame (Country / Description columns).
df

Unnamed: 0,Country,Description
0,Mexico,The image shows a rural area with a dirt road ...
1,Japan,"The image shows a street scene in a city, with..."
2,India,The image depicts a busy street in a foreign c...
3,Japan,The image depicts a parking lot filled with va...
4,Spain,"The image shows a large, open field with a roa..."
...,...,...
33101,Russia,The image shows a rural countryside setting wi...
33102,Belgium,The image shows a street scene with a mix of t...
33103,Argentina,The image shows a rural countryside with a two...
33104,Norway,The image shows a snowy street with a car driv...


In [None]:
# Cull country entries that don't have enough data to train from:
# keep only the rows whose country occurs at least 4 times.
rows_per_country = df.groupby('Country')['Country'].transform('size')
df = df[rows_per_country >= 4]

In [None]:
# Add a categorical copy of the country name and its integer category code,
# which serves as the classification target.
df = df.assign(
    country=df['Country'].astype('category'),
    target=lambda frame: frame['country'].cat.codes,
)
df.head()

Unnamed: 0,Country,Description,country,target
0,Mexico,The image shows a rural area with a dirt road ...,Mexico,56
1,Japan,"The image shows a street scene in a city, with...",Japan,41
2,India,The image depicts a busy street in a foreign c...,India,36
3,Japan,The image depicts a parking lot filled with va...,Japan,41
4,Spain,"The image shows a large, open field with a roa...",Spain,76


In [None]:
# List the country categories; a category's position in this index is its integer code.
df['country'].cat.categories

Index(['Aland', 'Albania', 'Andorra', 'Argentina', 'Australia', 'Austria',
       'Bangladesh', 'Belgium', 'Bhutan', 'Bolivia', 'Botswana', 'Brazil',
       'Bulgaria', 'Cambodia', 'Canada', 'Chile', 'China', 'Colombia',
       'Croatia', 'Curacao', 'Czechia', 'Denmark', 'Ecuador', 'Egypt',
       'Estonia', 'Eswatini', 'Finland', 'France', 'Germany', 'Ghana',
       'Greece', 'Greenland', 'Guam', 'Guatemala', 'Hungary', 'Iceland',
       'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Jersey',
       'Jordan', 'Kenya', 'Kyrgyzstan', 'Laos', 'Latvia', 'Lebanon', 'Lesotho',
       'Lithuania', 'Luxembourg', 'Macao', 'Madagascar', 'Malaysia', 'Malta',
       'Mexico', 'Mongolia', 'Montenegro', 'Netherlands', 'Nigeria', 'Norway',
       'Pakistan', 'Palestine', 'Peru', 'Philippines', 'Poland', 'Portugal',
       'Reunion', 'Romania', 'Russia', 'Senegal', 'Serbia', 'Singapore',
       'Slovakia', 'Slovenia', 'Spain', 'Sweden', 'Switzerland', 'Taiwan',
       'Thailand', 'Tuni

In [None]:
import heapq

# Map each integer class code to its country name.
country_map = dict(enumerate(df['country'].cat.categories))

# Print the ten entries whose country names compare largest (i.e. sort last).
country_keys = heapq.nlargest(10, country_map, key=country_map.get)
for code in country_keys:
    print(f"{code}: {country_map[code]}")

86: Vietnam
85: Uruguay
84: Ukraine
83: Uganda
82: Turkey
81: Tunisia
80: Thailand
79: Taiwan
78: Switzerland
77: Sweden


In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim; it rebuilds the same
# code -> country-name mapping and prints the same ten entries. One of the two can go.
country_map = {code: category for code, category in enumerate(df['country'].cat.categories)}
import heapq
# Ten keys whose mapped country names compare largest.
country_keys = heapq.nlargest(10, country_map, key=country_map.get)
for key in country_keys:
    print(f"{key}: {country_map[key]}")

86: Vietnam
85: Uruguay
84: Ukraine
83: Uganda
82: Turkey
81: Tunisia
80: Thailand
79: Taiwan
78: Switzerland
77: Sweden


In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, stratified by country.
df_train_val, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Country'],
)
# Split the remaining rows 75/25 into training and validation, again stratified by country.
df_train, df_val = train_test_split(
    df_train_val,
    test_size=0.25,
    random_state=42,
    stratify=df_train_val['Country'],
)

n_train, n_val, n_test = len(df_train), len(df_val), len(df_test)

print("Train size:", n_train)
print("Validation size:", n_val)
print("Test size:", n_test)

print(f"Original DataFrame size: {len(df)}")
print(f"Train DataFrame size: {n_train}")
print(f"Validation DataFrame size: {n_val}")
print(f"Test DataFrame size: {n_test}")

for split_frame in (df_train, df_val, df_test):
    print(split_frame.shape)

Train size: 19850
Validation size: 6617
Test size: 6617
Original DataFrame size: 33084
Train DataFrame size: 19850
Validation DataFrame size: 6617
Test DataFrame size: 6617
(19850, 4)
(6617, 4)
(6617, 4)


In [None]:
# Import libraries needed
from huggingface_hub import snapshot_download, login
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

In [None]:
# Convert each split to a Hugging Face Dataset, leaving out the categorical
# 'country' helper column (the string 'Country' and integer 'target' columns stay).
dataset_train = Dataset.from_pandas(df_train.drop(columns='country'))
dataset_val = Dataset.from_pandas(df_val.drop(columns='country'))
dataset_test = Dataset.from_pandas(df_test.drop(columns='country'))

In [None]:
# Shuffle the training split with a fixed seed so the order is reproducible.
dataset_train_shuffled = dataset_train.shuffle(seed=42)

In [None]:
# Bundle the three splits under the keys 'train', 'val' and 'test'.
dataset = DatasetDict({
    'train': dataset_train_shuffled,
    'val': dataset_val,
    'test': dataset_test
})
dataset

DatasetDict({
    train: Dataset({
        features: ['Country', 'Description', 'target', '__index_level_0__'],
        num_rows: 19850
    })
    val: Dataset({
        features: ['Country', 'Description', 'target', '__index_level_0__'],
        num_rows: 6617
    })
    test: Dataset({
        features: ['Country', 'Description', 'target', '__index_level_0__'],
        num_rows: 6617
    })
})

In [None]:
# Inspect the training split (features and row count).
dataset['train']

Dataset({
    features: ['Country', 'Description', 'target', '__index_level_0__'],
    num_rows: 19850
})

In [None]:
# Relative frequency of each target class in the training split.
df_train.target.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
41,0.116071
27,0.107960
11,0.070126
70,0.053249
4,0.051486
...,...
31,0.000353
0,0.000252
48,0.000252
19,0.000252


In [None]:
# Inverse-frequency class weights, ordered by class code and normalised to sum to 1.
class_frequencies = df_train.target.value_counts(normalize=True).sort_index()
inverse_frequencies = torch.tensor((1 / class_frequencies).tolist())
class_weights = inverse_frequencies / inverse_frequencies.sum()
class_weights

tensor([0.0687, 0.0137, 0.0430, 0.0008, 0.0003, 0.0017, 0.0054, 0.0026, 0.0286,
        0.0049, 0.0040, 0.0002, 0.0026, 0.0048, 0.0004, 0.0018, 0.0430, 0.0023,
        0.0045, 0.0687, 0.0022, 0.0029, 0.0061, 0.0202, 0.0058, 0.0090, 0.0005,
        0.0002, 0.0008, 0.0053, 0.0023, 0.0491, 0.0687, 0.0073, 0.0034, 0.0107,
        0.0036, 0.0020, 0.0020, 0.0018, 0.0007, 0.0001, 0.0430, 0.0067, 0.0044,
        0.0078, 0.0093, 0.0048, 0.0687, 0.0088, 0.0041, 0.0229, 0.0430, 0.0430,
        0.0014, 0.0104, 0.0006, 0.0069, 0.0172, 0.0010, 0.0046, 0.0008, 0.0312,
        0.0123, 0.0021, 0.0026, 0.0007, 0.0024, 0.0202, 0.0017, 0.0003, 0.0076,
        0.0090, 0.0008, 0.0053, 0.0086, 0.0005, 0.0008, 0.0033, 0.0010, 0.0006,
        0.0065, 0.0021, 0.0104, 0.0051, 0.0098, 0.0382])

In [None]:
# Login to Hugging Face
# The access token is read from the HF_TOKEN environment variable, or prompted for,
# so that it is never stored in the notebook.
# NOTE: the token that was previously hard-coded in this cell has been exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    from getpass import getpass
    hf_token = getpass("Hugging Face access token: ")
login(token=hf_token)

model_dir = "./models/Meta-Llama-3-8B"
# Repository id of the checkpoint that is downloaded into model_dir.
model_name = "meta-llama/Meta-Llama-3-8B"
# model_dir = "./models/Llama-3.2-1B"
# snapshot_download(repo_id="meta-llama/Meta-Llama-3.2-1B", local_dir=model_dir)
snapshot_download(repo_id=model_name, local_dir=model_dir)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 17 files:   0%|          | 0/17 [00:00<?, ?it/s]

'/content/models/Meta-Llama-3-8B'

In [None]:
# bitsandbytes settings: load weights in 4-bit NF4 with double quantisation
# enabled, and use bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# LoRA adapter settings for sequence classification: rank-16 adapters on the
# four attention projections, with 5% adapter dropout and no bias terms trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='SEQ_CLS',
)

In [None]:
# Load the quantised base model with a classification head.
# The head is sized from the code -> country mapping rather than a hard-coded
# count, so it stays correct if the country filter or dataset changes.
model = AutoModelForSequenceClassification.from_pretrained(
    model_dir,
    quantization_config=quantization_config,
    num_labels=len(country_map),
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ./models/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [None]:
# Apply PEFT's k-bit training preparation to the quantised model.
# NOTE(review): rebinding `model` makes this cell order-dependent; run it exactly once after loading.
model = prepare_model_for_kbit_training(model)

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [None]:
# Wrap the model with the LoRA adapters described by lora_config.
model = get_peft_model(model, lora_config)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
        

In [None]:
# Load the tokenizer from the same local checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_dir, add_prefix_space=True)

# Reuse the end-of-sequence token for padding (both the id and the token string are set).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Keep the model's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
# Disable the generation cache — presumably because it is not needed while training (TODO confirm).
model.config.use_cache = False
model.config.pretraining_tp = 1

In [None]:
# Collect the test-set descriptions and peek at the first two.
sentences = df_test.Description.tolist()
sentences[0:2]

['The scene is set in a city intersection with a group of people crossing the street. There are two men and a woman crossing the street, and they are all wearing backpacks. The intersection is surrounded by tall buildings, which are likely a mix of residential and commercial structures. The street is busy with traffic, including cars and a truck, and there are multiple traffic lights to control the flow of vehicles.\n\nThe people in the image are of different races, and they appear to be diverse in their appearances. The cars on the street are of various makes and models, and the traffic lights are in different colors, indicating different traffic signals. The street signs are in a foreign language, which suggests that the city is located in a non-English speaking country.',
 'The image shows a group of people walking on a bridge over a body of water, possibly a river or a lake. The bridge is a long, white structure that spans across the water. The people are walking in a line, with so

In [None]:
# Display the full frame, now including the 'country' and 'target' columns.
df

Unnamed: 0,Country,Description,country,target
0,Mexico,The image shows a rural area with a dirt road ...,Mexico,56
1,Japan,"The image shows a street scene in a city, with...",Japan,41
2,India,The image depicts a busy street in a foreign c...,India,36
3,Japan,The image depicts a parking lot filled with va...,Japan,41
4,Spain,"The image shows a large, open field with a roa...",Spain,76
...,...,...,...,...
33101,Russia,The image shows a rural countryside setting wi...,Russia,70
33102,Belgium,The image shows a street scene with a mix of t...,Belgium,7
33103,Argentina,The image shows a rural countryside with a two...,Argentina,3
33104,Norway,The image shows a snowy street with a car driv...,Norway,61


In [None]:
# Test predictions on Description
# sentences = df_test.Description.tolist()

# batch_size = 32
# all_outputs = []

# for i in range(0, len(sentences), batch_size):
#     batch_sentences = sentences[i:i + batch_size]

#     inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     inputs = {k: v.to('cuda' if torch.cuda.is_available() else 'cpu') for k, v in inputs.items()}

#     with torch.no_grad():
#         outputs = model(**inputs)
#         all_outputs.append(outputs['logits'])

In [None]:
# final_outputs = torch.cat(all_outputs, dim=0)
# final_outputs

In [None]:
# final_outputs.argmax(axis=1)

In [None]:
# df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()
# df_test['predictions']

In [None]:
# df_test['predictions']=df_test['predictions'].apply(lambda l:country_map[l])
# df_test['predictions']

In [None]:
def get_performance_metrics(df_test):
    """Print classification metrics for a frame of true and predicted countries.

    Reads the true labels from ``df_test.Country`` and the predicted labels
    from ``df_test.predictions``, then prints the confusion matrix, the
    classification report, the balanced accuracy and the plain accuracy.
    Nothing is returned.
    """
    actual, predicted = df_test.Country, df_test.predictions

    print("Confusion Matrix:")
    print(confusion_matrix(actual, predicted))

    print("\nClassification Report:")
    print(classification_report(actual, predicted))

    balanced = balanced_accuracy_score(actual, predicted)
    print("Balanced Accuracy Score:", balanced)
    plain = accuracy_score(actual, predicted)
    print("Accuracy Score:", plain)

In [None]:
# get_performance_metrics(df_test)

In [None]:
MAX_LEN = 1024
col_to_delete = ['Description']

def llama_preprocessing_function(examples):
    """Tokenize a batch of descriptions, truncating each to at most MAX_LEN tokens."""
    descriptions = examples['Description']
    return tokenizer(descriptions, truncation=True, max_length=MAX_LEN)

# Tokenize every split, drop the raw text column, and rename 'target' to 'label'.
tokenized_datasets = (
    dataset
    .map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
    .rename_column("target", "label")
)
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/19850 [00:00<?, ? examples/s]

Map:   0%|          | 0/6617 [00:00<?, ? examples/s]

Map:   0%|          | 0/6617 [00:00<?, ? examples/s]

In [None]:
# Collator built from the tokenizer; it pads the examples of each batch.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute evaluation metrics from the model's logits and the true labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each row
            is the argmax of the logits over axis 1.

    Returns:
        A dict with the keys ``'balanced_accuracy'`` and ``'accuracy'``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    # scikit-learn metrics take (y_true, y_pred). Balanced accuracy averages
    # recall per *true* class, so swapping the arguments changes the result.
    return {
        'balanced_accuracy': balanced_accuracy_score(labels, predicted),
        'accuracy': accuracy_score(labels, predicted),
    }

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights."""

    def __init__(self, *args, class_weights=None, **kwargs):
        """Create the trainer.

        Args:
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
            class_weights: Optional per-class weights (tensor, list or array)
                indexed by class code. ``None`` means an unweighted loss.
        """
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            # as_tensor accepts lists/arrays as well as tensors, and unlike
            # torch.tensor(existing_tensor) does not emit the copy-construct warning.
            self.class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the (optionally class-weighted) cross-entropy loss for a batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass and used as the loss target.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted and ignored; newer transformers
                releases pass this argument to ``compute_loss``.
        """
        labels = inputs.pop("labels").long()
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # weight=None is F.cross_entropy's default, so one call covers both cases.
        loss = F.cross_entropy(logits, labels, weight=self.class_weights)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Training hyper-parameters: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='sentiment_classification',
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

In [None]:
# Assemble the trainer: train on the 'train' split, evaluate on 'val', and
# weight the loss with the inverse-frequency class weights.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
)

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)


In [None]:


# valid_labels = set(df_cull['target'].unique())
# df_filtered = df_cull[df_cull['target'].isin(valid_labels)]
# dataset_train = Dataset.from_pandas(df_filtered.drop('country',axis=1))


In [None]:
# NOTE(review): this cell duplicates an earlier cell that already defines MAX_LEN,
# col_to_delete and llama_preprocessing_function and builds tokenized_datasets.
# The trainer was constructed before this cell, from the earlier tokenized_datasets.
MAX_LEN = 1024
col_to_delete = ['Description']

def llama_preprocessing_function(examples):
    """Tokenize a batch of descriptions, truncating each to at most MAX_LEN tokens."""
    return tokenizer(examples['Description'], truncation=True, max_length=MAX_LEN)

# Tokenize every split, drop the raw text column, and rename 'target' to 'label'.
tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
tokenized_datasets = tokenized_datasets.rename_column("target", "label")
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/19850 [00:00<?, ? examples/s]

Map:   0%|          | 0/6617 [00:00<?, ? examples/s]

Map:   0%|          | 0/6617 [00:00<?, ? examples/s]

In [None]:
# Sanity check before training: every label must be a valid class index for the
# model's classification head. The bound comes from the model configuration
# instead of a hard-coded class count.
num_labels = model.config.num_labels

for split_key, split_name in (("train", "training"), ("val", "validation")):
    print(f"Checking {split_name} dataset:")
    for example in tokenized_datasets[split_key]:
        if not (0 <= example["label"] < num_labels):
            print(f"Error: Invalid label {example['label']} in {split_name} dataset")
            break  # report only the first offending label per split

train_result = trainer.train()

Checking training dataset:
Checking validation dataset:


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjwesley1[0m ([33mjwesley1-the-university-of-new-mexico[0m). Use [1m`wandb login --relogin`[0m to force relogin


  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Balanced Accuracy,Accuracy
1,4.4508,4.254467,0.131025,0.081608


  return fn(*args, **kwargs)


In [None]:
def make_predictions(model, df_test, batch_size=64, max_length=512):
    """Predict a country for every row of ``df_test`` and store it in place.

    The descriptions in ``df_test.Description`` are tokenized and classified
    in batches; the predicted class codes are mapped to country names through
    ``country_map`` and written to ``df_test['predictions']``.

    Args:
        model: Sequence-classification model whose output exposes ``'logits'``.
        df_test: DataFrame with a ``Description`` column; modified in place.
        batch_size: Number of descriptions per forward pass.
        max_length: Truncation length in tokens.
            NOTE(review): training tokenizes with MAX_LEN = 1024 while this
            defaults to 512 — confirm whether the difference is intended.

    Returns:
        None. The result is the new ``predictions`` column on ``df_test``.
    """
    sentences = df_test.Description.tolist()
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Run inference in eval mode so dropout is off, then restore the previous mode.
    was_training = model.training
    model.eval()
    predicted_codes = []
    try:
        for start in range(0, len(sentences), batch_size):
            batch_sentences = sentences[start:start + batch_size]

            inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = model(**inputs)['logits']
            # Keep only the class indices (as Python ints) so the logits of
            # earlier batches do not pile up in device memory.
            predicted_codes.extend(logits.argmax(dim=1).tolist())
    finally:
        model.train(was_training)

    # An empty df_test simply gets an empty predictions column.
    df_test['predictions'] = [country_map[code] for code in predicted_codes]


make_predictions(model,df_test)

In [None]:
# Print the evaluation metrics for the test-set predictions.
get_performance_metrics(df_test)

In [None]:
# Record the training metrics, together with the training-set size, then
# log and persist them along with the trainer state.
metrics = train_result.metrics
max_train_samples = len(dataset_train)
# No sub-sampling is applied, so the sample count is the full training-set size.
metrics["train_samples"] = max_train_samples
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
# Save the trained model to the local "geo_model" directory.
trainer.save_model("geo_model")

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the data-loading cell near the top of this notebook already reads
# from /content/drive/MyDrive, so on a fresh runtime the mount is needed before
# that cell rather than only here.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp -r saved_model /content/drive/MyDrive/