In [24]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.27.0-py3-none-any.whl.metadata (18 kB)
Collecting huggingface-hub (from accelerate)
  Using cached huggingface_hub-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting safetensors>=0.3.1 (from accelerate)
  Using cached safetensors-0.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting fsspec (from torch>=1.10.0->accelerate)
  Downloading fsspec-2024.2.0-py3-none-any.whl.metadata (6.8 kB)
Downloading accelerate-0.27.0-py3-none-any.whl (279 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hUsing cached safetensors-0.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Using cached huggingface_hub-0.20.3-py3-none-any.whl (330 kB)
Downloading fsspec-2024.2.0-py3-none-any.whl (170 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.9/170.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m

In [25]:
!pip install transformers[torch]

Collecting transformers[torch]
  Using cached transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
Collecting regex!=2019.12.17 (from transformers[torch])
  Using cached regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers[torch])
  Using cached tokenizers-0.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached regex-2023.12.25-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
Using cached tokenizers-0.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Using cached transformers-4.37.2-py3-none-any.whl (8.4 MB)
Installing collected packages: regex, tokenizers, transformers
Successfully installed regex-2023.12.25 tokenizers-0.15.1 transformers-4.37.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39

In [1]:
from datasets import load_dataset
import os
import math

In [2]:
wiki = load_dataset("wikitext","wikitext-2-raw-v1")

In [3]:
wiki

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [4]:
wiki['train']

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [5]:
from transformers import AutoTokenizer

In [6]:
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

In [7]:
tokenizer

GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [8]:
wiki['train'][19]

{'text': ' = = Development = = \n'}

In [9]:
wiki

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [10]:
os.environ["CUDA_VISIBLE_DEVICES"] = '6,7'

In [11]:
print(wiki['train']['text'][19:21])


[' = = Development = = \n', '']


In [12]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples['text']])

In [13]:
tokenized_wiki = wiki.map(
    preprocess_function,
    batched=True,
    num_proc= 4,
    remove_columns=wiki['train'].column_names
)

In [14]:
test_val = 19
print(tokenized_wiki['train']['input_ids'][test_val])
print(tokenized_wiki['train']['attention_mask'][test_val])

[220, 796, 220, 220, 796, 220, 220, 360, 304, 410, 304, 300, 267, 279, 285, 304, 299, 256, 220, 220, 796, 220, 220, 796, 220, 220, 220, 198]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [15]:
block_size = 128

In [16]:
def group_texts(examples):
    
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [17]:
lm_dataset = tokenized_wiki.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/36718 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3760 [00:00<?, ? examples/s]

In [18]:
from transformers import DataCollatorForLanguageModeling

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fd0b264d3a0>>
Traceback (most recent call last):
  File "/home/snoronha/projects/244/assignment2/env/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [19]:
tokenizer.pad_token = tokenizer.eos_token

In [20]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [21]:
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='distilgpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [22]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

In [23]:
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

KeyboardInterrupt: 

In [None]:
training_args = TrainingArguments(
    output_dir='training_dir',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    weight_decay=0.01,
    
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)


In [25]:
os.environ['NCCL_SHM_DISABLE']='1'

In [28]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") # perplexity without training


Perplexity: 22.72


In [29]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,1.3534,1.288102
2,1.2792,1.232189
3,1.2538,1.219


TrainOutput(global_step=38013, training_loss=1.3539684697465464, metrics={'train_runtime': 2044.3574, 'train_samples_per_second': 148.747, 'train_steps_per_second': 18.594, 'total_flos': 9932281435127808.0, 'train_loss': 1.3539684697465464, 'epoch': 3.0})

In [30]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 3.38


In [83]:
prompt = "Whats language without the inherent"

In [84]:
from transformers import pipeline

In [85]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device="cpu")

In [86]:
generator(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Whats language without the inherent y   e x p e r i e n c e   i n   a   v e r y   o f   W e n d s   s i n c e'}]

# Finetune this model for relational tagging

In [24]:
import torch
import transformers
from transformers import AutoModelForSequenceClassification

In [25]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.softmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [26]:
## using last assignment tokenization 

In [27]:
!pip3 install torchtext


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [132]:
import pandas as pd
import numpy as np
import torch
import os
import torchtext
from sklearn.model_selection import train_test_split
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from collections import Counter, OrderedDict
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import vocab
from torch import nn
from tqdm import tqdm 
import datasets
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from datasets import Dataset, DatasetDict

In [133]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')

In [134]:
os.environ["CUDA_VISIBLE_DEVICES"] = "*"

In [135]:
torch.cuda.device_count()

0

In [136]:
df = pd.read_csv('hw1_train.csv')

In [137]:
df.head()

Unnamed: 0,utterances,IOB Slot tags,Core Relations
0,who plays luke on star wars new hope,O O B_char O B_movie I_movie I_movie I_movie,movie.starring.actor movie.starring.character
1,show credits for the godfather,O O O B_movie I_movie,movie.starring.actor
2,who was the main actor in the exorcist,O O O O O O B_movie I_movie,movie.starring.actor
3,who played dory on finding nemo,O O B_char O B_movie I_movie,movie.starring.actor movie.starring.character
4,who was the female lead in resident evil,O O O O O O B_movie I_movie,movie.starring.actor actor.gender


### creating the vocab for the model

In [138]:
counter = Counter(set([val for row in df['utterances'].to_list() for val in row.split(' ')]))

In [139]:
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True) 

In [140]:
unk_token = '<unk>'
default_index = 5000

In [141]:
tokens = list(set([val for row in df['utterances'].to_list() for val in row.split(' ')]))

In [142]:
tokens[:10]

['lion',
 'berry',
 'should',
 'charles',
 'friday',
 'la',
 'informaion',
 'warner',
 'starting',
 'rudy']

In [143]:
df.head()

Unnamed: 0,utterances,IOB Slot tags,Core Relations
0,who plays luke on star wars new hope,O O B_char O B_movie I_movie I_movie I_movie,movie.starring.actor movie.starring.character
1,show credits for the godfather,O O O B_movie I_movie,movie.starring.actor
2,who was the main actor in the exorcist,O O O O O O B_movie I_movie,movie.starring.actor
3,who played dory on finding nemo,O O B_char O B_movie I_movie,movie.starring.actor movie.starring.character
4,who was the female lead in resident evil,O O O O O O B_movie I_movie,movie.starring.actor actor.gender


In [144]:
df['Core Relations'].fillna('None', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Core Relations'].fillna('None', inplace=True)


In [145]:
core_rels_list = sorted(list(set([j for i in df["Core Relations"].to_list() for j in str(i).split(" ")])))

In [146]:
core_rels_to_index_dict = {val: index for index, val in enumerate(sorted(core_rels_list))}
index_to_core_rels_dict = {index: val for val, index in core_rels_to_index_dict.items()}


In [147]:
def vectorize(text, token_to_index):
    one_hot_vector = [0 for i in range(len(token_to_index))]

    for token in str(text) if isinstance(text, float) else text.split(' '):
        one_hot_vector[token_to_index[token]] = 1
    return one_hot_vector

In [148]:
vectorize('movie.gross_revenue', core_rels_to_index_dict)

[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [149]:
gpt2 = AutoModelForSequenceClassification.from_pretrained(
    'training_dir/checkpoint-38000',
    num_labels=len(core_rels_list),
    problem_type="multi_label_classification",  # this is important
)

Some weights of the model checkpoint at training_dir/checkpoint-38000 were not used when initializing GPT2ForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at training_dir/checkpoint-38000 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [150]:
df['core_rel_vectorized'] = df['Core Relations'].apply(lambda x: vectorize(x, core_rels_to_index_dict))

In [151]:
df.head()

Unnamed: 0,utterances,IOB Slot tags,Core Relations,core_rel_vectorized
0,who plays luke on star wars new hope,O O B_char O B_movie I_movie I_movie I_movie,movie.starring.actor movie.starring.character,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,show credits for the godfather,O O O B_movie I_movie,movie.starring.actor,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,who was the main actor in the exorcist,O O O O O O B_movie I_movie,movie.starring.actor,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,who played dory on finding nemo,O O B_char O B_movie I_movie,movie.starring.actor movie.starring.character,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,who was the female lead in resident evil,O O O O O O B_movie I_movie,movie.starring.actor actor.gender,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [152]:
def tokenize_and_encode(examples):
    return tokenizer(examples["utterances"], truncation=True)

In [153]:
df.drop(columns=['IOB Slot tags', 'Core Relations'], inplace=True)

In [154]:
df.rename(columns={
    'core_rel_vectorized':'labels'
},inplace=True)


In [155]:
df

Unnamed: 0,utterances,labels
0,who plays luke on star wars new hope,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,show credits for the godfather,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,who was the main actor in the exorcist,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,who played dory on finding nemo,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,who was the female lead in resident evil,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
2248,revenue for titanic,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2249,total titanic revenues,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2250,what was the revenue for toy story 3,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2251,dark knight revenue,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [156]:
test_df = pd.read_csv('hw2_test.csv')
test_df.head()

Unnamed: 0,ID,utterances
0,1,star of thor
1,2,who is in the movie the campaign
2,3,list the cast of the movie the campaign
3,4,who was in twilight
4,5,who is in vulguria


In [157]:
test_df = test_df[['utterances']]

In [158]:
test_df

Unnamed: 0,utterances
0,star of thor
1,who is in the movie the campaign
2,list the cast of the movie the campaign
3,who was in twilight
4,who is in vulguria
...,...
976,trailer for star wars a new hope
977,show resident evil movies with trailers
978,can i see previews for upcoming warner brother...
979,how many woody allen movies are set in new yor...


In [159]:
train_df, valid_ds = train_test_split(df, test_size=0.2)

In [160]:
train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_ds)
test_ds = Dataset.from_pandas(test_df)

In [161]:
ds = DatasetDict()

In [162]:
ds['train'] = train_ds
ds['valid'] = valid_ds


In [163]:
ds

DatasetDict({
    train: Dataset({
        features: ['utterances', 'labels', '__index_level_0__'],
        num_rows: 1802
    })
    valid: Dataset({
        features: ['utterances', 'labels', '__index_level_0__'],
        num_rows: 451
    })
})

In [164]:
ds

DatasetDict({
    train: Dataset({
        features: ['utterances', 'labels', '__index_level_0__'],
        num_rows: 1802
    })
    valid: Dataset({
        features: ['utterances', 'labels', '__index_level_0__'],
        num_rows: 451
    })
})

In [165]:
# cast label IDs to floats
ds.set_format("torch")
ds = (ds
          .map(lambda x : {"float_labels": x["labels"].to(torch.float)}, remove_columns=["labels"])
          .rename_column("float_labels", "labels"))

Map:   0%|          | 0/1802 [00:00<?, ? examples/s]

Map:   0%|          | 0/451 [00:00<?, ? examples/s]

In [166]:
ds['test'] = test_ds

In [167]:
ds  = ds.map(tokenize_and_encode, batched=True)

Map:   0%|          | 0/1802 [00:00<?, ? examples/s]

Map:   0%|          | 0/451 [00:00<?, ? examples/s]

Map:   0%|          | 0/981 [00:00<?, ? examples/s]

In [168]:
ds["train"][0]

{'utterances': 'when was in july released',
 '__index_level_0__': tensor(311),
 'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.]),
 'input_ids': tensor([12518,   373,   287,   474,  2062,  2716]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1])}

In [169]:
gpt2.config.pad_token_id = gpt2.config.eos_token_id

In [76]:
def compute_metrics(p):
    predictions = p.predictions > 0.5  # Assuming threshold for classification is 0.5, adjust as needed
    labels = p.label_ids

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='micro')
    accuracy = accuracy_score(labels, predictions)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [77]:
args = TrainingArguments("testtraining", num_train_epochs=1)

trainer = Trainer(model=gpt2, args=args, train_dataset=ds["train"], eval_dataset=ds['valid'], tokenizer=tokenizer, compute_metrics=compute_metrics)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [78]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=226, training_loss=0.040479731770743314, metrics={'train_runtime': 50.6504, 'train_samples_per_second': 35.577, 'train_steps_per_second': 4.462, 'total_flos': 5299711441920.0, 'train_loss': 0.040479731770743314, 'epoch': 1.0})

In [79]:
trainer.evaluate()

{'eval_loss': 0.05519789084792137,
 'eval_accuracy': 0.7649667405764967,
 'eval_precision': 0.9391727493917275,
 'eval_recall': 0.7909836065573771,
 'eval_f1': 0.8587319243604005,
 'eval_runtime': 1.5561,
 'eval_samples_per_second': 289.832,
 'eval_steps_per_second': 36.631,
 'epoch': 1.0}

In [171]:
predictions = trainer.predict(ds['test'])

In [175]:
len(predictions.predictions)

981

In [1]:
predictions = predictions.predictions > 0.5  # Assuming threshold for classification is 0.5, adjust as needed



NameError: name 'predictions' is not defined