In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
import pandas as pd
import datasets
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# modelName = "mistralai/Mistral-7B-v0.1"
modelName = "mistralai/Mistral-7B-Instruct-v0.2"

model = AutoModelForCausalLM.from_pretrained(modelName, device_map="auto", cache_dir = "/Users/Charlie/Documents/GitHub/DL2/Project")

In [4]:
tokenizer = AutoTokenizer.from_pretrained(modelName)

In [24]:
base_prompt_template: str = """
    You are a financial analyst designed to answer questions about business and finance. Your job is to reply to questions about finance topics and provide advice.

    Question: {query}
    
    Answer:
"""


query = "What's the difference between a stock and an option?"
query = base_prompt_template.format(query=query)

In [45]:
my_data = pd.read_csv("/Users/Charlie/Documents/GitHub/DL2/Project/cleaned_finance.csv", dtype=str)

In [46]:
hf_dataset = load_dataset("gbharti/finance-alpaca", split="train").to_pandas()

In [49]:
hf_dataset.iloc[1].input

''

In [19]:
hf_dataset.head()

Unnamed: 0,instruction,output,text,input
0,"For a car, what scams can be plotted with 0% f...",The car deal makes money 3 ways. If you pay in...,,
1,Why does it matter if a Central Bank has a neg...,"That is kind of the point, one of the hopes is...",,
2,Where should I be investing my money?,"Pay off your debt. As you witnessed, no ""inve...",,
3,Specifically when do options expire?,"Equity options, at least those traded in the A...",,
4,Negative Balance from Automatic Options Exerci...,"Automatic exercisions can be extremely risky, ...",,


In [20]:
my_data.drop(columns=['split', 'input'], inplace=True)

In [21]:
# add a split and input column filled with empty strings
my_data['text'] = ""
my_data['input'] = ""

In [22]:
my_data.head()

Unnamed: 0,question,answer,text,input
0,Provide a list of items that can be reused or ...,1. Plastic bottles 2. Cardboard boxes 3. Alumi...,,
1,Convert the given Celsius temperature to Fahre...,21.16 Fahrenheit.,,
2,Create a timeline of the life of Muhammad Ali.,"1942: Born in Louisville, Kentucky.\n1960: Won...",,
3,Construct a game involving the given settings.,The game involves a player taking on the role ...,,
4,Online brokers with a minimum stock purchase l...,With InteractiveBrokers there is no minimum tr...,,


In [25]:
def template(inputText):
    return base_prompt_template.format(query=inputText)
my_data["question"] = my_data["question"].apply(template)


In [None]:
hf_dataset

In [30]:
my_data = my_data.astype({'question':'str', 'answer':'str', 'text':'str', 'input':'str'})

In [31]:
my_data.dtypes

question    object
answer      object
text        object
input       object
dtype: object

In [36]:
hf_dataset.dtypes

instruction    object
output         object
text           object
input          object
dtype: object

In [32]:
my_data.iloc[0]['answer']

'1. Plastic bottles 2. Cardboard boxes 3. Aluminum cans 4. Newspapers 5. Glass jars 6. Clothing 7. Furniture 8. Bicycles 9. Computers 10. Printer cartridges 11. Electronics 12. Building materials 13. Batteries 14. Household appliances 15. Car batteries 16. Paper towels 17. CDs and DVDs 18. Bedding 19. Shipping pallets'

In [33]:
# setting pad token
tokenizer.pad_token = tokenizer.eos_token
# data collator
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [39]:
def tokenize_input_output(example):
    return tokenizer(example["instruction"], example["output"], padding="max_length", truncation=True, return_tensors="pt", max_length=550)

In [37]:
my_dataset = datasets.Dataset.from_pandas(my_data).map(tokenize_input_output, batched=True)

Map: 100%|██████████| 56852/56852 [00:15<00:00, 3575.24 examples/s]


In [40]:
hf_dataset = datasets.Dataset.from_pandas(hf_dataset).map(tokenize_input_output, batched=True)

Map: 100%|██████████| 68912/68912 [00:19<00:00, 3620.18 examples/s]


In [41]:
my_dataset.features

{'question': Value(dtype='string', id=None),
 'answer': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'input': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [42]:
hf_dataset.features

{'instruction': Value(dtype='string', id=None),
 'output': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'input': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [43]:
my_dataset[0]

{'question': '\n    You are a financial analyst designed to answer questions about business and finance. Your job is to reply to questions about finance topics and provide advice.\n\n    Question: Provide a list of items that can be reused or recycled.\n    \n    Answer:\n',
 'answer': '1. Plastic bottles 2. Cardboard boxes 3. Aluminum cans 4. Newspapers 5. Glass jars 6. Clothing 7. Furniture 8. Bicycles 9. Computers 10. Printer cartridges 11. Electronics 12. Building materials 13. Batteries 14. Household appliances 15. Car batteries 16. Paper towels 17. CDs and DVDs 18. Bedding 19. Shipping pallets',
 'text': '',
 'input': '',
 'input_ids': [2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
 

In [44]:
hf_dataset[0]

{'instruction': 'For a car, what scams can be plotted with 0% financing vs rebate?',
 'output': "The car deal makes money 3 ways. If you pay in one lump payment. If the payment is greater than what they paid for the car, plus their expenses, they make a profit. They loan you the money. You make payments over months or years, if the total amount you pay is greater than what they paid for the car, plus their expenses, plus their finance expenses they make money. Of course the money takes years to come in, or they sell your loan to another business to get the money faster but in a smaller amount. You trade in a car and they sell it at a profit. Of course that new transaction could be a lump sum or a loan on the used car... They or course make money if you bring the car back for maintenance, or you buy lots of expensive dealer options. Some dealers wave two deals in front of you: get a 0% interest loan. These tend to be shorter 12 months vs 36,48,60 or even 72 months. The shorter length ma

In [25]:
dataset2 = dataset.train_test_split(test_size = 0.2)

In [26]:
dataset2['train']

Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask'],
    num_rows: 45481
})

In [27]:
model.train()
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(r=16, lora_alpha=32, bias="none", task_type="CASUAL_LM")

model = get_peft_model(model, config)

model.print_trainable_parameters()

trainable params: 6,815,744 || all params: 7,248,547,840 || trainable%: 0.0940290959023318


In [28]:
lr = 2e-4
batch_size = 4
num_epochs = 4

# define training arguments
training_args = transformers.TrainingArguments(
    output_dir= "/home/hice1/ckniffin6/scratch/training",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    fp16=True,
    optim="paged_adamw_8bit",

)


In [29]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset2['train'],
    eval_dataset=dataset2["test"],
    args=training_args,
    data_collator=data_collator
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [30]:
# train model
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

# renable warnings
# model.config.use_cache = True


trainer.save_model("/home/hice1/ckniffin6/scratch/saved")

  return table.fast_gather(key % table.num_rows)


IndexError: list index out of range