In [1]:
# --- 1. Installation ---
!pip install --upgrade "transformers>=4.41.0" "peft>=0.11.1" "trl>=0.8.6" "accelerate" "bitsandbytes" "datasets<4.0.0" -q
!pip install "pydantic>=2.0" "beautifulsoup4" "lxml" "langchain" "langchain_community" "langchain_huggingface" "sec-edgar-api" -q

# --- 2. Imports, Login, and Path Setup ---
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
import os

print("--- Step 1: Mounting Google Drive ---")
drive.mount('/content/drive')

print("\n--- Step 2: Logging into Hugging Face ---")
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN)
    print("Successfully logged into Hugging Face.")
except Exception as e:
    print(f"Could not log into Hugging Face. Error: {e}")

print("\n--- Step 3: Defining Paths and Configuration ---")
DRIVE_BASE_PATH = "/content/drive/My Drive/financial_distillation_project"
DATA_PATH = os.path.join(DRIVE_BASE_PATH, "data", "10-K")
MODEL_SAVE_PATH = os.path.join(DRIVE_BASE_PATH, "models")
DATASET_FILE_PATH = os.path.join(DRIVE_BASE_PATH, "rationale_dataset_final.jsonl")
USER_AGENT = "Your Name your.email@example.com"  # Replace with your information
TARGET_COMPANIES = {"AAPL": "320193", "MSFT": "789019", "GOOGL": "1652044"}
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
print("Setup complete.")

--- Step 1: Mounting Google Drive ---
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

--- Step 2: Logging into Hugging Face ---
Successfully logged into Hugging Face.

--- Step 3: Defining Paths and Configuration ---
Setup complete.


In [2]:
import requests
from sec_edgar_api import EdgarClient
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from datasets import Dataset

def download_10k_filings():
    print("Starting download of clean text 10-K filings...")
    edgar_client = EdgarClient(user_agent=USER_AGENT)
    for ticker, cik in TARGET_COMPANIES.items():
        try:
            print(f"\nProcessing {ticker} (CIK: {cik})...")
            filings = edgar_client.get_submissions(cik=cik)
            recent_filings = filings['filings']['recent']
            for i in range(len(recent_filings['form'])):
                if recent_filings['form'][i] == '10-K' and "2023-01-01" <= recent_filings['filingDate'][i]:
                    accession_no = recent_filings['accessionNumber'][i]
                    accession_no_simple = accession_no.replace('-', '')
                    filing_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_no_simple}/{accession_no}.txt"
                    print(f"  Downloading full text for filing {accession_no}...")
                    response = requests.get(filing_url, headers={'User-Agent': USER_AGENT})
                    response.raise_for_status()
                    ticker_path = os.path.join(DATA_PATH, ticker)
                    os.makedirs(ticker_path, exist_ok=True)
                    file_path = os.path.join(ticker_path, f"{accession_no}.txt")
                    with open(file_path, 'w', encoding='utf-8') as f: f.write(response.text)
                    print(f"  Successfully saved to {file_path}")
                    break
        except Exception as e: print(f"An error occurred while processing {ticker}: {e}")
    print("\nDownload process completed.")

download_10k_filings()

SAMPLE_DOC_PATH = "/content/drive/My Drive/financial_distillation_project/data/10-K/AAPL/0000320193-24-000123.txt"
loader = TextLoader(SAMPLE_DOC_PATH, encoding='utf-8')
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
chunks = text_splitter.split_documents(docs)
def is_prose(text: str) -> bool:
    return len(text.split()) > 30 and text.count('<') < 5 and text.count('/') < 5
prose_chunks = [chunk.page_content for chunk in chunks if is_prose(chunk.page_content)]
print(f"Found {len(prose_chunks)} prose chunks after filtering.")

hf_dataset = Dataset.from_dict({"text": prose_chunks})

Starting download of clean text 10-K filings...

Processing AAPL (CIK: 320193)...
  Downloading full text for filing 0000320193-24-000123...
  Successfully saved to /content/drive/My Drive/financial_distillation_project/data/10-K/AAPL/0000320193-24-000123.txt

Processing MSFT (CIK: 789019)...
  Downloading full text for filing 0000950170-25-100235...
  Successfully saved to /content/drive/My Drive/financial_distillation_project/data/10-K/MSFT/0000950170-25-100235.txt

Processing GOOGL (CIK: 1652044)...
  Downloading full text for filing 0001652044-25-000014...
  Successfully saved to /content/drive/My Drive/financial_distillation_project/data/10-K/GOOGL/0001652044-25-000014.txt

Download process completed.
Found 130 prose chunks after filtering.


In [26]:
import torch
import json
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFacePipeline
from langchain_core.output_parsers import StrOutputParser

# --- 1. Load the Teacher Model ---
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
teacher_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
llm_pipeline_raw = pipeline("text-generation", model=teacher_model, tokenizer=tokenizer, max_new_tokens=1024, return_full_text=False)
llm_pipeline = HuggingFacePipeline(pipeline=llm_pipeline_raw)
print("Teacher model loaded successfully.")

# --- 2. Generate the Dataset ---
class RationaleSummary(BaseModel):
    rationale: str = Field(description="The step-by-step reasoning for the summary.")
    summary: str = Field(description="A concise summary of the key information.")

parser = PydanticOutputParser(pydantic_object=RationaleSummary)
prompt_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are an expert financial analyst. Your task is to provide a structured JSON response. Do not provide any explanation or conversational text outside of the JSON object.
{format_instructions}<|eot_id|><|start_header_id|>user<|end_header_id|>
Analyze the following text from a 10-K filing:
<TEXT_TO_PROCESS>
{text}
</TEXT_TO_PROCESS><|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
prompt = PromptTemplate.from_template(prompt_template, partial_variables={"format_instructions": parser.get_format_instructions()})
final_chain = prompt | llm_pipeline | StrOutputParser()

print("\nStarting dataset generation...")
chunks_to_process = hf_dataset.select(range(100))
dataset_list = []
for chunk in tqdm(chunks_to_process, desc="Generating Triplets"):
    try:
        raw_output = final_chain.invoke({"text": chunk['text']})
        parsed_output = parser.parse(raw_output)
        dataset_list.append({
            "text": chunk['text'], "rationale": parsed_output.rationale, "summary": parsed_output.summary
        })
    except Exception as e:
        print(f"\nSkipping chunk due to error: {e}")
        continue
print(f"\nSuccessfully generated {len(dataset_list)} triplets.")
with open(DATASET_FILE_PATH, 'w') as f:
    for entry in dataset_list: f.write(json.dumps(entry) + '\n')
print(f"Final rationale dataset saved to: {DATASET_FILE_PATH}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


Teacher model loaded successfully.

Starting dataset generation...


Generating Triplets:   0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for


Successfully generated 100 triplets.
Final rationale dataset saved to: /content/drive/My Drive/financial_distillation_project/rationale_dataset_final.jsonl


In [27]:
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

# --- 1. Load Your Generated Dataset ---
dataset = load_dataset("json", data_files=DATASET_FILE_PATH, split="train")

def formatting_prompts_func(example):
    return f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Analyze the following text from a 10-K filing and provide a step-by-step rationale and a final summary.
Text:
{example['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Rationale:
{example['rationale']}
Summary:
{example['summary']}<|eot_id|>"""

# --- 2. Load the Student Model ---
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
student_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
student_tokenizer = AutoTokenizer.from_pretrained(model_name)
student_tokenizer.pad_token = student_tokenizer.eos_token
print("\nTinyLlama student model loaded.")

# --- 3. Add LoRA Adapters and Prepare for k-bit training ---
student_model = prepare_model_for_kbit_training(student_model)

lora_config = LoraConfig(
    r=16, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)
student_model = get_peft_model(student_model, lora_config)

# Explicitly cast trainable parameters to float32
for param in student_model.parameters():
    if param.requires_grad:
        param.data = param.data.to(torch.float32)


print("LoRA adapters applied and model prepared for k-bit training.")


# --- 4. Configure and Run the Trainer ---
# Use TrainingArguments for standard training arguments and pass max_seq_length and tokenizer to SFTTrainer
training_args = TrainingArguments(
    output_dir="outputs",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_steps=1,
)

print("\nTrainingArguments:")
print(training_args)


trainer = SFTTrainer(
    model=student_model,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func, # Use the modern formatting_func
    args=training_args,
    # max_seq_length=2048, # Removed max_seq_length here
    # tokenizer=student_tokenizer, # Removed tokenizer here
    # packing=True, # Packing is efficient when using a formatting_func - REMOVED
)
print("\nStarting the fine-tuning process...")
trainer.train()
print("Fine-tuning complete.")

# --- 5. Save the Fine-Tuned Model ---
print(f"\nSaving fine-tuned model to: {MODEL_SAVE_PATH}")
trainer.save_model(MODEL_SAVE_PATH)
print("Model saved successfully.")

Generating train split: 0 examples [00:00, ? examples/s]


TinyLlama student model loaded.
LoRA adapters applied and model prepared for k-bit training.

TrainingArguments:
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_bat

Applying formatting function to train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]


Starting the fine-tuning process...




Step,Training Loss
1,1.3686
2,1.4606
3,1.5795
4,1.5207
5,1.5424
6,1.5418
7,1.446
8,1.2297
9,1.4858
10,1.3191


Fine-tuning complete.

Saving fine-tuned model to: /content/drive/My Drive/financial_distillation_project/models
Model saved successfully.


In [28]:
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# --- 1. Load the Base Model and Tokenizer ---
# We load the original TinyLlama model again
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)

base_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# --- 2. Load the Fine-Tuned LoRA Adapters ---
# This merges your trained adapters into the base model
# The MODEL_SAVE_PATH variable should still be in your environment from the previous cells
print(f"Loading fine-tuned LoRA adapters from: {MODEL_SAVE_PATH}")
student_model = PeftModel.from_pretrained(base_model, MODEL_SAVE_PATH)
print("LoRA adapters loaded and merged successfully.")


# --- 3. Prepare an Inference Prompt ---
# This is a new, unseen piece of text to test the model's ability to generalize.
inference_prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Analyze the following text from a 10-K filing and provide a step-by-step rationale and a final summary.

Text:
Our industry is characterized by intense competition from numerous companies, some of which have greater financial, marketing, and technological resources. Increased competition could result in price reductions, reduced profitability, and loss of market share.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

# --- 4. Run Inference ---
print("\n--- Running Inference with Fine-Tuned Student Model ---")
inputs = tokenizer([inference_prompt], return_tensors="pt").to("cuda")
outputs = student_model.generate(**inputs, max_new_tokens=256, use_cache=True)
response = tokenizer.batch_decode(outputs)

# Print the generated rationale and summary
print(response[0].split("<|start_header_id|>assistant<|end_header_id|>")[1])

Loading fine-tuned LoRA adapters from: /content/drive/My Drive/financial_distillation_project/models
LoRA adapters loaded and merged successfully.

--- Running Inference with Fine-Tuned Student Model ---


Step-by-step rationale:

1. Identify the industry and its competitors.
2. Determine the intensity of competition.
3. Identify the companies with greater financial, marketing, and technological resources.
4. Determine the potential impact of increased competition on profitability and market share.
5. Analyze the potential impact of price reductions, reduced profitability, and loss of market share.
6. Provide a rationale for the proposed action.

Summary:
Our industry is characterized by intense competition from numerous companies, some of which have greater financial, marketing, and technological resources. The proposed action is to implement price reductions, reduce profitability, and loss of market share to mitigate the potential impact of increased competition. The rationale for th

In [20]:
# Install the evaluate library
!pip install evaluate -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
# Install the rouge_score library
!pip install rouge_score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [30]:
import torch
from datasets import load_dataset
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from tqdm.auto import tqdm
import json
# NEW: Import the evaluate library for ROUGE scores
import evaluate

# --- 1. Load the Fine-Tuned Student Model ---
print("Loading the fine-tuned student model...")

# Load the base model
base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, quantization_config=quantization_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the LoRA adapters and merge them into the base model
# This assumes MODEL_SAVE_PATH is available from your setup cell
student_model = PeftModel.from_pretrained(base_model, MODEL_SAVE_PATH)
print("Student model loaded successfully.")


# --- 2. Load the Dataset with Reference Summaries ---
# This loads the original dataset, which contains the teacher's summaries to compare against.
print(f"\nLoading dataset from: {DATASET_FILE_PATH}")
eval_dataset = load_dataset("json", data_files=DATASET_FILE_PATH, split="train")


# --- 3. Generate Predictions with the Student Model ---
print("\nGenerating summaries with the student model for evaluation...")
predictions = []
references = []

for example in tqdm(eval_dataset, desc="Generating Predictions"):
    # Prepare the prompt for inference (without the answer)
    prompt_for_inference = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Analyze the following text from a 10-K filing and provide a step-by-step rationale and a final summary.

Text:
{example['text']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
    inputs = tokenizer([prompt_for_inference], return_tensors="pt").to("cuda")

    # Generate the output
    outputs = student_model.generate(**inputs, max_new_tokens=256, use_cache=True)
    response_text = tokenizer.batch_decode(outputs)[0]

    # Extract just the summary part for evaluation
    try:
        # A simple way to extract the summary from the generated text
        summary_part = response_text.split("Summary:")[1]
        cleaned_summary = summary_part.split("<|eot_id|>")[0].strip()
        predictions.append(cleaned_summary)
        references.append(example['summary'])
    except IndexError:
        # If the model didn't follow the format, add an empty prediction
        predictions.append("")
        references.append(example['summary'])

print("Prediction generation complete.")


# --- 4. Calculate ROUGE Scores ---
print("\nCalculating ROUGE scores...")
rouge = evaluate.load('rouge')
results = rouge.compute(predictions=predictions, references=references)

print("\n--- Quantitative Evaluation Results ---")
print(json.dumps(results, indent=2))

Loading the fine-tuned student model...
Student model loaded successfully.

Loading dataset from: /content/drive/My Drive/financial_distillation_project/rationale_dataset_final.jsonl

Generating summaries with the student model for evaluation...


Generating Predictions:   0%|          | 0/100 [00:00<?, ?it/s]

Prediction generation complete.

Calculating ROUGE scores...

--- Quantitative Evaluation Results ---
{
  "rouge1": 0.00397203947368421,
  "rouge2": 0.0014285714285714286,
  "rougeL": 0.0038157894736842103,
  "rougeLsum": 0.00365953947368421
}
