<a href="https://colab.research.google.com/github/acram002/AI-Driven-Recipe-Suggestion-System/blob/main/Copy_of_DeepseekTrainColabPro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi


Tue Apr 22 01:45:01 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             53W /  400W |   30081MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
#4
# STEP 1: Install dependencies
!pip install -q transformers datasets accelerate

# STEP 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Load and sample dataset
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/full_dataset.csv')
df = df[['NER', 'directions']].dropna()
df = df.sample(n=100, random_state=42)  # smaller dataset for quick run

df['text'] = 'Generate a recipe:\nIngredients: ' + df['NER'] + '\n' + df['directions']

# STEP 4: Hugging Face Dataset + Tokenization
from datasets import Dataset
dataset = Dataset.from_pandas(df[['text']])

from transformers import AutoTokenizer

model_name = 'deepseek-ai/DeepSeek-V2-Lite'
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_length = 128

def tokenize(example):
    """Tokenize recipe text and attach causal-LM labels that skip padding.

    Args:
        example: Mapping with a ``'text'`` entry. Under
            ``dataset.map(tokenize, batched=True)`` this is a list of
            strings; a single string is handled as well.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry. Labels mirror
        ``input_ids`` except at positions where ``attention_mask`` is 0
        (padding added by ``padding='max_length'``), which are set to -100.
    """
    tokens = tokenizer(example['text'], truncation=True, padding='max_length', max_length=max_length)

    def mask_padding(ids, mask):
        # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
        # padded positions no longer contribute to the training loss.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id/mask sequence per example.
        tokens['labels'] = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        tokens['labels'] = mask_padding(input_ids, attention_mask)
    return tokens

# Tokenize the whole dataset and expose only the tensor columns the model consumes.
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# STEP 5: Load model with offloading + checkpointing
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",                  # automatic layer placement
    torch_dtype=torch.bfloat16,         # bf16 weights
    offload_folder="/content/offload",  # where offloaded weights go if disk offload is needed
    # The DeepSeek-V2-Lite repo ships custom modeling code. Without this flag
    # from_pretrained stops at an interactive "[y/N]" prompt on every load.
    # NOTE(review): this executes code downloaded from the Hub; inspect it and
    # consider pinning a `revision`.
    trust_remote_code=True,
)

# Trade extra compute for lower activation memory during backprop.
model.gradient_checkpointing_enable()

# STEP 6: Manual training loop
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

train_loader = DataLoader(tokenized_dataset, batch_size=1, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(1):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        # Move inputs to the device reported by the model.
        for k in batch:
            batch[k] = batch[k].to(model.device)

        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        torch.cuda.empty_cache()  # release cached CUDA blocks between steps
        loop.set_postfix(loss=loss.item())

# STEP 7: Inference on the first training example
model.eval()

# text = "<instruction>\n<ingredients>\n<directions>": the first two lines form
# the prompt, everything after the second newline is the reference answer.
instruction_line, ingredients_line, sample_target = df.iloc[0]['text'].split('\n', 2)
sample_prompt = instruction_line + '\n' + ingredients_line

inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_length=256)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print("\n=== Sample Prompt from Training ===")
print(sample_prompt)
print("\n=== Target Response ===")
print(sample_target)
print("\n=== Model Generated ===")
print(decoded_output)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

The repository for deepseek-ai/DeepSeek-V2-Lite contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/deepseek-ai/DeepSeek-V2-Lite.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


configuration_deepseek.py:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite:
- configuration_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


The repository for deepseek-ai/DeepSeek-V2-Lite contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/deepseek-ai/DeepSeek-V2-Lite.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


modeling_deepseek.py:   0%|          | 0.00/78.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite:
- modeling_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

model-00004-of-000004.safetensors:   0%|          | 0.00/5.64G [00:00<?, ?B/s]

model-00001-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

model-00003-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]



In [None]:
# 3 will work but may take 1-3 hours

# STEP 1: Install dependencies
!pip install -q transformers datasets accelerate

# STEP 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Load and sample dataset
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/full_dataset.csv')
df = df[['NER', 'directions']].dropna()
df = df.sample(n=100, random_state=42)  # smaller size for safety

df['text'] = 'Generate a recipe:\nIngredients: ' + df['NER'] + '\n' + df['directions']

# STEP 4: Hugging Face Dataset + Tokenization
from datasets import Dataset
dataset = Dataset.from_pandas(df[['text']])

from transformers import AutoTokenizer

model_name = 'deepseek-ai/deepseek-llm-7b-chat'
tokenizer = AutoTokenizer.from_pretrained(model_name)

max_length = 128

def tokenize(example):
    """Tokenize recipe text and attach causal-LM labels that skip padding.

    Args:
        example: Mapping with a ``'text'`` entry. Under
            ``dataset.map(tokenize, batched=True)`` this is a list of
            strings; a single string is handled as well.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry. Labels mirror
        ``input_ids`` except at positions where ``attention_mask`` is 0
        (padding added by ``padding='max_length'``), which are set to -100.
    """
    tokens = tokenizer(example['text'], truncation=True, padding='max_length', max_length=max_length)

    def mask_padding(ids, mask):
        # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
        # padded positions no longer contribute to the training loss.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id/mask sequence per example.
        tokens['labels'] = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        tokens['labels'] = mask_padding(input_ids, attention_mask)
    return tokens

# Tokenize every example, then expose only the tensor columns the model consumes.
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels'],
)

# --- Model: bf16 weights, automatic placement, disk offload folder -----------
import torch
from transformers import AutoModelForCausalLM

load_options = dict(
    device_map="auto",
    torch_dtype=torch.bfloat16,
    offload_folder="/content/offload",
)
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

# Recompute activations in the backward pass instead of storing them.
model.gradient_checkpointing_enable()

# --- Hand-written training loop: one epoch, batch size 1 ---------------------
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

train_loader = DataLoader(tokenized_dataset, batch_size=1, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(1):
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress:
        # Send each tensor of the batch to the device reported by the model.
        batch = {name: tensor.to(model.device) for name, tensor in batch.items()}

        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        torch.cuda.empty_cache()  # release cached CUDA blocks between steps
        progress.set_postfix(loss=loss.item())

# --- Inference on the first training example ---------------------------------
model.eval()

# text = "<instruction>\n<ingredients>\n<directions>": the first two lines form
# the prompt, everything after the second newline is the reference answer.
first_text = df.iloc[0]['text']
text_parts = first_text.split('\n', 2)
sample_prompt = text_parts[0] + '\n' + text_parts[1]
sample_target = text_parts[2]

inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_length=256)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

for heading, body in (
    ("\n=== Sample Prompt from Training ===", sample_prompt),
    ("\n=== Target Response ===", sample_target),
    ("\n=== Model Generated ===", decoded_output),
):
    print(heading)
    print(body)

# --- Persist the model and tokenizer to Google Drive -------------------------
save_path = "/content/drive/MyDrive/deepseek-llm-7b-chat"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"\n✅ Model and tokenizer saved to: {save_path}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   1%|          | 1/100 [02:13<3:40:31, 133.65s/it, loss=2.23]


KeyboardInterrupt: 

In [None]:
# 1
# STEP 1: Install dependencies
!pip install -q transformers datasets accelerate

# STEP 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Load the dataset
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/full_dataset.csv')

# STEP 4: Clean and reduce dataset
df = df[['NER', 'directions']].dropna() #[['title', 'ingredients', 'directions']].dropna()
df = df.sample(n=10000, random_state=42)

# STEP 5: Prepare data in prompt-response format
#df['prompt'] = 'Generate a recipe:\nIngredients: ' + df['NER']
#df['response'] = df['directions']
df['text'] = 'Generate a recipe:\nIngredients: ' + df['NER'] + '\n' + df['directions']

# STEP 6: Convert to HuggingFace Dataset
from datasets import Dataset

dataset = Dataset.from_pandas(df[['text']])

# STEP 7: Tokenize the data
from transformers import AutoTokenizer

model_name = 'deepseek-ai/deepseek-llm-7b-chat'#'t5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize recipe text and attach causal-LM labels that skip padding.

    Args:
        example: Mapping with a ``'text'`` entry. Under
            ``dataset.map(tokenize, batched=True)`` this is a list of
            strings; a single string is handled as well.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry. Labels mirror
        ``input_ids`` except at positions where ``attention_mask`` is 0
        (padding added by ``padding='max_length'``), which are set to -100.
    """
    tokens = tokenizer(
        example['text'],
        truncation=True,
        padding='max_length',
        max_length=256
    )

    def mask_padding(ids, mask):
        # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
        # padded positions no longer contribute to the training loss.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id/mask sequence per example.
        tokens["labels"] = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        tokens["labels"] = mask_padding(input_ids, attention_mask)
    return tokens


# Tokenize every example (adds input_ids / attention_mask / labels columns).
tokenized_dataset = dataset.map(tokenize, batched=True)

# STEP 8: Load the model
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Place the whole model on GPU 0. With device_map="auto" some modules can
    # be offloaded to CPU/disk, and Trainer then fails with "You can't move a
    # model that has some modules offloaded to cpu or disk."
    # NOTE(review): this needs the full bf16 model to fit on the GPU; free
    # memory held by earlier cells (restart the runtime) before running.
    device_map={"": 0},
    torch_dtype=torch.bfloat16,   # bf16 weights
)


# STEP 9: Training setup
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    bf16=True,
    # Recompute activations during backward to lower peak GPU memory, since
    # CPU/disk offloading is no longer available as a fallback.
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# STEP 10: Train the model
trainer.train()

# Grab an actual training example to test generation.
# text = "<instruction>\n<ingredients>\n<directions>": the first two lines form
# the prompt, everything after the second newline is the reference answer.
instruction_line, ingredients_line, sample_target = df.iloc[0]['text'].split('\n', 2)
sample_prompt = instruction_line + '\n' + ingredients_line

# Switch to inference mode (disables dropout) and skip gradient tracking.
model.eval()
inputs = tokenizer(sample_prompt, return_tensors="pt", truncation=True).to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_length=512)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print("\n=== Sample Prompt from Training ===")
print(sample_prompt)
print("\n=== Target Response ===")
print(sample_target)
print("\n=== Model Generated ===")
print(decoded_output)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  trainer = Trainer(


RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [None]:
# 2
# STEP 1: Install dependencies
!pip install -q transformers datasets accelerate

# STEP 2: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: Load the dataset
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/full_dataset.csv')

# STEP 4: Clean and reduce dataset
df = df[['NER', 'directions']].dropna()
df = df.sample(n=1000, random_state=42)  # Keep small for testing

# STEP 5: Combine prompt and response
df['text'] = 'Generate a recipe:\nIngredients: ' + df['NER'] + '\n' + df['directions']

# STEP 6: Convert to HuggingFace Dataset
from datasets import Dataset
dataset = Dataset.from_pandas(df[['text']])

# STEP 7: Tokenize the data
from transformers import AutoTokenizer

model_name = 'deepseek-ai/DeepSeek-V2-Lite'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize recipe text and attach causal-LM labels that skip padding.

    Args:
        example: Mapping with a ``'text'`` entry. Under
            ``dataset.map(tokenize, batched=True)`` this is a list of
            strings; a single string is handled as well.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry. Labels mirror
        ``input_ids`` except at positions where ``attention_mask`` is 0
        (padding added by ``padding='max_length'``), which are set to -100.
    """
    tokens = tokenizer(
        example['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )

    def mask_padding(ids, mask):
        # -100 is the default ignore_index of PyTorch's cross-entropy loss, so
        # padded positions no longer contribute to the training loss.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id/mask sequence per example.
        tokens["labels"] = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        tokens["labels"] = mask_padding(input_ids, attention_mask)
    return tokens

# Tokenize the whole dataset and expose only the tensor columns the model consumes.
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# STEP 8: Load the model (offloading enabled for large models)
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
    # NOTE(review): this offload folder is on the mounted Drive; a local path
    # such as /content/offload is presumably faster. Confirm before changing.
    offload_folder="/content/drive/MyDrive/offload",
    # The DeepSeek-V2-Lite repo ships custom modeling code. Without this flag
    # from_pretrained stops at an interactive "[y/N]" prompt on every load.
    # NOTE(review): this executes code downloaded from the Hub; inspect it and
    # consider pinning a `revision`.
    trust_remote_code=True,
)

# STEP 9: Manual Training Loop
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_loader = DataLoader(tokenized_dataset, batch_size=1, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(1):
    loop = tqdm(train_loader, desc="Epoch 1")
    for batch in loop:
        # Move every tensor of the batch to the selected device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

# STEP 10: Inference Example
model.eval()

# text = "<instruction>\n<ingredients>\n<directions>": the first two lines form
# the prompt, everything after the second newline is the reference answer.
instruction_line, ingredients_line, sample_target = df.iloc[0]['text'].split('\n', 2)
sample_prompt = instruction_line + '\n' + ingredients_line

inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_length=512)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print("\n=== Sample Prompt from Training ===")
print(sample_prompt)
print("\n=== Target Response ===")
print(sample_target)
print("\n=== Model Generated ===")
print(decoded_output)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

The repository for deepseek-ai/DeepSeek-V2-Lite contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/deepseek-ai/DeepSeek-V2-Lite.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1000 [17:37<?, ?it/s]


KeyboardInterrupt: 