<a href="https://colab.research.google.com/github/abojha/AbOjha/blob/main/DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
import subprocess

In [None]:
def load_and_preprocess_data(csv_path="preprocessed_data.csv"):
    # Check if preprocessed CSV exists
    if os.path.exists(csv_path):
        print(f"Loading preprocessed data from {csv_path}")
        return pd.read_csv(csv_path)

    # Load and preprocess fresh data
    print("Preprocessing dataset...")
    dataset = load_dataset("sahil2801/CodeAlpaca-20k", split="train")

    # Convert to pandas DataFrame and format Q&A pairs
    df = dataset.to_pandas()
    df["question"] = "Question: " + df["instruction"] + " " + df["input"].apply(
        lambda x: x if str(x).strip() != "" else ""
    )
    df["answer"] = "Answer: " + df["output"]
    df = df[["question", "answer"]].dropna().drop_duplicates()

    # Save to CSV
    df.to_csv(csv_path, index=False)
    print(f"Saved preprocessed data to {csv_path}")
    return df

In [None]:
df = load_and_preprocess_data()

Loading preprocessed data from preprocessed_data.csv


In [None]:
df = df.head(5000)  # Select first 1000 rows


In [None]:
def initialize_model():
    model_path = "./codet5-code-assistant"
    if os.path.exists(model_path):
        print("Loading fine-tuned model")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    else:
        print("Loading base model")
        model_name = "Salesforce/codet5-base"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model

In [None]:
# 3. Data Tokenization
# --------------------
def tokenize_data(df, tokenizer):
    def tokenize_function(examples):
        inputs = tokenizer(
            examples["question"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )
        outputs = tokenizer(
            examples["answer"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )
        return {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": outputs.input_ids
        }

    dataset = Dataset.from_pandas(df)
    return dataset.map(tokenize_function, batched=True)

# Initialize model and tokenizer
tokenizer, model = initialize_model()

Loading fine-tuned model


In [None]:
# Tokenize data
tokenized_dataset = tokenize_data(df, tokenizer)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
# 4. Model Training
# --------------------
def train_model(tokenized_dataset, model):
    # Split dataset
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
    train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

    # Training configuration
    training_args = TrainingArguments(
        output_dir="./codet5-code-assistant",
        num_train_epochs=5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=3e-4,
        fp16=True,
        logging_steps=100,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        report_to="none"
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Start training
    trainer.train()
    return model


In [None]:
# Train model (comment out after first training)
model = train_model(tokenized_dataset, model)



Epoch,Training Loss,Validation Loss
1,0.5961,0.514975
2,0.4546,0.484174
3,0.3379,0.481812
4,0.2026,0.488417
5,0.1372,0.504511


('./codet5-code-assistant/tokenizer_config.json',
 './codet5-code-assistant/special_tokens_map.json',
 './codet5-code-assistant/vocab.json',
 './codet5-code-assistant/merges.txt',
 './codet5-code-assistant/added_tokens.json',
 './codet5-code-assistant/tokenizer.json')

In [None]:
model.save_pretrained("/content/drive/MyDrive/dl/my_model")
tokenizer.save_pretrained("/content/drive/MyDrive/dl/my_model")

('/content/drive/MyDrive/dl/my_model/tokenizer_config.json',
 '/content/drive/MyDrive/dl/my_model/special_tokens_map.json',
 '/content/drive/MyDrive/dl/my_model/vocab.json',
 '/content/drive/MyDrive/dl/my_model/merges.txt',
 '/content/drive/MyDrive/dl/my_model/added_tokens.json',
 '/content/drive/MyDrive/dl/my_model/tokenizer.json')

In [3]:
# --------------------
# 5. Inference & Chat
# --------------------
class CodeAssistant:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.model.eval()

    def generate_answer(self, question):
        input_text = f"Question: {question}"
        input_ids = self.tokenizer.encode(input_text, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(input_ids, max_length=128)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def execute_code(self, code):
        try:
            with open("temp.py", "w") as f:
                f.write(code)
            result = subprocess.run(["python", "temp.py"], capture_output=True, text=True)
            return f"Output:\n{result.stdout}\nErrors:\n{result.stderr}"
        except Exception as e:
            return f"Error: {str(e)}"

    def respond(self, question):
        answer = self.generate_answer(question)
        if "```python" in answer:
            code = answer.split("```python")[1].split("```")[0].strip()
            execution_result = self.execute_code(code)
            return f"{answer}\n\n{execution_result}"
        return answer

# --------------------
# 6. Gradio Interface
# --------------------
def launch_interface(assistant):
    return gr.Interface(
        fn=assistant.respond,
        inputs=gr.Textbox(lines=2, placeholder="Ask a coding question..."),
        outputs="text",
        title="Code Assistant",
        examples=[
            ["Write a Python function to reverse a string"],
            ["How to calculate factorial in JavaScript?"],
            ["Explain recursion with an example"]
        ]
    )

In [8]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.20.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

In [7]:
# Initialize assistant
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
model_path = "/content/drive/MyDrive/dl/my_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

assistant = CodeAssistant(model, tokenizer)

# Launch interface
interface = launch_interface(assistant)
interface.launch()

ModuleNotFoundError: No module named 'gradio'