In [None]:
!pip install openai python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [None]:
import os
import openai
import time

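# No key is assigned in this notebook: the openai client reads it from the
# OPENAI_API_KEY environment variable, which must be set before running this cell.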

def upload_file(file_path="dataset.jsonl"):
    """Upload a training file to OpenAI with purpose "fine-tune".

    Args:
        file_path: Path of the file to upload. Defaults to "dataset.jsonl",
            the path this function used before it became a parameter.

    Returns:
        The `id` attribute of the API response (the uploaded file's ID).

    Raises:
        OSError: If `file_path` cannot be opened for reading.
    """
    print("Uploading dataset file...")
    # Opened in binary mode; the handle is closed as soon as the upload call returns.
    with open(file_path, "rb") as file:
        response = openai.File.create(
            file=file,
            purpose="fine-tune"
        )
    file_id = response.id
    print(f"File uploaded successfully. File ID: {file_id}")
    return file_id

def create_fine_tune(file_id, model="gpt-3.5-turbo", n_epochs=2, suffix="hinglish-assistant"):
    """Create a fine-tuning job for an already uploaded training file.

    Args:
        file_id: ID of the uploaded training file.
        model: Base model to fine-tune. Defaults to "gpt-3.5-turbo".
        n_epochs: Number of training epochs. Defaults to 2.
        suffix: Custom suffix for the fine-tuned model name.
            Defaults to "hinglish-assistant".

    Returns:
        The `id` attribute of the API response (the job ID).
    """
    print("Creating fine-tuning job...")
    # The epoch count is not passed as a top-level argument; it goes inside
    # the `hyperparameters` dictionary.
    response = openai.FineTuningJob.create(
        training_file=file_id,
        model=model,
        hyperparameters={"n_epochs": n_epochs},
        suffix=suffix,
    )
    job_id = response.id
    print(f"Fine-tuning job created. Job ID: {job_id}")
    return job_id

def check_status(job_id):
    """Retrieve a fine-tuning job and report its status.

    Args:
        job_id: ID of the fine-tuning job to look up.

    Returns:
        The `status` attribute of the retrieved job.
    """
    print("Checking job status...")
    current = openai.FineTuningJob.retrieve(job_id).status
    print(f"Current status: {current}")
    return current

def main():
    """Upload the dataset, start a fine-tuning job, and print follow-up hints.

    The steps run in order: upload the file, pause for five seconds, create
    the job, query its status once, then print how to poll the job and what
    the resulting model name looks like.
    """
    # Step 1: upload the dataset file
    uploaded_id = upload_file()

    # Fixed pause intended to let the uploaded file finish processing
    print("Waiting for file processing...")
    time.sleep(5)

    # Step 2: create the fine-tuning job
    job_id = create_fine_tune(uploaded_id)

    # Step 3: report the initial status (check_status prints it itself)
    check_status(job_id)

    closing_lines = (
        "\nFine-tuning job has been started!",
        "You can check the status of your fine-tuning job with:",
        f"openai.FineTuningJob.retrieve('{job_id}')",
        "\nOnce completed, you can use your fine-tuned model with:",
        "ft:gpt-3.5-turbo:[org]:hinglish-assistant:[suffix]",
        "\nNote: The actual model name will be provided when the job completes.",
    )
    for line in closing_lines:
        print(line)


if __name__ == "__main__":
    main()

Uploading dataset file...
File uploaded successfully. File ID: file-2C9RQ87kKyEMjan7v85sde
Waiting for file processing...
Creating fine-tuning job...


InvalidRequestError: You exceeded your current quota, please check your plan and billing details.

In [2]:
# fine_tune.py - Script to fine-tune DistilBERT on Hinglish dataset
import json
import numpy as np
import torch
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
from torch.optim import AdamW  # Import AdamW from torch.optim instead of transformers
import torch.nn as nn
import torch.nn.functional as F

# Seed NumPy and PyTorch from one constant so repeated runs draw the same numbers
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

print("Loading dataset...")

# Load the dataset.jsonl file
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of being handed to the JSON parser, which would reject them.

    Args:
        file_path: Path of the UTF-8 encoded .jsonl file.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    return data

# Read every record of the training file
dataset = load_jsonl('dataset.jsonl')
print(f"Loaded {len(dataset)} examples from dataset.jsonl")

# Collect prompt and completion texts with the role markers removed.
# str.replace leaves the text untouched when the marker is absent, so no
# membership test is needed before calling it.
prompts, completions = [], []
for record in dataset:
    prompts.append(record['prompt'].replace("User: ", ""))
    completions.append(record['completion'].replace("Assistant: ", ""))

# Lookup table used at inference time; when a prompt text occurs more than
# once, the completion of its last occurrence is the one kept.
prompt_completion_dict = {
    prompt_text: completion_text
    for prompt_text, completion_text in zip(prompts, completions)
}

# Persist the lookup table so it can be reloaded for inference
with open('prompt_completion_dict.json', 'w', encoding='utf-8') as mapping_file:
    json.dump(prompt_completion_dict, mapping_file, ensure_ascii=False, indent=2)

# Create positive examples (matching prompt-completion pairs)
positive_examples = [
    {'prompt': prompt_text, 'completion': completion_text, 'label': 1}
    for prompt_text, completion_text in zip(prompts, completions)
]

# Create negative examples (mismatched prompt-completion pairs)
negative_examples = []
for i in range(len(prompts)):
    # Candidates are completions whose TEXT differs from the correct one.
    # Excluding only index i would let a duplicate of the correct completion
    # be labelled as a mismatch. When all completions are distinct this is
    # the same candidate list as "every index except i", so the random draws
    # are unchanged.
    other_indices = [j for j in range(len(completions)) if completions[j] != completions[i]]
    if other_indices:  # empty when no different completion exists for this prompt
        j = np.random.choice(other_indices)
        negative_examples.append({
            'prompt': prompts[i],
            'completion': completions[j],
            'label': 0  # Negative example
        })

# Combine positive and negative examples
all_examples = positive_examples + negative_examples
np.random.shuffle(all_examples)

# Split into training and validation sets (80% / 20%, fixed random_state)
train_examples, val_examples = train_test_split(all_examples, test_size=0.2, random_state=42)

print(f"Training examples: {len(train_examples)}")
print(f"Validation examples: {len(val_examples)}")

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Custom dataset class
class HinglishDataset(Dataset):
    """Dataset of labelled prompt/completion pairs, tokenized on access.

    Args:
        examples: Sequence of dicts with the keys 'prompt', 'completion'
            and 'label'.
        tokenizer: Callable invoked as
            `tokenizer(prompt, completion, truncation=True, max_length=...,
            padding='max_length', return_tensors='pt')`; its result must be
            indexable by 'input_ids' and 'attention_mask'.
        max_length: Length passed to the tokenizer for truncation and padding.
    """

    def __init__(self, examples, tokenizer, max_length=128):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize example `idx` and return its tensors.

        Returns:
            A dict with 'input_ids' and 'attention_mask' (the tokenizer
            output without its leading batch axis) and 'label' as a long
            tensor.
        """
        example = self.examples[idx]

        # Tokenize the text pair
        encoding = self.tokenizer(
            example['prompt'],
            example['completion'],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # Remove only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis whenever its length is 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(example['label'], dtype=torch.long)
        }

# Wrap each split in a dataset that tokenizes its examples on access
train_dataset, val_dataset = (
    HinglishDataset(split, tokenizer) for split in (train_examples, val_examples)
)

# Batches of four; only the training loader reshuffles its examples
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=4)

# Define the model
class HinglishClassifier(nn.Module):
    """Pretrained DistilBERT encoder followed by a linear layer with two outputs."""

    def __init__(self):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        # 768 input features (DistilBERT hidden size) mapped to 2 class logits
        self.classifier = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for a batch of encoded inputs."""
        encoded = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is the [CLS] token representation
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.classifier(first_token)

# Build the classifier first so that it exists before the device is reported
model = HinglishClassifier()

# Prefer the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
model.to(device)

# Optimizer over all model parameters, and the classification loss
optimizer = AdamW(params=model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Number of passes over the training data
num_epochs = 5
print(f"Training for {num_epochs} epochs...")

def _batch_to_device(batch, device):
    """Return (input_ids, attention_mask, labels) of one batch moved to `device`."""
    return (
        batch['input_ids'].to(device),
        batch['attention_mask'].to(device),
        batch['label'].to(device),
    )


for epoch in range(num_epochs):
    # ---- training pass: one optimizer step per batch ----
    model.train()
    train_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = _batch_to_device(batch, device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- validation pass: no gradients, accumulate loss and hit count ----
    model.eval()
    val_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = _batch_to_device(batch, device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Predicted class = index of the largest logit in each row
            predicted = torch.max(outputs, 1).indices
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Turn the summed batch losses into per-batch averages
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    accuracy = correct / total

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")

# Output locations for the trained weights and the tokenizer files
model_path = 'model/hinglish_model.pt'
tokenizer_path = 'model/tokenizer'

# Make sure the target directory exists before anything is written into it
os.makedirs('model', exist_ok=True)

# Only the state dict (parameter tensors) is stored, not the module object
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

# Store the tokenizer next to the weights
tokenizer.save_pretrained(tokenizer_path)
print(f"Tokenizer saved to {tokenizer_path}")

# Announce the embedding step that follows
print("Creating embeddings for all prompts...")

def get_embedding(text, tokenizer, model, device):
    """Encode `text` and return its first-token hidden state as a NumPy array.

    Args:
        text: Input passed to the tokenizer.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        model: Object exposing `eval()` and a `distilbert` encoder.
        device: Device the input tensors are moved to.

    Returns:
        The encoder's `last_hidden_state[:, 0, :]`, moved to the CPU and
        converted with `.numpy()`.
    """
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        hidden_states = model.distilbert(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        ).last_hidden_state
        first_token = hidden_states[:, 0, :].cpu().numpy()

    return first_token

# One embedding per distinct prompt text, keyed by the prompt itself
prompt_embeddings = {
    prompt_text: get_embedding(prompt_text, tokenizer, model, device)
    for prompt_text in prompts
}

# The whole dict is stored under the single key 'embeddings'. Because it is a
# Python dict rather than an array, the loader has to read it back with
# allow_pickle=True and unwrap it with .item().
np.savez_compressed('model/prompt_embeddings.npz', embeddings=prompt_embeddings)
print("Embeddings saved to model/prompt_embeddings.npz")

print("Fine-tuning completed successfully!")

Loading dataset...
Loaded 15 examples from dataset.jsonl
Training examples: 24
Validation examples: 6


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Using device: cuda
Training for 5 epochs...
Epoch 1/5, Train Loss: 0.7276, Val Loss: 0.7225, Accuracy: 0.5000
Epoch 2/5, Train Loss: 0.7030, Val Loss: 0.7163, Accuracy: 0.3333
Epoch 3/5, Train Loss: 0.6261, Val Loss: 0.7327, Accuracy: 0.3333
Epoch 4/5, Train Loss: 0.5887, Val Loss: 0.8000, Accuracy: 0.5000
Epoch 5/5, Train Loss: 0.4792, Val Loss: 1.0007, Accuracy: 0.5000
Model saved to model/hinglish_model.pt
Tokenizer saved to model/tokenizer
Creating embeddings for all prompts...
Embeddings saved to model/prompt_embeddings.npz
Fine-tuning completed successfully!


In [3]:
# inference.py - Script to test the fine-tuned model
import json
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_similarity

print("Loading model and data for inference...")

# Same architecture as the training cell: the saved state dict is loaded into
# the attributes `distilbert` and `classifier`, so their names must not change.
class HinglishClassifier(nn.Module):
    """DistilBERT encoder with a two-output linear layer on the first token."""

    def __init__(self):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.classifier = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        """Return logits computed from the first token's hidden state."""
        hidden_states = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        return self.classifier(hidden_states[:, 0, :])

# Load the tokenizer saved by the training cell
tokenizer = DistilBertTokenizer.from_pretrained('model/tokenizer')

# Initialize the model
model = HinglishClassifier()

# Load the saved model weights onto whichever device is available.
# NOTE(security): torch.load deserializes the checkpoint file; only load
# checkpoints that this notebook produced itself.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.load_state_dict(torch.load('model/hinglish_model.pt', map_location=device))
model.to(device)
model.eval()

# Load the prompt-completion dictionary
with open('prompt_completion_dict.json', 'r', encoding='utf-8') as f:
    prompt_completion_dict = json.load(f)

# Load the pre-computed embeddings. The archive object returned by np.load
# keeps the .npz file open, so it is used as a context manager and closed as
# soon as the dict has been extracted.
# NOTE(security): allow_pickle=True is needed because the embeddings were
# saved as a Python dict; unpickling can execute code, so only load files
# produced by the training cell.
with np.load('model/prompt_embeddings.npz', allow_pickle=True) as data:
    prompt_embeddings = data['embeddings'].item()

# Function to get embedding for a new query
def get_embedding(text, tokenizer, model, device):
    """Return the encoder's first-token hidden state for `text` as a NumPy array.

    The model is put in eval mode and the forward pass runs without gradient
    tracking. The input is tokenized with padding, truncation and a maximum
    length of 128.
    """
    model.eval()
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        encoder_output = model.distilbert(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        # Keep only position 0 of the sequence axis and hand it back on the CPU
        first_token = encoder_output.last_hidden_state[:, 0, :].cpu().numpy()

    return first_token

# Function to find the most similar prompt
def find_most_similar_prompt(query, prompt_embeddings, tokenizer, model, device):
    """Return the stored prompt whose embedding is most similar to the query.

    Args:
        query: Text to embed and compare.
        prompt_embeddings: Mapping from prompt text to its stored embedding.
        tokenizer, model, device: Forwarded unchanged to `get_embedding`.

    Returns:
        The key of `prompt_embeddings` with the highest cosine similarity to
        the query embedding; on ties, the first such key in iteration order.

    Raises:
        ValueError: If `prompt_embeddings` is empty.
    """
    # Fail early with a clear message, before running the model for nothing.
    if not prompt_embeddings:
        raise ValueError("prompt_embeddings is empty; there are no stored prompts to compare against")

    query_embedding = get_embedding(query, tokenizer, model, device)

    def similarity_to_query(prompt):
        # cosine_similarity returns a 2-D result; [0][0] is its single entry
        return cosine_similarity(query_embedding, prompt_embeddings[prompt])[0][0]

    return max(prompt_embeddings, key=similarity_to_query)

# Function to generate a response
def generate_response(query, prompt_embeddings, prompt_completion_dict, tokenizer, model, device):
    """Return the stored completion of the prompt most similar to `query`.

    The closest prompt is chosen by `find_most_similar_prompt` and then used
    as the key into `prompt_completion_dict`.
    """
    matched_prompt = find_most_similar_prompt(
        query, prompt_embeddings, tokenizer, model, device
    )
    return prompt_completion_dict[matched_prompt]

# Fixed sample queries used as a quick smoke test
test_prompts = [
    "Kal ka plan kya hai?",
    "Kuch khaas news?",
    "Coffee peene chalein?"
]

# Answer each sample query, echo the exchange, and keep it for the report
print("\nTesting the Hinglish assistant model with sample prompts:\n")
test_results = []

for prompt in test_prompts:
    print(f"User: {prompt}")
    response = generate_response(prompt, prompt_embeddings, prompt_completion_dict, tokenizer, model, device)
    # The trailing "\n" yields the blank line that separates exchanges
    print(f"Assistant: {response}\n")
    test_results.append(dict(prompt=prompt, response=response))

# Write all exchanges to disk as readable UTF-8 JSON
with open('test_results.json', 'w', encoding='utf-8') as results_file:
    json.dump(test_results, results_file, ensure_ascii=False, indent=2)

print("Test results saved to test_results.json")

# Interactive mode: answer typed queries until the user enters 'quit'
print("\nEnter 'quit' to exit")
while True:
    try:
        user_input = input("User: ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: leave the loop
        # cleanly instead of ending the cell with a traceback.
        print()
        break

    # Ignore surrounding whitespace so that e.g. "quit " still exits
    user_input = user_input.strip()
    if user_input.lower() == 'quit':
        break
    if not user_input:
        # Nothing was typed; ask again rather than querying the model with ""
        continue

    response = generate_response(user_input, prompt_embeddings, prompt_completion_dict, tokenizer, model, device)
    print(f"Assistant: {response}")

print("Inference completed.")

Loading model and data for inference...

Testing the Hinglish assistant model with sample prompts:

User: Kal ka plan kya hai?
Assistant: Yahan aaj thoda cloudy hai but baarish nahi ho rahi. Temperature bhi pleasant hai, na zyada garmi na zyada thand.

User: Kuch khaas news?
Assistant: Aap pehle phone restart karke dekhiye. Agar problem persist kare toh unnecessary apps delete kar dijiye aur storage clear kijiye.

User: Coffee peene chalein?
Assistant: Maine aaj paneer butter masala aur roti banayi. Simple but tasty meal tha. Aapne kya khaya?

Test results saved to test_results.json

Enter 'quit' to exit
User: Kal ka plan kya hai?
Assistant: Yahan aaj thoda cloudy hai but baarish nahi ho rahi. Temperature bhi pleasant hai, na zyada garmi na zyada thand.
User: quit
Inference completed.
