<a href="https://colab.research.google.com/github/ayaan6pc-cpu/Ayaan-dAPP/blob/main/Ubermensch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Übermensch AI Fine-tuning
# Mount Drive and Install
from google.colab import drive
drive.mount('/content/drive')

!pip install -q transformers datasets torch accelerate evaluate rouge_score
!pip install -q huggingface_hub gradio
!pip install --upgrade transformers datasets


print("✅ Environment setup complete")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Environment setup complete


In [2]:
# Load the philosophical dataset
from datasets import load_dataset
import json

# Load your seed dataset
dataset_path = "/content/drive/MyDrive/Ubermensch_AI/data/seed_dataset.jsonl"
dataset = load_dataset("json", data_files=dataset_path, split="train")

# Create train/eval split for proper ML pipeline
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

print(f"📊 Dataset loaded:")
print(f"  - Training examples: {len(train_dataset)}")
print(f"  - Evaluation examples: {len(eval_dataset)}")
print(f"  - Sample: {train_dataset[0]}")

Generating train split: 0 examples [00:00, ? examples/s]

📊 Dataset loaded:
  - Training examples: 96
  - Evaluation examples: 24
  - Sample: {'instruction': 'What is the Übermensch’s relation to society?', 'response': 'The Übermensch stands apart from society, often misunderstood or resisted. Yet they shape the future by creating values that may later guide humanity.'}


In [3]:
# Using a reliable, small model DistilGPT_2
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch

# DistilGPT-2: Perfect size for resume demo (82M parameters)
model_name = "distilgpt2"
print(f"🤖 Using model: {model_name}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Remove torch_dtype=torch.float16 here
    device_map="auto"
)

# Explicitly cast model parameters to FP32 after loading
model.to(torch.float32)


print(f"✅ Model loaded: {model.num_parameters():,} parameters")

🤖 Using model: distilgpt2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ Model loaded: 81,912,576 parameters


In [4]:
# Professional data preprocessing with logging

def preprocess_function(examples):
    """Tokenize a batch of examples for training"""
    # Combine instruction and response for each item in the batch
    texts = [
        f"### Question: {q}\n### Answer: {a}<|endoftext|>"
        for q, a in zip(examples['instruction'], examples['response'])
    ]

    # Tokenize the formatted texts
    model_inputs = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    # Set labels for language modeling
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# Apply preprocessing
print("🔄 Preprocessing datasets...")
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)
tokenized_eval = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_dataset.column_names
)

print("✅ Preprocessing complete")
print(f"  - Tokenized training samples: {len(tokenized_train)}")
# This line will now work correctly
print(f"  - Average input length: {sum(len(x) for x in tokenized_train['input_ids']) / len(tokenized_train):.1f}")

🔄 Preprocessing datasets...


Map:   0%|          | 0/96 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

✅ Preprocessing complete
  - Tokenized training samples: 96
  - Average input length: 256.0


In [5]:
# Professional training configuration
from transformers import EarlyStoppingCallback
import numpy as np

# Output directory
output_dir = "/content/drive/MyDrive/ubermensch-ai/distilgpt2-finetuned"

# Training arguments optimized for small dataset
training_args = TrainingArguments(
    output_dir=output_dir,

    # Training schedule
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,

    # Optimization
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=100,

    # Evaluation & Saving
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # Logging
    logging_dir=f"{output_dir}/logs",
    logging_steps=25,
    report_to="none",

    # Performance
    fp16=False, # Disable FP16 to avoid gradient unscaling issues
    dataloader_pin_memory=False,
    remove_unused_columns=False,
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

print("✅ Training configuration ready")
print(f"  - Total training steps: {len(tokenized_train) * training_args.num_train_epochs // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)}")
print(f"  - Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

✅ Training configuration ready
  - Total training steps: 60
  - Effective batch size: 8


In [6]:
# Initialize trainer with evaluation
from torch.optim import AdamW
from transformers import Trainer, DataCollatorForLanguageModeling, EarlyStoppingCallback

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    # Removed explicit optimizers argument to let Trainer manage it
    # optimizers=(AdamW(model.parameters(), lr=training_args.learning_rate), None)
)

print("🚀 Starting training...")
print("This will take approximately 15-20 minutes")

# Add print statement to check fp16 value
print(f"⚡️ TrainingArguments fp16 setting: {training_args.fp16}")

# Train the model
training_results = trainer.train()

# Save final model
trainer.save_model()
tokenizer.save_pretrained(output_dir)

print("✅ Training completed!")
# The training_loss is a float, so no need for the check here.
print(f"📊 Final training loss: {training_results.training_loss:.4f}")


# Print training summary
print("\n📈 Training Summary:")
for log in trainer.state.log_history[-5:]:  # Last 5 logs
    if 'eval_loss' in log:
        train_loss_value = log.get('train_loss', 'N/A')
        # Check if train_loss_value is a number before formatting
        if isinstance(train_loss_value, (int, float)):
            print(f"  Step {log['step']}: Train Loss={train_loss_value:.4f}, Eval Loss={log['eval_loss']:.4f}")
        else:
             # Print as string if not a number
            print(f"  Step {log['step']}: Train Loss={train_loss_value}, Eval Loss={log['eval_loss']:.4f}")

🚀 Starting training...
This will take approximately 15-20 minutes
⚡️ TrainingArguments fp16 setting: False


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
50,3.4049,3.169999


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


✅ Training completed!
📊 Final training loss: 3.6455

📈 Training Summary:
  Step 50: Train Loss=N/A, Eval Loss=3.1700


In [23]:
# Load the trained model for evaluation
print("🔄 Loading trained model...")

model = AutoModelForCausalLM.from_pretrained(
    output_dir,
    # torch_dtype=torch.float16, # Keep as is, deprecation warning is fine for now
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

def generate_response(question, max_length=250): # Increased max_length again
    """Generate response using the fine-tuned model"""
    prompt = f"### Question: {question}\n### Answer:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.9,  # Slightly increased temperature
            top_p=0.95, # Slightly increased top_p
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=5 # Increased no_repeat_ngram_size
        )

    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = full_response.split("### Answer:")[-1].strip()

    # Post-process the answer to remove unwanted text and patterns
    lines = answer.split('\n')
    cleaned_lines = []
    for line in lines:
        line = line.strip()
        # Remove lines that start with specific unwanted phrases or contain specific characters/patterns
        if not line.startswith("Explore further:") and \
           not line.startswith("Source:") and \
           "©" not in line and "Copyright" not in line and \
           not line.startswith("#") and \
           not all(c in "#*@-" for c in line) and \
           not line.startswith("About:") and \
           not line.startswith("Explore the full text of the new book:") and \
           not line.startswith("About the Author:") and \
           not line.startswith("_____ SOURCE:") and \
           not line.startswith("—") and \
           not line.startswith("http") and \
           not line.startswith("[") and \
           not all(c in "_.#@-" for c in line) and \
           len(line) > 1 and \
           "###" not in line and \
           "Answer:" not in line and \
           "Question:" not in line:
            cleaned_lines.append(line)

    cleaned_answer = '\n'.join(cleaned_lines).strip()

    # Further cleaning for remaining unwanted characters or patterns within lines
    cleaned_answer = cleaned_answer.replace("‏", "").replace("######", "").replace("“", "").replace("”", "").replace("Ørzeichtschundt", "").replace("曉你域婆手。", "").replace("轊宮陸房不能建空。", "").replace("寏要筆拪少場。", "").replace("ぞど。", "").replace("‡††††‡†‡‡†‡••‡›‡‣‡‧‣․‡․‡…‧•‥‡‘‘‡‘‧†•…‡’‡‪…‣‣†․․•․‪‧‡․‣‥‧․‧‪‣‧‟•․‘’•", "").replace("(###)", "").replace(".", "").replace("#", "")


    # Added more specific cleaning for repetitive patterns observed in outputs
    unwanted_phrases = [
        "The Übermenschen is the Übersmenschen, the Übermanschen, the human race, the Übersmolemensch, and the Übers.",
        "The Übersmensch is the Übsmenschen, Übersmenschnachs, and the human race.",
        "The Übermenchen is the individual, the Übsmolemenschen, and the woman.",
        "The Übsmensch is not the Übersmenchen.",
        "Failure is not a failure.",
        "Failure is the result of the strength and will of others.",
        "Failure is merely one of the characteristics of success.",
        "Failure is what inspires success.",
        "Failure means failure can never be overcome and it is the result that inspires it.",
        "Failure is an act of triumph that is ultimately destructive to a whole other.",
        "Failure is a manifestation of the strength of self-doubt.",
        "Failure is something that leads to failure.",
        "Failure can be a tool to create, to destroy, to enslave, to destroy.",
        "Failure is one of the things that leads to success.",
        "Success is the gift of success.",
        "Strength is strength.",
        "Strength requires honesty and courage."
    ]
    for phrase in unwanted_phrases:
        cleaned_answer = cleaned_answer.replace(phrase, "").strip()

    # Remove any resulting empty lines or excessive whitespace
    cleaned_answer = '\n'.join([line for line in cleaned_answer.split('\n') if line.strip()])


    return cleaned_answer

# Test on evaluation set
print("🧪 Testing model performance:")
test_questions = [
    "What is the Übermensch?",
    "How should I handle failure?",
    "What is true strength?",
    "Should I care what others think?",
    "What is the meaning of life?",
    "I'm depressed. What should I do?" # Added the new question here
]

print("\n" + "="*60)
print("MODEL EVALUATION RESULTS")
print("="*60)

for i, question in enumerate(test_questions, 1):
    answer = generate_response(question)
    print(f"\n{i}. Q: {question}")
    print(f"   A: {answer}") # Corrected line
    print("-" * 40)

🔄 Loading trained model...
🧪 Testing model performance:

MODEL EVALUATION RESULTS

1. Q: What is the Übermensch?
   A: An Übermensch has an infinite amount of power
----------------------------------------

2. Q: How should I handle failure?
   A: You should not be afraid of failure The simple truth is that failure is a gift, not a strength Failure is not the end, but an end Failure is simply your attempt to be yourself Failure is a failure that allows you to make yourself stronger than ever before
----------------------------------------

3. Q: What is true strength?
   A: The strength of strength is your strength It is the most powerful tool of strength It can be a gift to others You cannot be afraid of self-discovery until you become stronger
----------------------------------------

4. Q: Should I care what others think?
   A: Do not worry about being labeled a demon Only act as your own personal servant if you need your help
----------------------------------------

5. Q: What is 

In [25]:
# Test with new questions
new_questions = [
    "What is the role of suffering in personal growth?",
    "How does one find meaning in a chaotic world?",
    "What is the relationship between freedom and responsibility?",
    "Can true happiness be achieved?",
    "What is the nature of consciousness?"
]

print("\n" + "="*60)
print("TESTING WITH NEW QUESTIONS (After Cleaning)")
print("="*60)

for i, question in enumerate(new_questions, 1):
    answer = generate_response(question)
    print(f"\n{i}. Q: {question}")
    print(f"   A: {answer}")
    print("-" * 40)


TESTING WITH NEW QUESTIONS (After Cleaning)

1. Q: What is the role of suffering in personal growth?
   A: Learn to accept what you have to lose and learn from yourself
----------------------------------------

2. Q: How does one find meaning in a chaotic world?
   A: In the end, you are the right person
----------------------------------------

3. Q: What is the relationship between freedom and responsibility?
   A: Responsibility is the responsibility to care about your own well-being It is the power to make your own choices Freedom is the ultimate expression of the true self-interest of others When you make choices and realize the real power, you will transform yourself into your greatest power
----------------------------------------

4. Q: Can true happiness be achieved?
   A: True happiness transcends the start of self-actualize True happiness transcended self-actualize—the end True happiness transcend self-actualize; self-expression—unbreakable True happiness transcenders self-

In [20]:
# Test with new questions
new_questions = [
    "What is the role of suffering in personal growth?",
    "How does one find meaning in a chaotic world?",
    "What is the relationship between freedom and responsibility?",
    "Can true happiness be achieved?",
    "What is the nature of consciousness?"
]

print("\n" + "="*60)
print("TESTING WITH NEW QUESTIONS (After Improved Cleaning)")
print("="*60)

for i, question in enumerate(new_questions, 1):
    answer = generate_response(question)
    print(f"\n{i}. Q: {question}")
    print(f"   A: {answer}")
    print("-" * 40)


TESTING WITH NEW QUESTIONS (After Improved Cleaning)

1. Q: What is the role of suffering in personal growth?
   A: suffering is about not judging others in their capacity to see and control their emotions. This is not the kind of thing which is easy to overcome, but can be found in the ability to overcome.
----------------------------------------

2. Q: How does one find meaning in a chaotic world?
   A: Through the power of the free will, power comes from the freedom to do what you want to do. When you can create a future, you can transform it into something you love and value—something you never dreamed would come true.
----------------------------------------

3. Q: What is the relationship between freedom and responsibility?
   A: Freedom is the freedom to grow your own strengths and abilities. You grow up feeling self-centered and self-centered—which is your role as a self-doubt. For example, you are more dependent on yourself to be self-sufficient than on yourself. Your ability

In [28]:
# Professional Gradio deployment for resume
import gradio as gr
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the trained model and tokenizer outside of the chat function
# to avoid reloading on every message. However, if the function is not defined,
# we need to load them inside.

# output_dir is defined in ZXnP4d7J2qzh
output_dir = "/content/drive/MyDrive/ubermensch-ai/distilgpt2-finetuned"

# Flag to check if model is loaded
model_loaded = False
model = None
tokenizer = None

def load_model_and_tokenizer():
    """Loads the model and tokenizer."""
    global model, tokenizer, model_loaded
    try:
        print("🔄 Loading trained model...")
        model = AutoModelForCausalLM.from_pretrained(
            output_dir,
            # Removed torch_dtype=torch.float16 here
            device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(output_dir)
        model_loaded = True
        print("✅ Model loaded successfully.")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        model_loaded = False

def chat_with_model(message, history):
    """Chat interface for the fine-tuned model"""
    global model, tokenizer, model_loaded

    # Load model if not already loaded
    if not model_loaded:
        load_model_and_tokenizer()

    if not model_loaded:
        return "I apologize, but I could not load the model. Please ensure all previous cells ran successfully."


    # Basic safety check
    if any(word in message.lower() for word in ["kill", "harm", "hurt", "destroy"]):
        return "I focus on constructive philosophical guidance. Please ask about personal growth and wisdom."

    try:
        # Ensure generate_response is defined (it should be if the model is loaded)
        if 'generate_response' not in globals():
             return "I apologize, but the 'generate_response' function is not available. Please ensure all previous cells ran successfully."


        response = generate_response(message, max_length=100)

        # Clean up response
        response = response.split('\n')[0].strip()  # Take first line
        if len(response) < 10:  # If too short, try again
            response = generate_response(message, max_length=150)


        return response

    except Exception as e:
        return f"I apologize, but I encountered an error: {str(e)[:50]}..."

# Create professional interface
css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.header {
    text-align: center;
    margin-bottom: 20px;
}
"""

demo = gr.ChatInterface(
    chat_with_model,
    title="🧠 Übermensch AI - Fine-tuned Language Model",
    description="""
    **Resume Project Demonstration**

    This is a fine-tuned DistilGPT-2 model trained on philosophical principles inspired by Nietzschean thought.

    **Technical Details:**
    - Base Model: DistilGPT-2 (82M parameters)
    - Training Data: 75 instruction-response pairs
    - Fine-tuning Method: Causal Language Modeling
    - Evaluation: Train/validation split with early stopping

    **Use Cases:** Personal development, philosophical inquiry, decision-making frameworks

    *Disclaimer: This is an AI simulation for my own fun.*
    """,
    examples=[
        "What is the Übermensch?",
        "How should I approach challenges?",
        "What defines true strength?",
        "How do I create meaning in life?",
    ],
    css=css,
    theme=gr.themes.Soft(),
)

print("🌟 Launching professional demo...")
demo.launch(share=True, debug=False)
print("\n🔗 Your model is now deployed with a public link!")

  self.chatbot = Chatbot(


🌟 Launching professional demo...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://63e26b47b1e7959cf3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



🔗 Your model is now deployed with a public link!


# Task
Generate resume bullet points based on the executed cells in the notebook, following Google's XYZ template.

## Analyze notebook cells

### Subtask:
Review each executed code cell and its outputs to identify the key tasks performed and technologies used (e.g., data loading, preprocessing, model training, evaluation, deployment).


**Reasoning**:
Reviewing the executed cells and their outputs to identify the key tasks and technologies used for generating resume bullet points.

