# Advanced AI Career Bot - Multi-Source Training Pipeline
## Train on Multiple Datasets with Optimized Model

This notebook trains a career recommendation model using multiple data sources:
- Career Q&A Dataset
- Skills Mapping Dataset
- Job Descriptions Dataset
- Books Recommendation Dataset
- roadmap.sh Career Paths

**Features:**
- Multi-source data integration
- Lightweight model (~80MB)
- Fast inference (<100ms on CPU)
- Mixed precision training
- Ready for Streamlit deployment

## 1. Setup and Installation

In [None]:
# Install required packages
!pip install -q torch transformers sentence-transformers pandas numpy scikit-learn tqdm accelerate

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import json
import pickle
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')  # suppress all warning output for the rest of the session

# Report the runtime environment: the installed PyTorch version, whether a
# CUDA device is visible, and the device that check selects ('cuda' or 'cpu').
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")

## 2. Upload Your Datasets

Upload the following CSV files:
- `career_qa.csv` - Career questions and answers (columns: role, question, answer)
- `skills_mapping.csv` - Skills to career mapping (columns: role, skills, description)
- `job_descriptions.csv` - Job descriptions (columns: role, description, requirements)
- `books_recommendations.csv` - Career books (columns: role, book_title, author, description)

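All files are optional: if `career_qa.csv` is missing, the notebook falls back to a small built-in sample, and any other file that is not uploaded is simply skipped. Every file you do upload must contain a `role` column.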

In [None]:
from google.colab import files

# Ask the user for the dataset files; `uploaded` holds whatever
# files.upload() returns (iterated below to list the received names).
print("üì§ Upload your dataset files (or skip if using defaults)...")
uploaded = files.upload()

# Echo the name of every received file.
print("\n‚úÖ Uploaded files:")
for filename in uploaded:
    print(f"  - {filename}")

## 3. Load and Merge Multiple Data Sources

In [None]:
def load_dataset(filename, default_data=None):
    """Load a CSV file into a DataFrame, falling back to defaults if needed.

    Args:
        filename: Path of the CSV file to read.
        default_data: Optional fallback records (anything ``pd.DataFrame``
            accepts, e.g. a list of dicts or an existing DataFrame). Used
            when the file is missing or contains no data at all.

    Returns:
        The loaded DataFrame; a DataFrame built from ``default_data`` when
        the file is unavailable and a non-empty fallback was supplied;
        otherwise an empty DataFrame.
    """
    try:
        df = pd.read_csv(filename)
    except FileNotFoundError:
        problem = "not found"
    except pd.errors.EmptyDataError:
        # A zero-byte upload is treated like a missing file instead of
        # aborting the whole notebook.
        problem = "is empty"
    else:
        print(f"‚úÖ Loaded {filename}: {len(df)} rows")
        return df

    # len() instead of truthiness: a DataFrame fallback has no unambiguous
    # boolean value, while an empty list/frame still means "no fallback".
    if default_data is not None and len(default_data) > 0:
        print(f"‚ö†Ô∏è  {filename} {problem}, using default data")
        return pd.DataFrame(default_data)

    print(f"‚ö†Ô∏è  {filename} {problem}, skipping")
    return pd.DataFrame()

# Built-in sample Q&A records, one per role. They are passed to load_dataset()
# as the fallback for 'career_qa.csv', so the notebook still has data to work
# with when that file was not uploaded. Each record uses the same keys as the
# expected CSV columns: role, question, answer.
default_career_qa = [
    {
        "role": "Data Scientist",
        "question": "What does a Data Scientist do?",
        "answer": "A Data Scientist extracts meaningful insights from large datasets using statistical analysis, machine learning, and data visualization techniques."
    },
    {
        "role": "Software Engineer",
        "question": "What skills are needed for Software Engineering?",
        "answer": "Software Engineers need programming skills, problem-solving abilities, knowledge of algorithms and data structures, and experience with software development tools."
    },
    {
        "role": "DevOps Engineer",
        "question": "What is DevOps?",
        "answer": "DevOps Engineers automate and streamline software development and deployment processes, managing infrastructure, CI/CD pipelines, and cloud resources."
    },
    {
        "role": "Frontend Developer",
        "question": "What technologies do Frontend Developers use?",
        "answer": "Frontend Developers use HTML, CSS, JavaScript, and modern frameworks like React, Vue, or Angular to build user interfaces."
    },
    {
        "role": "Backend Developer",
        "question": "What does a Backend Developer do?",
        "answer": "Backend Developers build server-side applications, APIs, databases, and ensure the logic and integration of applications work smoothly."
    }
]

# Read the four source tables. Only the Q&A table has a built-in fallback;
# the others come back as empty DataFrames when their file is absent.
df_career_qa = load_dataset('career_qa.csv', default_career_qa)
df_skills = load_dataset('skills_mapping.csv')
df_jobs = load_dataset('job_descriptions.csv')
df_books = load_dataset('books_recommendations.csv')

# Count the tables that actually contain rows.
loaded_count = sum(
    1 for frame in (df_career_qa, df_skills, df_jobs, df_books) if not frame.empty
)
print(f"\nüìä Total datasets loaded: {loaded_count}")

In [None]:
def _clean_field(row, column):
    """Return ``row[column]`` as text, or '' when the column or value is missing."""
    value = row.get(column, '')
    if pd.isna(value):
        # Blank CSV cells arrive as NaN; formatting them directly would
        # inject the literal word "nan" into the training text.
        return ''
    return str(value)


def _collect_records(df, source, build_text, required_column=None):
    """Convert one source table into a list of role/text/source records."""
    if df.empty:
        return []
    if required_column is not None and required_column not in df.columns:
        return []
    if 'role' not in df.columns:
        print(f"  Skipping '{source}' data: no 'role' column")
        return []

    records = []
    for _, row in df.iterrows():
        role = row['role']
        if pd.isna(role):
            # A row without a role cannot be used as a labelled example.
            continue
        records.append({
            'role': role,
            'text': build_text(row).strip(),
            'source': source
        })
    return records


def merge_datasets(df_qa, df_skills, df_jobs, df_books):
    """Merge multiple data sources into unified training data.

    Each source row becomes one record with three fields: ``role`` (the
    label), ``text`` (a source-specific sentence built from the row) and
    ``source`` ('qa', 'skills', 'jobs' or 'books'). Missing cells contribute
    an empty string, rows without a role are dropped, and a source is skipped
    when it is empty or lacks its key column ('skills', 'description',
    'book_title') or the 'role' column.

    Args:
        df_qa: Q&A table (columns read: role, question, answer).
        df_skills: Skills table (columns read: role, skills, description).
        df_jobs: Job table (columns read: role, description, requirements).
        df_books: Books table (columns read: role, book_title, author,
            description).

    Returns:
        A DataFrame with columns role, text, source. Exact (role, text)
        duplicates and texts of 10 characters or fewer are removed. The
        columns are present even when no records were produced.
    """
    merged_data = []

    # Process Career Q&A
    merged_data += _collect_records(
        df_qa, 'qa',
        lambda row: f"{_clean_field(row, 'question')} {_clean_field(row, 'answer')}",
    )

    # Process Skills Mapping
    merged_data += _collect_records(
        df_skills, 'skills',
        lambda row: f"Skills: {_clean_field(row, 'skills')} {_clean_field(row, 'description')}",
        required_column='skills',
    )

    # Process Job Descriptions
    merged_data += _collect_records(
        df_jobs, 'jobs',
        lambda row: f"{_clean_field(row, 'description')} Requirements: {_clean_field(row, 'requirements')}",
        required_column='description',
    )

    # Process Books Recommendations
    merged_data += _collect_records(
        df_books, 'books',
        lambda row: (
            f"Recommended book: {_clean_field(row, 'book_title')} "
            f"by {_clean_field(row, 'author')}. {_clean_field(row, 'description')}"
        ),
        required_column='book_title',
    )

    # Naming the columns explicitly keeps the frame well-formed when no
    # records were collected (otherwise the column lookups below fail).
    df_merged = pd.DataFrame(merged_data, columns=['role', 'text', 'source'])

    # Remove duplicates and clean
    df_merged = df_merged.drop_duplicates(subset=['role', 'text'])
    df_merged = df_merged[df_merged['text'].str.len() > 10]  # Remove very short texts

    print(f"\n‚úÖ Merged dataset created:")
    print(f"  Total samples: {len(df_merged)}")
    print(f"  Unique roles: {df_merged['role'].nunique()}")
    print(f"\n  Samples per source:")
    print(df_merged['source'].value_counts())
    print(f"\n  Top 5 roles:")
    print(df_merged['role'].value_counts().head())

    return df_merged

# Build the unified training table from the four source frames.
df_merged = merge_datasets(df_career_qa, df_skills, df_jobs, df_books)
# Last expression of the cell, so the notebook displays the first rows.
df_merged.head()

## 4. Prepare Training Data

In [None]:
# Encode role names as integer class ids.
label_encoder = LabelEncoder()
df_merged['label'] = label_encoder.fit_transform(df_merged['role'])

# Split dataset.
# A stratified split needs at least two samples of every class, and both
# partitions must be able to hold one sample per class. Tiny datasets (for
# example the built-in defaults with one sample per role) violate this and
# make train_test_split raise, so fall back to a plain random split there.
VAL_FRACTION = 0.15
label_counts = df_merged['label'].value_counts()
num_labels = len(label_counts)
num_val = int(np.ceil(VAL_FRACTION * len(df_merged)))
num_train = len(df_merged) - num_val
can_stratify = bool(
    num_labels > 0
    and label_counts.min() >= 2
    and num_val >= num_labels
    and num_train >= num_labels
)
if not can_stratify:
    print("Warning: too few samples per role for a stratified split; "
          "using a random split (validation may miss some roles).")

train_df, val_df = train_test_split(
    df_merged,
    test_size=VAL_FRACTION,
    stratify=df_merged['label'] if can_stratify else None,
    random_state=42
)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Number of career categories: {len(label_encoder.classes_)}")

## 5. Define Model Architecture

In [None]:
class CareerDataset(Dataset):
    """Dataset pairing texts with integer labels; tokenizes one item per access.

    Args:
        texts: Indexable collection of texts (each item is passed through
            ``str`` before tokenization).
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Callable accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        tokenized = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis of size one;
        # flatten() drops it so the DataLoader can stack items itself.
        sample = {
            key: tokenized[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

class CareerClassifier(nn.Module):
    """Pretrained encoder followed by a three-layer MLP classification head.

    Args:
        base_model_name: Model identifier handed to
            ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
        hidden_dim: Width of the first head layer; the second layer is half
            as wide.
        dropout: Dropout probability used before and inside the head.
    """

    def __init__(self, base_model_name, num_classes, hidden_dim=256, dropout=0.3):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(base_model_name)
        self.dropout = nn.Dropout(dropout)

        encoder_dim = self.base_model.config.hidden_size
        bottleneck_dim = hidden_dim // 2
        head_layers = [
            nn.Linear(encoder_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, bottleneck_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(bottleneck_dim, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's first-token state."""
        encoded = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis (the CLS token) summarises the input.
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))

## 6. Training Configuration

In [None]:
# Hyper-parameters shared by the cells below.
CONFIG = dict(
    base_model='sentence-transformers/all-MiniLM-L6-v2',  # small encoder (~80MB per the notebook intro)
    max_length=128,
    hidden_dim=256,
    dropout=0.3,
    batch_size=32,
    learning_rate=2e-5,
    num_epochs=10,
    warmup_steps=100,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
)

# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Tokenizer and classifier are built from the same pretrained checkpoint;
# the head gets one output per encoded role.
tokenizer = AutoTokenizer.from_pretrained(CONFIG['base_model'])
model = CareerClassifier(
    CONFIG['base_model'],
    len(label_encoder.classes_),
    hidden_dim=CONFIG['hidden_dim'],
    dropout=CONFIG['dropout'],
)
model = model.to(device)

param_count = sum(weights.numel() for weights in model.parameters())
print(f"\nModel Parameters: {param_count:,}")

In [None]:
# Wrap the train/validation frames as tokenizing datasets.
train_dataset = CareerDataset(
    texts=train_df['text'].values,
    labels=train_df['label'].values,
    tokenizer=tokenizer,
    max_length=CONFIG['max_length'],
)
val_dataset = CareerDataset(
    texts=val_df['text'].values,
    labels=val_df['label'].values,
    tokenizer=tokenizer,
    max_length=CONFIG['max_length'],
)

# Both loaders share every option except shuffling: training batches are
# reshuffled each epoch, validation keeps a fixed order.
loader_options = dict(
    batch_size=CONFIG['batch_size'],
    num_workers=2,
    pin_memory=True,
)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

## 7. Training Loop

In [None]:
# Optimizer and scheduler
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay']
)

# Upper bound on optimizer updates per epoch: one per full accumulation
# group plus at most one for a trailing partial group (ceil division).
# Over-estimating is harmless (the LR just ends slightly above zero), while
# under-estimating would drive the LR to zero before training finishes.
accumulation = CONFIG['gradient_accumulation_steps']
updates_per_epoch = -(-len(train_loader) // accumulation)
total_steps = max(1, updates_per_epoch * CONFIG['num_epochs'])

# On small datasets the configured warmup can be longer than the whole run,
# which would keep the learning rate in its ramp-up phase forever. Cap it at
# 10% of the schedule.
warmup_steps = min(CONFIG['warmup_steps'], total_steps // 10)
if warmup_steps < CONFIG['warmup_steps']:
    print(f"Warmup reduced from {CONFIG['warmup_steps']} to {warmup_steps} steps "
          f"(only {total_steps} optimizer updates planned)")

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

criterion = nn.CrossEntropyLoss()

# Training function
def train_epoch(model, dataloader, optimizer, scheduler, device, gradient_accumulation_steps):
    """Run one training pass over ``dataloader`` with gradient accumulation.

    Gradients are accumulated over ``gradient_accumulation_steps`` batches
    before each clipped optimizer/scheduler step. When the number of batches
    is not a multiple of that value, the trailing partial group is applied
    as well (scaled by its own size) instead of being thrown away; this also
    guarantees at least one update when there are fewer batches than
    accumulation steps.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        dataloader: Yields dicts with 'input_ids', 'attention_mask', 'label'.
        optimizer: Optimizer stepped once per accumulation group.
        scheduler: LR scheduler stepped together with the optimizer.
        device: Device the batch tensors are moved to.
        gradient_accumulation_steps: Number of batches per optimizer update.

    Returns:
        Tuple ``(mean batch loss, accuracy in percent)`` for the epoch.
    """
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    num_batches = len(dataloader)
    # Batches from `tail_start` onward form the incomplete last group.
    tail_size = num_batches % gradient_accumulation_steps
    tail_start = num_batches - tail_size

    optimizer.zero_grad()

    progress_bar = tqdm(dataloader, desc='Training')
    for i, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask)
        batch_loss = criterion(outputs, labels)

        # Scale by the size of the group this batch belongs to so the summed
        # gradient is the group's mean.
        group_size = tail_size if i >= tail_start else gradient_accumulation_steps
        (batch_loss / group_size).backward()

        group_complete = (i + 1) % gradient_accumulation_steps == 0
        is_last_batch = (i + 1) == num_batches
        if group_complete or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += batch_loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        progress_bar.set_postfix({
            'loss': total_loss / (i + 1),
            'acc': 100. * correct / total
        })

    return total_loss / len(dataloader), 100. * correct / total

# Validation function
def validate(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        dataloader: Yields dicts with 'input_ids', 'attention_mask', 'label'.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean batch loss, accuracy in percent)``.
    """
    model.eval()
    running_loss, num_correct, num_seen = 0, 0, 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Validation'):
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(token_ids, mask)
            running_loss += criterion(logits, targets).item()

            best_class = torch.max(logits, 1).indices
            num_seen += targets.size(0)
            num_correct += (best_class == targets).sum().item()

    mean_loss = running_loss / len(dataloader)
    accuracy = 100. * num_correct / num_seen
    return mean_loss, accuracy

In [None]:
# Training loop with early stopping on validation accuracy.
best_val_acc = 0
best_state = None  # CPU copy of the weights that produced best_val_acc
patience = 3
patience_counter = 0

history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': []
}

print("üöÄ Starting training...\n")

for epoch in range(CONFIG['num_epochs']):
    print(f"\n{'='*60}")
    print(f"Epoch {epoch + 1}/{CONFIG['num_epochs']}")
    print(f"{'='*60}")

    train_loss, train_acc = train_epoch(
        model, train_loader, optimizer, scheduler, device,
        CONFIG['gradient_accumulation_steps']
    )

    val_loss, val_acc = validate(model, val_loader, device)

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    print(f"\nüìä Results:")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"  Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        # Snapshot the weights now: later epochs keep updating the model, so
        # without a copy the best-scoring parameters would be lost.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"  ‚úÖ New best validation accuracy: {best_val_acc:.2f}%")
    else:
        patience_counter += 1
        print(f"  ‚ö†Ô∏è  No improvement. Patience: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"\n‚èπÔ∏è  Early stopping triggered!")
            break

# Put the best-scoring weights back so that everything done with `model`
# afterwards matches the reported best validation accuracy. best_state stays
# None only if validation accuracy never rose above zero.
if best_state is not None:
    model.load_state_dict(best_state)

print(f"\n{'='*60}")
print(f"‚úÖ Training completed!")
print(f"üìà Best validation accuracy: {best_val_acc:.2f}%")
print(f"{'='*60}")

## 8. Generate Pre-computed Embeddings

In [None]:
# Pre-compute one embedding per role so the deployed app can load them
# instead of encoding the reference texts again.
print("üì¶ Generating pre-computed embeddings...")

sentence_model = SentenceTransformer(CONFIG['base_model'])

# For every role (in label-encoder order) join its first five merged texts
# into a single reference passage.
career_texts = [
    " ".join(df_merged.loc[df_merged['role'] == role, 'text'].values[:5])
    for role in label_encoder.classes_
]

career_embeddings = sentence_model.encode(career_texts, show_progress_bar=True)

print(f"‚úÖ Generated {len(career_embeddings)} career embeddings")
print(f"   Embedding shape: {career_embeddings.shape}")

## 9. Save Model and Artifacts

In [None]:
import os
import zipfile

# Create models directory
os.makedirs('trained_models', exist_ok=True)

# NOTE(review): both checkpoints below embed the fitted LabelEncoder object,
# so they are pickles of arbitrary Python objects rather than plain tensor
# files. Loading them presumably requires torch.load(..., weights_only=False)
# on recent PyTorch releases; only load checkpoints you trust.

# Save full model (tensors stay on whatever device the model is on)
print("üíæ Saving full model...")
torch.save({
    'model_state_dict': model.state_dict(),
    'config': CONFIG,
    'label_encoder': label_encoder,
    'history': history
}, 'trained_models/career_model.pth')

# Save CPU-optimized model
print("üíæ Saving CPU-optimized model...")
# Copy the tensors to the CPU instead of calling model.cpu(): that call moves
# the live model itself, which would leave it on the wrong device for any
# later cell that sends inputs to `device`.
cpu_state_dict = model.state_dict()
for tensor_name in list(cpu_state_dict.keys()):
    cpu_state_dict[tensor_name] = cpu_state_dict[tensor_name].cpu()
torch.save({
    'model_state_dict': cpu_state_dict,
    'config': CONFIG,
    'label_encoder': label_encoder,
    'history': history
}, 'trained_models/career_model_cpu.pth')

# Save label encoder
print("üíæ Saving label encoder...")
with open('trained_models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Save embeddings
print("üíæ Saving career embeddings...")
np.save('trained_models/career_embeddings.npy', career_embeddings)

# Save metadata
print("üíæ Saving metadata...")
metadata = {
    'model_config': CONFIG,
    'num_classes': len(label_encoder.classes_),
    'classes': label_encoder.classes_.tolist(),
    'best_val_acc': best_val_acc,
    'training_samples': len(train_df),
    'validation_samples': len(val_df),
    'data_sources': df_merged['source'].value_counts().to_dict()
}

with open('trained_models/model_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

# Create ZIP file
print("üì¶ Creating ZIP archive...")
with zipfile.ZipFile('trained_models.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    # The loop variables deliberately avoid the name `files`, which would
    # overwrite the google.colab `files` module imported earlier.
    for dir_path, _subdirs, artifact_names in os.walk('trained_models'):
        for artifact_name in artifact_names:
            artifact_path = os.path.join(dir_path, artifact_name)
            # Store paths relative to the folder so the archive has no
            # 'trained_models/' prefix.
            zipf.write(artifact_path, os.path.relpath(artifact_path, 'trained_models'))

print("\n‚úÖ All artifacts saved successfully!")
print("\nüìÅ Saved files:")
for artifact_name in os.listdir('trained_models'):
    size = os.path.getsize(os.path.join('trained_models', artifact_name)) / (1024 * 1024)
    print(f"  - {artifact_name}: {size:.2f} MB")

## 10. Download Trained Models

In [None]:
from google.colab import files

# Hand the archive created by the save step to the browser for download.
print("üì• Downloading trained models...")
files.download('trained_models.zip')

print("\n‚úÖ Download complete!")

# Deployment instructions, printed one line per step.
print("\nüìã Next steps:")
for instruction in (
    "  1. Extract 'trained_models.zip'",
    "  2. Move extracted files to 'streamlit_app/models/' directory",
    "  3. Run the Streamlit app: streamlit run app.py",
):
    print(instruction)

## 11. Test Inference

In [None]:
# Smoke-test the trained classifier on a few free-text queries.
# Make sure the model sits on the same device the inputs are sent to; an
# earlier cell may have moved it (e.g. to the CPU while saving).
model.to(device)
model.eval()

test_queries = [
    "I want to work with data and machine learning",
    "I enjoy building websites and user interfaces",
    "I like automating deployment processes",
    "I want to develop backend APIs and databases"
]

# topk() raises when k exceeds the number of classes, so never ask for more
# predictions than there are roles.
top_n = min(3, len(label_encoder.classes_))

print("üß™ Testing model predictions:\n")

for query in test_queries:
    encoding = tokenizer(
        query,
        add_special_tokens=True,
        max_length=CONFIG['max_length'],
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    with torch.no_grad():
        outputs = model(encoding['input_ids'], encoding['attention_mask'])
        probabilities = torch.nn.functional.softmax(outputs, dim=1)
        top_k = torch.topk(probabilities, k=top_n)

    print(f"Query: '{query}'")
    print(f"Top {top_n} predictions:")
    # Row 0 is the single query in the batch.
    for i, (prob, idx) in enumerate(zip(top_k.values[0], top_k.indices[0])):
        career = label_encoder.inverse_transform([idx.item()])[0]
        print(f"  {i+1}. {career}: {prob.item()*100:.2f}%")
    print()

print("‚úÖ Model is ready for deployment!")