# Section 1: Data Generation

In [1]:
import random
import pandas as pd

class BusinessDescriptionGenerator:
    """Generates synthetic business descriptions for training.

    Each description is produced by filling one of ``self.templates`` with a
    randomly chosen business type, specialty and style drawn from a single
    category of ``self.business_data``. All randomness comes from the global
    ``random`` module, so seed it beforehand for reproducible output.
    """

    # Column order of the frame returned by create_training_data; matches the
    # key order of the dict returned by create_one_description.
    _COLUMNS = ["description", "category", "business_type", "specialty", "style"]

    def __init__(self):
        # category -> vocabulary used to fill the templates below
        self.business_data = {
            "tech": {
                "types": ["app development", "software company", "startup", "SaaS platform"],
                "specialties": ["AI", "mobile apps", "web development", "automation"],
                "style": ["innovative", "cutting-edge", "fast-growing", "scalable"]
            },
            "food": {
                "types": ["coffee shop", "restaurant", "bakery", "food truck"],
                "specialties": ["organic", "vegan", "artisan", "farm-to-table"],
                "style": ["local", "family-owned", "cozy", "fresh"]
            },
            "health": {
                "types": ["yoga studio", "fitness center", "spa", "wellness clinic"],
                "specialties": ["holistic", "therapeutic", "relaxing", "healing"],
                "style": ["peaceful", "professional", "welcoming", "modern"]
            },
            "retail": {
                "types": ["boutique", "online store", "shop", "marketplace"],
                "specialties": ["handmade", "vintage", "designer", "sustainable"],
                "style": ["trendy", "affordable", "unique", "quality"]
            }
        }

        # Placeholders: {type}, {specialty}, {style}. Not every template uses
        # all three (the last one ignores {style}).
        self.templates = [
            "{style} {type} specializing in {specialty}",
            "{specialty} {type} in the {style} district",
            "We run a {style} {type} focused on {specialty}",
            "Local {specialty} {type} with {style} approach",
            "{type} offering {specialty} services"
        ]

    def create_one_description(self, category=None):
        """Create one business description.

        Args:
            category: Key of ``self.business_data``. A falsy value (``None`` or
                ``""``) selects a random category.

        Returns:
            dict with keys ``description``, ``category``, ``business_type``,
            ``specialty`` and ``style``.

        Raises:
            KeyError: if ``category`` is not a key of ``self.business_data``.
        """
        if not category:
            category = random.choice(list(self.business_data.keys()))
        data = self.business_data[category]
        business_type = random.choice(data["types"])
        specialty = random.choice(data["specialties"])
        style = random.choice(data["style"])
        template = random.choice(self.templates)
        description = template.format(
            type=business_type,
            specialty=specialty,
            style=style
        )

        return {
            "description": description,
            "category": category,
            "business_type": business_type,
            "specialty": specialty,
            "style": style
        }

    def create_training_data(self, total_samples=400):
        """Create a training dataset with exactly ``total_samples`` rows.

        Rows are grouped by category in the order of ``self.business_data``.
        When ``total_samples`` is not divisible by the number of categories,
        the remainder is handed out one extra row at a time to the first
        categories, so the requested size is honoured instead of being
        silently rounded down.

        Args:
            total_samples: Number of rows to generate; values below zero are
                treated as zero.

        Returns:
            pandas.DataFrame with columns ``description``, ``category``,
            ``business_type``, ``specialty`` and ``style`` (present even when
            the frame is empty).
        """
        categories = list(self.business_data.keys())
        base_count, leftover = divmod(max(0, int(total_samples)), len(categories))

        all_data = []
        for position, category in enumerate(categories):
            # The first `leftover` categories absorb the remainder.
            category_count = base_count + (1 if position < leftover else 0)
            for _ in range(category_count):
                all_data.append(self.create_one_description(category))

        # Explicit columns keep the schema stable for an empty result.
        return pd.DataFrame(all_data, columns=self._COLUMNS)

In [2]:
class DomainGenerator:
    """Builds candidate domain names from the structured parts of a business record."""

    def __init__(self):
        # Extensions appended to every generated name.
        self.extensions = [".com", ".net", ".org", ".io"]

    def extract_keywords(self, business_data):
        """Return the keywords usable as domain-name stems.

        The business type (spaces removed, lower-cased) always comes first.
        The specialty is added only when it is a single word, and the style
        only when it is one of a short list of generic words.
        """
        compact_type = "".join(business_data["business_type"].split(" ")).lower()
        found = [compact_type]

        specialty_word = business_data["specialty"].lower()
        if " " not in specialty_word:
            found.append(specialty_word)

        style_word = business_data["style"].lower()
        if style_word in ("local", "smart", "quick", "best", "top"):
            found.append(style_word)

        return found

    def create_domains(self, business_data):
        """Return up to three domain suggestions for ``business_data``.

        Suggestions are, in order: the first keyword alone, the first two
        keywords joined (only when a second keyword exists), and the first
        keyword followed by one of "hub", "pro", "co" or "spot". Each one gets
        a randomly chosen extension.
        """
        def random_extension():
            return random.choice(self.extensions)

        stems = self.extract_keywords(business_data)
        if not stems:
            return []

        primary = stems[0]

        # Single keyword + extension.
        suggestions = [primary + random_extension()]

        # Two keywords joined + extension.
        if len(stems) >= 2:
            suggestions.append(primary + stems[1] + random_extension())

        # Keyword + short suffix + extension (suffix is drawn before the extension).
        ending = random.choice(["hub", "pro", "co", "spot"])
        suggestions.append(primary + ending + random_extension())

        return suggestions[:3]

In [3]:
import pandas as pd

def create_complete_training_data(num_samples=400):
    """Create complete training data with descriptions and domains.

    Generates business descriptions with ``BusinessDescriptionGenerator``,
    derives domain suggestions for each one with ``DomainGenerator`` and
    assembles the text the language model is trained on
    (``"Business: <description> Domains: <d1>, <d2>, ..."``).

    Args:
        num_samples: Number of samples requested from the description generator.

    Returns:
        pandas.DataFrame with columns ``business_description``, ``category``,
        ``domains`` (list of str) and ``training_text``. The columns are
        present even when no rows were generated, so the category summary
        below does not fail on an empty result.
    """
    print(f"Creating {num_samples} training samples...")

    desc_generator = BusinessDescriptionGenerator()
    business_df = desc_generator.create_training_data(num_samples)
    domain_generator = DomainGenerator()

    training_data = []
    # Plain dict records avoid the per-row Series construction of iterrows().
    for record in business_df.to_dict("records"):
        domains = domain_generator.create_domains({
            "business_type": record["business_type"],
            "specialty": record["specialty"],
            "style": record["style"]
        })
        training_text = f"Business: {record['description']} Domains: {', '.join(domains)}"
        training_data.append({
            "business_description": record["description"],
            "category": record["category"],
            "domains": domains,
            "training_text": training_text
        })

    # Explicit columns: pd.DataFrame([]) would otherwise have no 'category'
    # column and the value_counts() call below would raise KeyError.
    training_df = pd.DataFrame(
        training_data,
        columns=["business_description", "category", "domains", "training_text"]
    )
    print(f"Created {len(training_df)} training samples")
    print(f"Categories: {training_df['category'].value_counts().to_dict()}")
    return training_df

In [4]:
if __name__ == "__main__":
    import os

    training_file_path = "../data/training_data.csv"
    # NOTE(review): `generator` is not used in this cell; kept in case later
    # cells rely on the name being defined.
    generator = BusinessDescriptionGenerator()
    print("Creating Complete Training Data")
    training_data = create_complete_training_data(1000)

    # Preview at most three examples; head() tolerates shorter frames.
    print("\nTraining Examples:")
    for number, text in enumerate(training_data["training_text"].head(3), start=1):
        print(f"{number}. {text}")

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(training_file_path) or ".", exist_ok=True)
    training_data.to_csv(training_file_path, index=False)
    # Report the path that was actually written.
    print(f"\nSaved training data to {training_file_path}")

Creating Complete Training Data
Creating 1000 training samples...
Created 1000 training samples
Categories: {'tech': 250, 'food': 250, 'health': 250, 'retail': 250}

Training Examples:
1. Business: automation software company in the innovative district Domains: softwarecompany.com, softwarecompanyautomation.io, softwarecompanypro.net
2. Business: startup offering automation services Domains: startup.org, startupautomation.org, startupspot.io
3. Business: We run a fast-growing SaaS platform focused on automation Domains: saasplatform.com, saasplatformautomation.org, saasplatformpro.io

Saved training data to training_data.csv


# Section 2: Model Training

In [5]:
import random

import pandas as pd
import torch

random.seed(42)

from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import Dataset

class DomainModel:
    """Fine-tuned GPT-2 wrapper for domain-name generation.

    Training texts and prompts share the format
    ``"Business: <description> Domains: <d1>, <d2>, ..."``; at inference the
    prompt ends after ``"Domains:"`` and the model's continuation is parsed
    for domain names.
    """

    def __init__(self, model_name="gpt2"):
        """Load tokenizer and model and move the model to the best device.

        Args:
            model_name: Hub name or local directory accepted by
                ``from_pretrained``.
        """
        print(f"Loading {model_name}...")

        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        # The tokenizer is given a pad token by reusing EOS so that
        # padding=True works below.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # torch.backends.mps is looked up defensively so that a torch build
        # without that attribute falls through to CPU instead of raising.
        mps_backend = getattr(torch.backends, "mps", None)
        if torch.cuda.is_available():
            device_name = "cuda"
        elif mps_backend is not None and mps_backend.is_available():
            device_name = "mps"
        else:
            device_name = "cpu"
        self.device = torch.device(device_name)
        print("Working on device: ", self.device)
        self.model.to(self.device)

        # Substrings that cause a request to be rejected (see is_safe_input).
        self.blocked_words = ["adult", "porn", "gambling", "casino", "betting"]

    def prepare_training_data(self, training_df):
        """Convert training data to a tokenized ``datasets.Dataset``.

        Args:
            training_df: DataFrame with a ``training_text`` column.

        Returns:
            Dataset with ``input_ids``, ``attention_mask`` and ``labels``.

        Raises:
            ValueError: if ``training_df`` contains no rows.
        """
        texts = training_df['training_text'].tolist()
        if not texts:
            raise ValueError("training_df contains no training examples")

        tokenized = self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors="pt"
        )

        # NOTE(review): the collator used in train_model (mlm=False) is
        # expected to derive labels from input_ids itself; confirm before
        # relying on this 'labels' column.
        dataset = Dataset.from_dict({
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
            'labels': tokenized['input_ids']
        })

        print(f"Prepared {len(dataset)} training examples")
        return dataset

    def train_model(self, training_dataset, output_dir="../models/domain_model", epochs=3):
        """Fine-tune the model and save model and tokenizer to ``output_dir``.

        Args:
            training_dataset: Dataset produced by ``prepare_training_data``.
            output_dir: Directory for checkpoints and the final model.
            epochs: Number of training epochs.

        Returns:
            ``output_dir``.
        """
        print(f"Starting training for {epochs} epochs...")

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=2,
            save_steps=500,
            save_total_limit=2,
            logging_steps=100,
            remove_unused_columns=False,
            dataloader_drop_last=True,
            learning_rate=5e-5
        )

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=training_dataset,
            data_collator=data_collator,
        )

        trainer.train()

        trainer.save_model()
        self.tokenizer.save_pretrained(output_dir)

        print(f"Training complete! Model saved to {output_dir}")
        return output_dir

    def is_safe_input(self, text):
        """Return False when ``text`` contains any blocked word (case-insensitive substring match)."""
        text_lower = text.lower()
        return not any(word in text_lower for word in self.blocked_words)

    def generate_domains(self, business_description):
        """Generate domain suggestions for a business.

        Args:
            business_description: Free-text description of the business.

        Returns:
            dict with keys ``domains`` (list of str, at most three),
            ``status`` (``"success"``, ``"blocked"`` or ``"error"``) and
            ``message``. Invalid input and generation failures are reported
            through this dict rather than raised.
        """
        # Reject non-string input up front; .lower()/.strip() below would
        # otherwise raise outside the try block.
        if not isinstance(business_description, str):
            return {
                "domains": [],
                "status": "error",
                "message": "Business description must be a string"
            }

        if not self.is_safe_input(business_description):
            return {
                "domains": [],
                "status": "blocked",
                "message": "Request contains inappropriate content"
            }

        if not business_description.strip():
            return {
                "domains": [],
                "status": "error",
                "message": "Business description cannot be empty"
            }

        try:
            # Strip so the prompt has the same spacing as the training texts.
            prompt = f"Business: {business_description.strip()} Domains:"

            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Trainer.train() leaves the model in training mode; switch to
            # eval so dropout does not perturb generation.
            self.model.eval()
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_new_tokens=50,
                    temperature=0.7,
                    do_sample=True,
                    num_return_sequences=1,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            # Decode only the continuation: the returned sequence starts with
            # the prompt tokens, and the description itself may contain text
            # that looks like a domain.
            prompt_length = inputs['input_ids'].shape[1]
            completion = self.tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            )
            # If the model keeps going and invents another "Business: ..."
            # example, only the part answering our prompt is relevant.
            completion = completion.split("Business:", 1)[0]

            domains = self.extract_domains(completion)

            if domains:
                return {
                    "domains": domains,
                    "status": "success",
                    "message": f"Generated {len(domains)} domain suggestions"
                }
            else:
                return {
                    "domains": [],
                    "status": "error",
                    "message": "Could not generate valid domains"
                }

        except Exception as e:
            return {
                "domains": [],
                "status": "error",
                "message": f"Generation failed: {str(e)}"
            }

    def extract_domains(self, generated_text):
        """Extract up to three unique domain names from generated text.

        If the text contains ``"Domains:"``, only the part after its last
        occurrence is searched. A domain is a label of letters, digits and
        inner hyphens (3-25 characters) followed by ``.com``, ``.net``,
        ``.org``, ``.io`` or ``.co``. Results are lower-cased and returned in
        order of first appearance.
        """
        import re

        if "Domains:" in generated_text:
            domains_part = generated_text.split("Domains:")[-1].strip()
        else:
            domains_part = generated_text

        # - the label must start and end with an alphanumeric character
        #   (hyphens are only valid inside a label);
        # - the negative lookahead stops a longer word from being truncated
        #   to a known extension ("startup.community" is not "startup.com").
        domain_pattern = (
            r'([a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?'
            r'\.(?:com|net|org|io|co))(?![a-zA-Z0-9-])'
        )
        domains = re.findall(domain_pattern, domains_part)

        clean_domains = []
        for domain in domains:
            clean_domain = domain.lower().strip()
            if (3 <= len(clean_domain.split('.')[0]) <= 25 and
                    clean_domain not in clean_domains):
                clean_domains.append(clean_domain)

                if len(clean_domains) >= 3:
                    break

        return clean_domains

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def train_domain_model(training_data_file="../data/training_data.csv",
                       output_dir="../models/domain_model",
                       epochs=3):
    """Run the complete training pipeline.

    Loads the training CSV, fine-tunes a fresh ``DomainModel("gpt2")`` on its
    ``training_text`` column, saves the result and prints a few sample
    generations as a smoke test.

    Args:
        training_data_file: Path of the CSV to train on.
        output_dir: Directory the fine-tuned model and tokenizer are saved to.
            The default matches ``DomainModel.train_model``'s own default.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(model, model_path)`` with the trained ``DomainModel`` and the
        directory it was saved to.

    Raises:
        FileNotFoundError: if ``training_data_file`` does not exist (raised by
            ``pd.read_csv``).
    """
    print("Domain Model Training Pipeline\n")

    print("Loading training data...")
    training_df = pd.read_csv(training_data_file)
    print(f"Loaded {len(training_df)} training examples")

    print("\nInitializing model...")
    model = DomainModel("gpt2")

    print("\nPreparing training data...")
    training_dataset = model.prepare_training_data(training_df)

    print("\nTraining model...")
    model_path = model.train_model(training_dataset, output_dir=output_dir, epochs=epochs)

    print("\nTesting trained model...")

    # One prompt per style of description seen in training.
    test_cases = [
        "organic coffee shop downtown",
        "innovative AI startup",
        "local yoga studio"
    ]

    for test_case in test_cases:
        result = model.generate_domains(test_case)
        print(f"Business: {test_case}")
        print(f"Domains: {result['domains']}")
        print(f"Status: {result['status']}")
        print("---")

    return model, model_path

In [7]:
def load_trained_model(model_path="../models_old/domain_model"):
    """Load a previously trained model.

    Builds the ``DomainModel`` directly from ``model_path`` so that the
    fine-tuned weights go through the same setup as a fresh model (pad token
    assignment and transfer to ``model.device``). Replacing ``model.model``
    after construction would leave the reloaded weights on the CPU while
    ``generate_domains`` moves its inputs to ``model.device``.

    Args:
        model_path: Directory containing the saved model and tokenizer.
            NOTE(review): the default differs from ``train_model``'s default
            output directory ("../models/domain_model") — confirm it is
            intentional.

    Returns:
        A ``DomainModel`` ready for ``generate_domains``.
    """
    print(f"Loading trained model from {model_path}...")

    model = DomainModel(model_path)
    # Make sure dropout is disabled for inference.
    model.model.eval()
    print("Model loaded successfully!")
    return model

In [8]:
if __name__ == "__main__":
    import traceback

    training_file_path = "../data/training_data.csv"
    # NOTE(review): not passed to train_domain_model in this cell; the model is
    # saved to the path reported in the summary below.
    model_output_path = "../models/domain_model"

    try:
        trained_model, model_path = train_domain_model(training_file_path)

        print("\nTraining Summary:")
        print("- Model type: GPT-2 fine-tuned")
        print("- Training examples: Check training_data.csv")
        print(f"- Model saved to: {model_path}")
        print("- Ready for evaluation!")

    except FileNotFoundError as e:
        print("training_data.csv not found!")
        print("Run the data generation script first to create training data.")
        # Show which path was actually missing; it may not be the CSV.
        print(f"Details: {e}")

    except Exception as e:
        print(f"Training failed: {e}")
        print("Check your Python environment and GPU/CPU setup.")
        # Keep the traceback so the failure can be diagnosed.
        traceback.print_exc()

Domain Model Training Pipeline

Loading training data...
Loaded 1000 training examples

Initializing model...
Loading gpt2...
Working on device:  mps

Preparing training data...
Prepared 1000 training examples

Training model...
Starting training for 3 epochs...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,3.3267
200,1.8907
300,1.4808
400,1.2605
500,1.048
600,0.8387
700,0.7804
800,0.7092
900,0.683
1000,0.6399




Training complete! Model saved to ../models/domain_model

Testing trained model...
Business: organic coffee shop downtown
Domains: ['coffeeshop.org', 'coffeeshophub.net', 'coffeeshopco.io']
Status: success
---
Business: innovative AI startup
Domains: ['startup.com', 'startupai.io', 'startuphub.io']
Status: success
---
Business: local yoga studio
Domains: ['yogastudio.net', 'yogastudiotherapeutic.io', 'yogastudioco.net']
Status: success
---

Training Summary:
- Model type: GPT-2 fine-tuned
- Training examples: Check training_data.csv
- Model saved to: ../models/domain_model
- Ready for evaluation!
