# 🧬 Quark Brain Simulation - Kaggle DNA Classification & Consciousness Integration

This notebook trains the Quark consciousness agent using a DNA classification dataset from Kaggle.
It leverages Kaggle's free GPU resources and integrates with the main consciousness agent for enhanced biological modeling.

## Features:
- **DNA Sequence Analysis**: Trains models on genomic data to inform GRN and molecular components
- **GPU Acceleration**: Utilizes Kaggle's free Tesla T4/P100 GPUs
- **Consciousness Integration**: Connects DNA-level insights to the main consciousness agent
- **Biological Validation**: Enhances the biological plausibility of the simulation
- **End-to-End Pipeline**: From data loading to model training and consciousness correlation

---


In [None]:
# Setup and Environment Configuration
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from datetime import datetime
import json
import warnings

# Suppress every Python warning for the remainder of the session. This also
# hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Setup matplotlib
# NOTE(review): 'seaborn-v0_8' is the style-sheet name in newer matplotlib
# releases; older releases may only know it as 'seaborn' - confirm the runtime.
plt.style.use('seaborn-v0_8')
# IPython magic (render figures inline). Only valid inside a notebook kernel;
# a plain Python interpreter rejects this line.
%matplotlib inline

# Startup banner: run timestamp and whether PyTorch can see a CUDA device.
print("🧬 Quark Brain Simulation - Kaggle DNA Classification Training")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Index 0 is the first visible CUDA device.
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")

# Configure paths for Kaggle environment
# KAGGLE_INPUT is defined here but is not read by any cell visible in this notebook.
KAGGLE_INPUT = "/kaggle/input"
# Directory the consciousness-integration log is written into.
KAGGLE_WORKING = "/kaggle/working"
# NOTE(review): this is a relative path, not a location under KAGGLE_INPUT -
# confirm it resolves from the notebook's working directory.
DATASET_PATH = "../database/kaggle_integration/datasets/dna-classification-dataset" 
# Timestamp-based identifier for this training run.
SESSION_ID = f"kaggle_dna_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"


In [None]:
# DNA Data Loading and Preprocessing
class KaggleDNADataset:
    """Load and preprocess the DNA classification dataset.

    Typical use::

        dataset = KaggleDNADataset(path)
        df = dataset.load_data()          # DataFrame, or None on failure
        if df is not None:
            X, y = dataset.preprocess(df)
    """

    # File names tried, in order, inside ``dataset_path``.
    DATA_FILE_CANDIDATES = ("human.txt", "dna_data.csv")
    # Symbol used to right-pad sequences shorter than the longest one.
    PAD_CHAR = "N"

    def __init__(self, dataset_path):
        """Remember the dataset directory and choose the tensor device.

        Args:
            dataset_path: Directory expected to contain the data file.
        """
        self.dataset_path = dataset_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.label_encoder = LabelEncoder()
        # Retained so code that reads this attribute keeps working. preprocess()
        # does not use it: per-column category inference produces a different
        # number of output columns for positions that lack some symbols.
        self.onehot_encoder = OneHotEncoder(sparse_output=False, categories='auto')

    def _find_data_file(self):
        """Return the first existing candidate data file, or None."""
        for file_name in self.DATA_FILE_CANDIDATES:
            candidate = os.path.join(self.dataset_path, file_name)
            if os.path.exists(candidate):
                return candidate
        return None

    def load_data(self):
        """Load the DNA sequence table.

        Returns:
            A DataFrame with columns ``sequence`` and ``label``, or ``None``
            when no data file exists or it cannot be read. Failure is always
            the single value ``None`` so callers can guard with
            ``df is not None``.
        """
        try:
            data_file = self._find_data_file()
            if data_file is None:
                print(f"Error: Could not find data file in {self.dataset_path}")
                return None

            # This dataset seems to be space-separated values
            df = pd.read_csv(data_file, sep=' ', header=None, names=['sequence', 'label'])
        except Exception as e:
            # Best-effort loader: report the problem and signal failure.
            print(f"Error loading data: {e}")
            return None

        print(f"✅ Loaded {len(df)} DNA sequences.")
        print(df.head())
        return df

    @staticmethod
    def _one_hot_encode(sequences, pad_char="N"):
        """One-hot encode sequences over one alphabet shared by all positions.

        Args:
            sequences: Non-empty iterable of strings.
            pad_char: Symbol appended to shorter sequences.

        Returns:
            A contiguous float32 array of shape
            ``(n_sequences, n_symbols, max_len)``. Symbols are ordered as
            ``numpy.unique`` sorts them; the pad symbol is one of the channels
            whenever padding was needed.

        Raises:
            ValueError: If there are no sequences or all of them are empty.
        """
        sequences = list(sequences)
        if not sequences:
            raise ValueError("Cannot encode an empty collection of DNA sequences.")
        max_len = max(len(s) for s in sequences)
        if max_len == 0:
            raise ValueError("Cannot encode DNA sequences that are all empty.")

        # Pad sequences to the same length, one character per cell.
        padded = np.array([list(s.ljust(max_len, pad_char)) for s in sequences])

        # Map every character to its index in the global alphabet. The reshape
        # keeps this correct across NumPy versions that flatten the inverse.
        alphabet, codes = np.unique(padded, return_inverse=True)
        codes = codes.reshape(padded.shape)

        # Row lookup in an identity matrix: (n, max_len) -> (n, max_len, n_symbols).
        onehot = np.eye(len(alphabet), dtype=np.float32)[codes]

        # Conv1d expects (batch, channels, length).
        return np.ascontiguousarray(onehot.transpose(0, 2, 1))

    def preprocess(self, df):
        """Preprocess DNA sequences for model training.

        Adds a ``label_encoded`` column to ``df`` (in place) and builds the
        model tensors on ``self.device``.

        Args:
            df: DataFrame with ``sequence`` and ``label`` columns.

        Returns:
            ``(X, y)`` where ``X`` is a float tensor of shape
            ``(n_sequences, n_symbols, max_len)`` and ``y`` is a long tensor of
            encoded labels with shape ``(n_sequences,)``.

        Raises:
            ValueError: If ``df`` has no rows or only empty sequences.
        """
        print("🔬 Preprocessing DNA sequences...")
        if df.empty:
            raise ValueError("Cannot preprocess an empty DNA dataset.")

        # 1. Encode labels
        df['label_encoded'] = self.label_encoder.fit_transform(df['label'])

        # 2. One-hot encode DNA sequences (each character is a category)
        onehot_encoded = self._one_hot_encode(df['sequence'].values, self.PAD_CHAR)

        X = torch.from_numpy(onehot_encoded).to(self.device)
        y = torch.LongTensor(df['label_encoded'].values).to(self.device)

        print(f"✅ Preprocessing complete.")
        print(f"  - Input shape (X): {X.shape}")
        print(f"  - Target shape (y): {y.shape}")

        return X, y

# Load and preprocess data
dna_dataset = KaggleDNADataset(DATASET_PATH)
df_dna = dna_dataset.load_data()
# Anything other than a non-empty DataFrame (None, a tuple of Nones, an empty
# table) means loading failed. Normalise it to None so this guard and every
# later `df_dna is not None` check skip cleanly instead of crashing.
if not isinstance(df_dna, pd.DataFrame) or df_dna.empty:
    df_dna = None
if df_dna is not None:
    X_dna, y_dna = dna_dataset.preprocess(df_dna)


In [None]:
# DNA Classification Model (CNN-based)
class DNA_CNN(nn.Module):
    """Convolutional Neural Network for DNA sequence classification.

    Two Conv1d -> BatchNorm -> ReLU -> MaxPool stages are followed by an
    adaptive max-pool that reduces the length axis to ``pooled_len`` positions,
    then a two-layer dense head.

    The adaptive pool is what lets the dense head be sized in the constructor:
    every parameter exists as soon as the model is built, so an optimizer
    created from ``model.parameters()`` before the first forward pass trains
    the whole network, and the same model accepts batches of different
    sequence lengths.
    """

    def __init__(self, input_channels, num_classes, pooled_len=32):
        """Build all layers.

        Args:
            input_channels: Number of one-hot channels per sequence position.
            num_classes: Number of output logits.
            pooled_len: Number of positions kept by the adaptive pool; the
                dense head sees ``64 * pooled_len`` features.

        Raises:
            ValueError: If ``pooled_len`` is not a positive integer.
        """
        super().__init__()

        if not isinstance(pooled_len, int) or pooled_len < 1:
            raise ValueError(f"pooled_len must be a positive integer, got {pooled_len!r}")

        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=32, kernel_size=5, padding=2)
        self.bn1 = nn.BatchNorm1d(32)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=5, padding=2)
        self.bn2 = nn.BatchNorm1d(64)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

        # Fixed-size summary of the (input-dependent) length axis.
        self.adaptive_pool = nn.AdaptiveMaxPool1d(pooled_len)
        self.flatten = nn.Flatten()

        # 64 channels leave conv2; pooled_len positions leave the adaptive pool.
        self.fc1 = nn.Linear(64 * pooled_len, 128)
        self.relu3 = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return class logits for a batch.

        Args:
            x: Float tensor of shape ``(batch, input_channels, length)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` (unnormalised logits).
        """
        x = self.pool1(self.relu1(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu2(self.bn2(self.conv2(x))))

        x = self.flatten(self.adaptive_pool(x))

        x = self.relu3(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Build the classifier - skipped entirely when no dataset was loaded.
if df_dna is not None:
    # Axis 1 of the encoded input is what the first convolution consumes.
    input_channels = X_dna.shape[1]
    # One output logit per distinct encoded label.
    num_classes = int(np.unique(y_dna.cpu()).size)
    dna_model = DNA_CNN(input_channels, num_classes)
    dna_model = dna_model.to(dna_dataset.device)
    print("🧠 DNA Classification Model (CNN) Initialized")
    print(dna_model)


In [None]:
# Training Pipeline for DNA Model
class KaggleDNATrainer:
    """Trainer for the DNA Classification model on Kaggle.

    Splits the data into stratified train/test subsets, wraps them in
    DataLoaders and runs a standard cross-entropy training loop with Adam.
    """

    def __init__(self, model, X, y, test_size=0.2, batch_size=64, lr=0.001, random_state=42):
        """Prepare data loaders, loss and optimizer.

        Args:
            model: The ``nn.Module`` to train; must have at least one parameter.
            X: Input tensor whose first axis indexes samples.
            y: Integer label tensor aligned with ``X``.
            test_size: Fraction of samples held out for evaluation.
            batch_size: Batch size for both loaders.
            lr: Adam learning rate.
            random_state: Seed for the train/test split.
        """
        self.model = model
        self.device = next(model.parameters()).device
        self.X = X
        self.y = y
        self.metrics = []

        # Split row indices, not the tensors: scikit-learn then only sees
        # NumPy arrays, and the tensors stay on whatever device they are on.
        labels_np = y.detach().cpu().numpy()
        train_idx, test_idx = train_test_split(
            np.arange(len(labels_np)),
            test_size=test_size,
            random_state=random_state,
            stratify=labels_np,
        )
        self.X_train, self.y_train = self._take(X, train_idx), self._take(y, train_idx)
        self.X_test, self.y_test = self._take(X, test_idx), self._take(y, test_idx)

        self.train_dataset = TensorDataset(self.X_train, self.y_train)
        self.test_dataset = TensorDataset(self.X_test, self.y_test)

        self.train_loader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True)
        self.test_loader = DataLoader(self.test_dataset, batch_size=batch_size, shuffle=False)

        # Must happen before the optimizer is built: a layer the model creates
        # during its first forward pass would otherwise never be optimised.
        self._materialize_lazy_layers()

        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)

    @staticmethod
    def _take(tensor, indices):
        """Select rows of ``tensor`` by integer positions, on the tensor's device."""
        index = torch.as_tensor(indices, dtype=torch.long, device=tensor.device)
        return tensor[index]

    def _materialize_lazy_layers(self):
        """Run one sample through the model so forward-time layers exist."""
        if len(self.X_train) == 0:
            return
        was_training = self.model.training
        # eval + no_grad: no BatchNorm statistics update, no dropout, no graph.
        self.model.eval()
        with torch.no_grad():
            self.model(self.X_train[:1].to(self.device))
        self.model.train(was_training)

    def _train_one_epoch(self):
        """Run one optimisation pass; return the mean per-batch training loss."""
        self.model.train()
        total_loss = 0.0
        for sequences, labels in self.train_loader:
            sequences, labels = sequences.to(self.device), labels.to(self.device)
            self.optimizer.zero_grad()
            loss = self.criterion(self.model(sequences), labels)
            loss.backward()
            self.optimizer.step()
            total_loss += loss.item()
        n_batches = len(self.train_loader)
        return total_loss / n_batches if n_batches else 0.0

    def _evaluate(self):
        """Return ``(mean per-batch test loss, accuracy in percent)``."""
        self.model.eval()
        total_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for sequences, labels in self.test_loader:
                sequences, labels = sequences.to(self.device), labels.to(self.device)
                outputs = self.model(sequences)
                total_loss += self.criterion(outputs, labels).item()
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        n_batches = len(self.test_loader)
        avg_loss = total_loss / n_batches if n_batches else 0.0
        accuracy = 100 * correct / total if total else 0.0
        return avg_loss, accuracy

    def train(self, epochs=25):
        """Train for ``epochs`` epochs, evaluating after each one.

        Args:
            epochs: Number of passes over the training loader.

        Returns:
            ``self.metrics``: one dict per epoch with keys ``epoch``
            (0-based), ``train_loss``, ``test_loss`` and ``accuracy``
            (percent). Entries accumulate across repeated calls.
        """
        print("🚀 Starting DNA model training...")
        for epoch in range(epochs):
            avg_train_loss = self._train_one_epoch()
            avg_test_loss, accuracy = self._evaluate()

            self.metrics.append({
                'epoch': epoch,
                'train_loss': avg_train_loss,
                'test_loss': avg_test_loss,
                'accuracy': accuracy
            })

            print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Test Loss: {avg_test_loss:.4f}, Accuracy: {accuracy:.2f}%")

        print("🎉 Training finished!")
        return self.metrics

# Train the model
if df_dna is not None:
    dna_trainer = KaggleDNATrainer(dna_model, X_dna, y_dna)
    training_metrics = dna_trainer.train()


In [None]:
# Results Visualization and Consciousness Integration
def visualize_results(metrics):
    """Show loss and accuracy curves side by side.

    Args:
        metrics: Iterable of per-epoch dicts carrying the keys ``epoch``,
            ``train_loss``, ``test_loss`` and ``accuracy``.
    """
    history = pd.DataFrame(metrics)
    epoch_axis = history['epoch']

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: training and test loss on shared axes.
    loss_curves = (('train_loss', 'Train Loss'), ('test_loss', 'Test Loss'))
    for column, legend_name in loss_curves:
        loss_ax.plot(epoch_axis, history[column], label=legend_name)
    loss_ax.set_title('Model Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    # Right panel: test accuracy in percent.
    acc_ax.plot(epoch_axis, history['accuracy'], label='Accuracy', color='green')
    acc_ax.set_title('Model Accuracy')
    acc_ax.set_xlabel('Epochs')
    acc_ax.set_ylabel('Accuracy (%)')
    acc_ax.legend()

    plt.show()

def get_predictions(model, loader):
    """Collect true labels and predicted classes over every batch of ``loader``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Callable module mapping a batch to per-class scores.
        loader: Iterable yielding ``(inputs, labels)`` tensor pairs.

    Returns:
        ``(true_labels, predicted_labels)``: two flat lists of NumPy integer
        scalars, in loader order.
    """
    model.eval()
    truth, guesses = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            scores = model(batch_inputs)
            # Index of the highest score along the class axis.
            winners = torch.max(scores.data, 1).indices
            guesses.extend(winners.cpu().numpy())
            truth.extend(batch_labels.cpu().numpy())
    return truth, guesses

# Report results only when a trainer object exists in this namespace,
# i.e. the training cell ran and data loading succeeded.
if 'dna_trainer' in locals():
    # Loss and accuracy curves from the per-epoch metrics list.
    visualize_results(training_metrics)
    
    # Classification Report and Confusion Matrix
    # Both are computed on the trainer's held-out test loader.
    labels, preds = get_predictions(dna_trainer.model, dna_trainer.test_loader)
    print("\nClassification Report:")
    print(classification_report(labels, preds))
    
    print("\nConfusion Matrix:")
    # Computed from the same (labels, preds) pair as the report above.
    cm = confusion_matrix(labels, preds)
    plt.figure(figsize=(8, 6))
    # fmt='d' prints the raw integer counts inside each cell.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# --- Consciousness Integration ---
def integrate_with_consciousness_agent(dna_model_accuracy, log_dir=None):
    """Simulate integrating the DNA model's performance with the consciousness agent.

    Builds a metrics record, prints it, and appends it as one JSON object per
    line to ``consciousness_integration_log.json`` inside ``log_dir``.

    Args:
        dna_model_accuracy: Accuracy value to report; must be JSON-serialisable.
        log_dir: Directory for the log file. Defaults to ``KAGGLE_WORKING``.
            It is created if it does not exist, so the function also works
            outside a Kaggle session.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    if log_dir is None:
        log_dir = KAGGLE_WORKING

    print("\n--- 🧩 Integrating with Consciousness Agent ---")

    # This is a conceptual integration. In a real scenario, you'd use the agent connector.
    consciousness_metrics = {
        'source': 'KaggleDNAModel',
        'timestamp': datetime.now().isoformat(),
        'integration_type': 'genomic_foundation',
        'key_metric': 'dna_classification_accuracy',
        'value': dna_model_accuracy,
        'impact_on_consciousness': 'enhanced_biological_plausibility',
        'affected_modules': ['molecular_geneticist', 'developmental_neurobiologist']
    }

    print("Metrics to be sent to Consciousness Agent:")
    print(json.dumps(consciousness_metrics, indent=2))

    # Simulate saving this to a shared database/log.
    # Append mode: every call adds one line, earlier records are kept.
    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, 'consciousness_integration_log.json')
    with open(log_path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(consciousness_metrics) + '\n')

    print(f"\n✅ Integration metrics logged for consciousness agent.")
    print("The consciousness agent can now use this genomic insight to refine its simulation.")


# Forward the accuracy of the last recorded epoch to the integration step.
# The guard skips this when no training metrics exist in this namespace.
if 'training_metrics' in locals():
    # The last list entry is the most recent epoch's metrics dict.
    final_accuracy = training_metrics[-1]['accuracy']
    integrate_with_consciousness_agent(final_accuracy)
