In [None]:
import sagemaker
import boto3
import pandas as pd
import os
from sagemaker import image_uris
from sagemaker.inputs import TrainingInput

# 1. Setup Environment
# -------------------
# Fixed S3 locations shared by the steps below.
bucket_processed = 'sagewall-processed-zheng-1b'  # Your processed bucket
prefix = 'model-input'  # key prefix under which the split files are stored

# SageMaker session, the notebook's execution role, and the session's region.
session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = session.boto_region_name

# Echo the resolved environment so a misconfigured notebook is obvious early.
print(f"Region: {region}", f"Role: {role}", sep="\n")

# 2. Prepare Data (Split into Train/Validation)
# ---------------------------------------------
# Pull the processed dataset from S3, split it 80/20, and publish both splits
# as separate CSV objects that are wrapped as training input channels.
print("\nDownloading data from S3...")
s3 = boto3.client('s3')
# Download the clean file we made in Phase 2
s3.download_file(bucket_processed, 'KDDTrain+.txt', 'clean_data.csv')

# Read with Pandas (No headers, as per our Lambda format)
df = pd.read_csv('clean_data.csv', header=None)

# Split: 80% for Training, 20% for Validation (The "Exam").
# random_state pins the sample so reruns produce the same split; dropping the
# sampled index labels leaves exactly the remaining rows for validation.
train_data = df.sample(frac=0.8, random_state=42)
val_data = df.drop(train_data.index)

# Save locally without headers or an index column (SageMaker requirement).
# NOTE(review): the built-in XGBoost CSV input presumably expects the label in
# the first column -- confirm the Phase 2 output is laid out that way.
train_data.to_csv('train.csv', index=False, header=False)
val_data.to_csv('validation.csv', index=False, header=False)

# Upload the split files back to S3.
# upload_data returns the S3 URI of the object it wrote, so the channels below
# use that value instead of rebuilding the path by hand; the two cannot drift
# apart if the bucket, prefix, or filename is changed later.
print("Uploading split datasets to S3...")
train_uri = session.upload_data('train.csv', bucket=bucket_processed, key_prefix=prefix)
val_uri = session.upload_data('validation.csv', bucket=bucket_processed, key_prefix=prefix)

s3_train_input = TrainingInput(s3_data=train_uri, content_type='csv')
s3_val_input = TrainingInput(s3_data=val_uri, content_type='csv')
print("Data preparation complete.")

# 3. Define the XGBoost Model
# ---------------------------
# Look up the AWS-provided XGBoost container image for the current region.
container = image_uris.retrieve(framework="xgboost", region=region, version="1.5-1")

# Create the Estimator (The "Teacher"): one ml.m5.large instance running the
# container above, with model artifacts written under model-output/ in the
# processed bucket.
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=session,
    instance_type='ml.m5.large',  # Standard training instance
    instance_count=1,
    output_path=f's3://{bucket_processed}/model-output/',
)

# Set Hyperparameters (The "Teaching Style").
# Kept in one dict so the full configuration can be inspected in one place.
xgb_hyperparameters = {
    'objective': 'binary:logistic',  # "Is it an Attack? Yes/No"
    'num_round': 50,
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# 4. Start Training
# -----------------
# Announce the launch, then hand both data channels to the estimator.
# The 'train' and 'validation' keys are the channel names given to the job.
print("\nStarting Training Job... (This takes ~3-5 minutes)")
xgb_estimator.fit(
    inputs={
        'train': s3_train_input,
        'validation': s3_val_input,
    },
)