# titanic_stage2



## Startup cells

In [0]:
# Set environment variables for sagemaker_studio imports

import os
# Publish the DataZone identifiers through the process environment in one call.
os.environ.update(
    {
        "DataZoneProjectId": "3kgm8fdyzyjqqv",
        "DataZoneDomainId": "dzd-5slh6mlvrck50n",
        "DataZoneEnvironmentId": "3h50yb0v8w0y3r",
        "DataZoneDomainRegion": "us-east-1",
    }
)

# Metadata is exposed twice: through an accessor function and through the
# module-level `metadata` variable assigned below.
_resource_metadata = None


def _get_resource_metadata():
    """Return the resource metadata dict, building and caching it on first use."""
    global _resource_metadata
    if _resource_metadata is not None:
        return _resource_metadata

    additional = {
        "DataZoneProjectId": "3kgm8fdyzyjqqv",
        "DataZoneDomainId": "dzd-5slh6mlvrck50n",
        "DataZoneEnvironmentId": "3h50yb0v8w0y3r",
        "DataZoneDomainRegion": "us-east-1",
    }
    _resource_metadata = {"AdditionalMetadata": additional}
    return _resource_metadata


metadata = _get_resource_metadata()

In [0]:
"""
Logging Configuration

Purpose:
--------
This sets up the logging framework for code executed in the user namespace.
"""

from typing import Optional


def _set_logging(log_dir: str, log_file: str, log_name: Optional[str] = None):
    import os
    import logging
    from logging.handlers import RotatingFileHandler

    level = logging.INFO
    max_bytes = 5 * 1024 * 1024
    backup_count = 5

    # fallback to /tmp dir on access, helpful for local dev setup
    try:
        os.makedirs(log_dir, exist_ok=True)
    except Exception:
        log_dir = "/tmp/kernels/"

    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger() if not log_name else logging.getLogger(log_name)
    logger.handlers = []
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Rotating file handler
    fh = RotatingFileHandler(filename=log_path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info(f"Logging initialized for {log_name}.")


# No logger name is passed in the first call, so it configures the root logger
# (kernel.log); the second call configures the logger named "metrics" (metrics.log).
_set_logging("/var/log/computeEnvironments/kernel/", "kernel.log")
_set_logging("/var/log/studio/data-notebook-kernel-server/", "metrics.log", "metrics")

In [0]:
import logging
from sagemaker_studio import ClientConfig, sqlutils, sparkutils, dataframeutils

# Module-level logger for this cell; the info lines bracket the sparkutils call.
logger = logging.getLogger(__name__)
logger.info("Initializing sparkutils")
# Keep the object returned by sparkutils.init() as `spark`; the shutdown cell
# at the end of the notebook calls .stop() on it.
spark = sparkutils.init()
logger.info("Finished initializing sparkutils")

In [0]:
def _reset_os_path():
    """
    Reset the process's working directory to handle mount timing issues.
    
    This function resolves a race condition where the Python process starts
    before the filesystem mount is complete, causing the process to reference
    old mount paths and inodes. By explicitly changing to the mounted directory
    (/home/sagemaker-user), we ensure the process uses the correct, up-to-date
    mount point.
    
    The function logs stat information (device ID and inode) before and after
    the directory change to verify that the working directory is properly
    updated to reference the new mount.
    
    Note:
        This is executed at module import time to ensure the fix is applied
        as early as possible in the kernel initialization process.
    """
    try:
        import os
        import logging

        logger = logging.getLogger(__name__)
        logger.info("---------Before------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)

        os.chdir("/home/sagemaker-user")

        logger.info("---------After------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)
    except Exception as e:
        logger.exception(f"Failed to reset working directory: {e}")

_reset_os_path()

## Notebook

In [0]:
import sagemaker
import boto3

# The compute label is a fixed string typed into the cell; it is not read from
# the runtime.
print("Compute: sc.t3.medium")
print("SageMaker version:", sagemaker.__version__)
# Region is taken from a default boto3 session, the role from the SageMaker SDK.
print("Region:", boto3.Session().region_name)
print("Execution role:", sagemaker.get_execution_role())


sagemaker.config INFO - Fetched defaults config from location: /etc/xdg/sagemaker/config.yaml


sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Compute: sc.t3.medium
SageMaker version: 2.254.1
Region: us-east-1


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Execution role: arn:aws:iam::547868852858:role/service-role/AmazonSageMakerAdminIAMExecutionRole


In [0]:
import pandas as pd

# Source CSV hosted on GitHub; pd.read_csv fetches it directly from the URL.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Last expression of the cell: display the first rows of the loaded frame.
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
# df.info() prints column dtypes and non-null counts; the per-column NaN totals
# from isna().sum() are the cell's displayed value.
df.info()
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [0]:
# Write the raw frame to the session's default bucket; the destination URI is
# built once and reused for both the write and the confirmation message.
bucket = sagemaker.Session().default_bucket()
key = "datasets/titanic.csv"
s3_uri = f"s3://{bucket}/{key}"
df.to_csv(s3_uri, index=False)
print("Saved to:", s3_uri)


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Saved to: s3://amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv/datasets/titanic.csv


In [0]:
# Display the first rows of `df` again (the same frame loaded earlier).
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
import pandas as pd

# Pick a small set of columns that are simple and useful for classification.
cols = ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
data = df.loc[:, cols].copy()

# Handle missing values: median for Age, most frequent value for Embarked.
data = data.fillna(
    {
        "Age": data["Age"].median(),
        "Embarked": data["Embarked"].mode()[0],
    }
)

data.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [0]:
# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each so only the remaining levels become indicator columns.
# NOTE(review): `data` is rebound to the encoded frame, so re-running this cell
# without re-running the previous one presumably fails because "Sex" and
# "Embarked" are no longer columns — confirm.
data = pd.get_dummies(data, columns=["Sex", "Embarked"], drop_first=True)
data.head()


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,True,False,True
1,1,1,38.0,1,0,71.2833,False,False,False
2,1,3,26.0,0,0,7.925,False,False,True
3,1,1,35.0,1,0,53.1,False,False,True
4,0,3,35.0,0,0,8.05,True,False,True


In [0]:
from sklearn.model_selection import train_test_split

# Target first, then every remaining column as a feature.
y = data["Survived"]
X = data.drop(columns="Survived")

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

X_train.shape, X_test.shape


KeyboardInterrupt: 

In [0]:
# Write each split to a CSV in the working directory, without the index column.
for csv_name, split in (
    ("X_train.csv", X_train),
    ("X_test.csv", X_test),
    ("y_train.csv", y_train),
    ("y_test.csv", y_test),
):
    split.to_csv(csv_name, index=False)

print("Saved local CSV files.")


Saved local CSV files.


In [0]:
# Display the (rows, columns) shape of the training features.
X_train.shape

(712, 8)

In [0]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

print("Model training completed.")


Model training completed.


In [0]:
# Predictions for the held-out features; the metrics cell compares them with y_test.
y_pred = model.predict(X_test)


In [0]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Overall accuracy on the held-out split.
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", acc)

# Heading and per-class report emitted in a single print; sep="\n" reproduces
# the line break that two separate print calls would produce.
print("\nClassification Report:\n", classification_report(y_test, y_pred), sep="\n")


Accuracy: 0.8044692737430168

Classification Report:

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



In [0]:
import joblib

# Serialize the fitted estimator to a file in the working directory; later
# cells upload this file and package it into model.tar.gz.
joblib.dump(model, "logistic_model.joblib")
print("Model saved locally.")


Model saved locally.


In [0]:
import sagemaker

session = sagemaker.Session()
bucket = session.default_bucket()
model_key = "model-artifacts/logistic_model.joblib"

# upload_data returns the S3 URI of the object it wrote. Report that value
# instead of rebuilding the URI by hand from a separately typed key, so the
# message cannot drift from the real upload location (the other upload cells
# in this notebook print the returned URI as well).
model_uri = session.upload_data(
    path="logistic_model.joblib",
    bucket=bucket,
    key_prefix="model-artifacts"
)

print("Uploaded to:", model_uri)


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Uploaded to: s3://amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv/model-artifacts/logistic_model.joblib


In [0]:
import sagemaker

session = sagemaker.Session()
bucket = session.default_bucket()

prefix = "titanic-sklearn"

# Upload the four split files, in the same order, under "<prefix>/data" and
# unpack the S3 URIs that upload_data returns.
train_s3, test_s3, ytrain_s3, ytest_s3 = [
    session.upload_data(csv_name, bucket=bucket, key_prefix=f"{prefix}/data")
    for csv_name in ("X_train.csv", "X_test.csv", "y_train.csv", "y_test.csv")
]

print("Train:", train_s3)
print("Test :", test_s3)
print("yTrain:", ytrain_s3)
print("yTest :", ytest_s3)


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Train: s3://amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv/titanic-sklearn/data/X_train.csv
Test : s3://amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv/titanic-sklearn/data/X_test.csv
yTrain: s3://amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv/titanic-sklearn/data/y_train.csv
yTest : s3://amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv/titanic-sklearn/data/y_test.csv


In [0]:
# Source of train.py, kept as a raw string and written to disk below.
# NOTE(review): the /opt/ml/... defaults and SM_MODEL_DIR suggest it is meant as
# a SageMaker training entry point, but no visible cell launches it — confirm.
train_script = r"""
import os
import argparse
import glob
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def first_csv(path):
    # Return one CSV file from `path`. The matches are sorted because
    # glob.glob yields files in arbitrary order, which would make the choice
    # non-reproducible whenever a directory holds more than one CSV.
    files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not files:
        raise FileNotFoundError(f"No CSV files found in: {path}")
    return files[0]

def parse_args():
    # Input directories and the two LogisticRegression hyperparameters.
    p = argparse.ArgumentParser()
    p.add_argument("--train-x-dir", type=str, default="/opt/ml/input/data/train_x")
    p.add_argument("--train-y-dir", type=str, default="/opt/ml/input/data/train_y")
    p.add_argument("--test-x-dir", type=str, default="/opt/ml/input/data/test_x")
    p.add_argument("--test-y-dir", type=str, default="/opt/ml/input/data/test_y")
    p.add_argument("--C", type=float, default=1.0)
    p.add_argument("--max-iter", type=int, default=1000)
    return p.parse_args()

if __name__ == "__main__":
    args = parse_args()

    x_train_path = first_csv(args.train_x_dir)
    y_train_path = first_csv(args.train_y_dir)
    x_test_path  = first_csv(args.test_x_dir)
    y_test_path  = first_csv(args.test_y_dir)

    print("Reading files:")
    print("X_train:", x_train_path)
    print("y_train:", y_train_path)
    print("X_test :", x_test_path)
    print("y_test :", y_test_path)

    # Squeeze only the column axis: a single-column label file becomes a
    # Series even when it has just one row (a bare squeeze() would collapse
    # that case to a scalar).
    X_train = pd.read_csv(x_train_path)
    y_train = pd.read_csv(y_train_path).squeeze("columns")
    X_test  = pd.read_csv(x_test_path)
    y_test  = pd.read_csv(y_test_path).squeeze("columns")

    model = LogisticRegression(C=args.C, max_iter=args.max_iter)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print("TEST_ACCURACY:", acc)

    # Save the fitted model where SM_MODEL_DIR points (default /opt/ml/model).
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, "model.joblib"))
"""
# Materialize the script next to the notebook.
with open("train.py", "w", encoding="utf-8") as f:
    f.write(train_script)

print("Updated train.py (robust file loading).")



Updated train.py (robust file loading).


In [0]:
from sagemaker.sklearn.model import SKLearnModel
import sagemaker

session = sagemaker.Session()
role = sagemaker.get_execution_role()

# NOTE(review): `desc` is not assigned anywhere in the visible notebook.
# Presumably it is a describe_training_job response from a cell that is no
# longer present — confirm; on a fresh kernel this line raises NameError.
model_artifact = desc["ModelArtifacts"]["S3ModelArtifacts"]

# Model object that points at the artifact above. No entry_point is passed
# here; a later cell rebinds `sk_model` to a model built from model.tar.gz.
sk_model = SKLearnModel(
    model_data=model_artifact,
    role=role,
    framework_version="1.2-1",
    py_version="py3",
    sagemaker_session=session
)

print("Model object created.")


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Model object created.


In [0]:
import boto3
from urllib.parse import urlparse

s3 = boto3.client("s3")

# Split the S3 URI into its bucket (netloc) and key prefix (path without the
# leading slash).
uri = "s3://amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv/titanic-sklearn/output/"
p = urlparse(uri)
bucket, prefix = p.netloc, p.path.lstrip("/")

# Single request, capped at 50 keys.
resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=50)

print(f"Bucket: {bucket}\nPrefix: {prefix}")
# "Contents" is absent from the response when nothing matches the prefix.
for obj in resp.get("Contents") or []:
    print(obj["Key"])


Bucket: amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv
Prefix: titanic-sklearn/output/
titanic-sklearn/output/sagemaker-scikit-learn-2025-12-15-13-32-20-053/debug-output/training_job_end.ts
titanic-sklearn/output/sagemaker-scikit-learn-2025-12-15-13-32-20-053/profiler-output/framework/training_job_end.ts
titanic-sklearn/output/sagemaker-scikit-learn-2025-12-15-13-32-20-053/profiler-output/system/incremental/2025121513/1765805520.algo-1.json
titanic-sklearn/output/sagemaker-scikit-learn-2025-12-15-13-32-20-053/profiler-output/system/incremental/2025121513/1765805580.algo-1.json
titanic-sklearn/output/sagemaker-scikit-learn-2025-12-15-13-32-20-053/profiler-output/system/incremental/2025121513/1765805640.algo-1.json
titanic-sklearn/output/sagemaker-scikit-learn-2025-12-15-13-32-20-053/profiler-output/system/training_job_end.ts
titanic-sklearn/output/sagemaker-scikit-learn-2025-12-15-13-44-17-121/debug-output/training_job_end.ts
titanic-sklearn/output/sagemaker-scikit-learn-2025-12

In [0]:
import tarfile
import os

# Make sure the file exists
assert os.path.exists("logistic_model.joblib"), "logistic_model.joblib not found!"

artifact_name = "model.tar.gz"
# Gzip-compressed tar archive; the model is stored inside it under the name
# "model.joblib" rather than its local file name.
with tarfile.open(artifact_name, "w:gz") as tar:
    tar.add("logistic_model.joblib", arcname="model.joblib")

print("Created:", artifact_name)


Created: model.tar.gz


In [0]:
import sagemaker

session = sagemaker.Session()
bucket = session.default_bucket()
# Key prefix for the hand-built archive.
model_prefix = "titanic-sklearn/manual-artifacts"
# Keep the returned S3 URI: the next cell passes it to SKLearnModel as model_data.
model_s3 = session.upload_data("model.tar.gz", bucket=bucket, key_prefix=model_prefix)

print("Uploaded model artifact to:", model_s3)


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


Uploaded model artifact to: s3://amazon-sagemaker-547868852858-us-east-1-3kgm8fdyzyjqqv/titanic-sklearn/manual-artifacts/model.tar.gz


In [0]:
from sagemaker.sklearn.model import SKLearnModel
import sagemaker

role = sagemaker.get_execution_role()

# Build the model from the manually uploaded model.tar.gz (`model_s3`); this
# rebinds `sk_model`.
# NOTE(review): no visible cell writes inference.py — presumably the file
# already exists in the working directory; confirm before a fresh run.
sk_model = SKLearnModel(
    model_data=model_s3,
    role=role,
    framework_version="1.2-1",
    py_version="py3",
    entry_point="inference.py",   # we will create it now
    sagemaker_session=session
)
print("SKLearnModel ready.")


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3Bucket


sagemaker.config INFO - Applied value from config key = SageMaker.PythonSDK.Modules.Session.DefaultS3ObjectKeyPrefix


SKLearnModel ready.


In [0]:
# Deploy `sk_model` on one ml.t2.medium instance and keep the returned
# predictor; the next cell uses it to delete the endpoint.
predictor = sk_model.deploy(initial_instance_count=1, instance_type="ml.t2.medium")
print("Endpoint deployed.")


-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

-

!

Endpoint deployed.


In [0]:
# Tear down the endpoint created by the deploy cell.
# NOTE(review): no visible cell calls predictor.predict between deploy and
# delete, so the endpoint is never exercised in this notebook.
predictor.delete_endpoint()
print("Endpoint deleted ✅")


Endpoint deleted ✅


## Shutdown cells

In [0]:
"""
Stop spark session and associated Athena Spark session
"""

from IPython import get_ipython as _get_ipython
# Look up `spark` in the interactive user namespace at run time and stop it.
# The lookup uses ["spark"], so it presumably raises KeyError when the startup
# cell that creates `spark` has not run — confirm.
_get_ipython().user_ns["spark"].stop()