# Part1

# Task1

In [None]:
# 1. Age Group
# pd.cut requires exactly one label per interval. The six bin edges below
# define five intervals, so five labels are needed (passing four raises a
# ValueError). Intervals are right-closed by default:
#   (0, 12] Child, (12, 18] Teen, (18, 35] Young Adult,
#   (35, 60] Adult, (60, 100] Senior
# Missing ages and ages outside (0, 100] end up as NaN in "age_group".
df["age_group"] = pd.cut(
    df["age"],
    bins=[0, 12, 18, 35, 60, 100],
    labels=["Child", "Teen", "Young Adult", "Adult", "Senior"],
)

# 2. Family Size
# Siblings/spouses + parents/children + the passenger themselves, so the
# value is never below 1 when both counts are non-negative.
df["family_size"] = df["sibsp"] + df["parch"] + 1

# 3. Fare per Person
# Splits the fare evenly across the family members computed above.
df["fare_per_person"] = df["fare"] / df["family_size"]

# Preview the source column next to the three engineered features.
df[["age", "age_group", "family_size", "fare_per_person"]].head()


# Task2

In [None]:
# Convert to datetime
df["order_date"] = pd.to_datetime(df["order_date"])

# Extract year, month, day
df["order_year"] = df["order_date"].dt.year
df["order_month"] = df["order_date"].dt.month
df["order_day"] = df["order_date"].dt.day

# Text features: character count and whitespace-separated word count.
# Missing reviews are replaced with "" *before* the string cast; otherwise
# astype(str) turns NaN into the literal text "nan", which would be counted
# as 3 characters and 1 word instead of 0 and 0.
review_text = df["review_text"].fillna("").astype(str)
df["review_length"] = review_text.str.len()
df["review_word_count"] = review_text.str.split().str.len()

# Preview the parsed date parts and the text features.
df[["order_date", "order_year", "order_month", "order_day",
    "review_length", "review_word_count"]].head()


# Part2

# Task3

In [None]:
# Names of every object-dtype column in the frame; these are the ones that
# get one-hot encoded below.
cat_cols = df.select_dtypes(include="object").columns

# One-hot encode those columns into a new frame. drop_first=True omits the
# first level of each encoded column.
df_encoded = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

# Preview the encoded frame.
df_encoded.head()


# Task4

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Partition the columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

# Numeric columns are forwarded unchanged; categorical columns are one-hot
# encoded with the first level of each feature dropped.
numeric_step = ("num", "passthrough", num_cols)
categorical_step = ("cat", OneHotEncoder(drop="first"), cat_cols)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step]
)

# Fit on the whole frame and display the transformed feature matrix.
# NOTE(review): the result may be a sparse matrix or a dense array depending
# on how sparse the encoded output is -- confirm before indexing into it.
df_transformed = preprocessor.fit_transform(df)
df_transformed


# Part3

# Task5

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn per-column scaling statistics from the numeric columns, then apply
# them to those same columns (two explicit steps instead of fit_transform).
scaler = StandardScaler()
scaler.fit(df[num_cols])
df_scaled = scaler.transform(df[num_cols])

# Show the first five standardized rows.
df_scaled[:5]


# Task6

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each numeric column's observed range, then rescale those same
# columns with it (two explicit steps instead of fit_transform).
minmax = MinMaxScaler()
minmax.fit(df[num_cols])
df_minmax = minmax.transform(df[num_cols])

# Show the first five rescaled rows.
df_minmax[:5]


# Part4

# Task7

In [None]:
from sklearn.pipeline import Pipeline

# Numeric branch: standardize every numeric column.
num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

# Categorical branch: one-hot encode, dropping the first level of each
# feature. handle_unknown="ignore" encodes categories that were not seen
# during fit as all zeros instead of raising at transform/predict time,
# which matters once the data is split into train and test sets.
cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
])

# Route each group of columns through its own branch. Columns listed in
# neither group are dropped (ColumnTransformer's default remainder).
full_preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])


# Task8

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Separate the features from the target.
X = df.drop(columns="survived")
y = df["survived"]

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# full_preprocessor was configured with column lists taken from the full
# frame, so it can still reference the target column "survived". A
# ColumnTransformer that names a column missing from its input raises a
# ValueError on fit, so rebuild the same (name, transformer, columns) spec
# restricted to the columns that actually exist in X. Like the original
# preprocessor, this uses ColumnTransformer's default settings otherwise.
feature_preprocessor = ColumnTransformer([
    (name, transformer, [col for col in cols if col in X.columns])
    for name, transformer, cols in full_preprocessor.transformers
])

# Preprocessing and the classifier are fitted together, so the scaling and
# encoding statistics are learned from the training split only.
model_pipeline = Pipeline([
    ("preprocessing", feature_preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

model_pipeline.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = model_pipeline.predict(X_test)


# Task9

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Separate the features from the target first, so the column lists below
# describe the model inputs only.
X = df.drop(columns="survived")
y = df["survived"]

# Derive the column groups from X rather than df: taking them from df can
# put the target "survived" into num_cols, and a ColumnTransformer that
# names a column missing from its input raises a ValueError on fit.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Numeric branch: standardize every numeric feature.
num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

# Categorical branch: one-hot encode, dropping the first level of each
# feature; categories unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore", drop="first"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing and the classifier are fitted together, so the scaling and
# encoding statistics are learned from the training split only.
full_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

full_pipeline.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = full_pipeline.predict(X_test)

# Mean accuracy on the held-out rows.
print("Pipeline Accuracy:", full_pipeline.score(X_test, y_test))
