In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns

# Load Titanic dataset from seaborn
df = sns.load_dataset('titanic')

# For simplicity, fill missing ages with median for these tasks
df['age'].fillna(df['age'].median(), inplace=True)
df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)

print("\n--- Question 5: Label Encoding vs One-Hot Encoding ---")
# Label Encoding for 'sex'
le = LabelEncoder()
sex_label_encoded = le.fit_transform(df['sex'])
print("Label Encoded 'sex':", sex_label_encoded[:5])

# One-Hot Encoding for 'sex'
ohe = OneHotEncoder(sparse=False, drop='if_binary')
sex_onehot_encoded = ohe.fit_transform(df[['sex']])
print("One-Hot Encoded 'sex':\n", sex_onehot_encoded[:5])

print("\n--- Question 6: Combining Feature Scaling Techniques ---")
features = df[['age', 'fare']].copy()

# Min-Max Scaling
minmax_scaler = MinMaxScaler()
features_minmax = minmax_scaler.fit_transform(features)

# Standardization
standard_scaler = StandardScaler()
features_standard = standard_scaler.fit_transform(features)

print("Min-Max Scaled (first 5 rows):\n", features_minmax[:5])
print("Standardized (first 5 rows):\n", features_standard[:5])

print("\n--- Question 7: Handling Multiple Categorical Features ---")
# One-Hot Encoding 'sex' and 'embarked'
multi_ohe = OneHotEncoder(sparse=False, drop='first')
cat_features = df[['sex', 'embarked']]
cat_encoded = multi_ohe.fit_transform(cat_features)
print("One-Hot Encoded 'sex' & 'embarked' (first 5 rows):\n", cat_encoded[:5])

print("\n--- Question 8: Ordinal Encoding for Ranked Categories ---")
# 'pclass' as ordinal (1=1st class highest, 3=3rd class lowest)
ordinal_map = {1: 3, 2: 2, 3: 1}  # reversing to rank: 1st=3, 2nd=2, 3rd=1 (example)
df['pclass_ordinal'] = df['pclass'].map(ordinal_map)
print("Ordinal encoded 'pclass' (first 10):", df['pclass_ordinal'].head(10).tolist())

print("\n--- Question 9: Impact of Scaling on Different Algorithms ---")
# Prepare data
X = df[['age', 'fare']]
y = df['survived']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Decision Tree without scaling
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print("Decision Tree accuracy (no scaling):", accuracy_score(y_test, dt_pred))

# SVM without scaling
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print("SVM accuracy (no scaling):", accuracy_score(y_test, svm_pred))

# Scale features with StandardScaler for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm.fit(X_train_scaled, y_train)
svm_pred_scaled = svm.predict(X_test_scaled)
print("SVM accuracy (with scaling):", accuracy_score(y_test, svm_pred_scaled))

print("\n--- Question 10: Custom Transformations for High Cardinality Features ---")

class HighCardinalityEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, top_n=10):
        self.top_n = top_n
        self.top_categories_ = None
        
    def fit(self, X, y=None):
        self.top_categories_ = X.value_counts().nlargest(self.top_n).index
        return self
    
    def transform(self, X):
        X_transformed = X.apply(lambda x: x if x in self.top_categories_ else 'Other')
        return pd.get_dummies(X_transformed)

# Fix categorical dtype issue before filling NA for 'deck'
if pd.api.types.is_categorical_dtype(df['deck']):
    df['deck'] = df['deck'].cat.add_categories('Missing')

df['deck'] = df['deck'].fillna('Missing')

encoder = HighCardinalityEncoder(top_n=5)
encoder.fit(df['deck'])
encoded_deck = encoder.transform(df['deck'])

print(encoded_deck.head())



--- Question 5: Label Encoding vs One-Hot Encoding ---
Label Encoded 'sex': [1 0 0 0 1]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embarked'].fillna(df['embarked'].mode()[0], inplace=True)


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'