In [1]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.





# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same datasetand explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.



import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np

# Load the dataset
try:
    df = pd.read_csv('titanic.csv')
except FileNotFoundError:
    print("titanic.csv not found. Please download it from Kaggle or another source.")
    # Using a sample dataset structure if titanic.csv is not available for demonstration
    data = {'PassengerId': range(1, 11),
            'Survived': [0, 1, 1, 1, 0, 0, 0, 0, 1, 1],
            'Pclass': [3, 1, 3, 1, 3, 3, 1, 3, 2, 3],
            'Name': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
            'Sex': ['male', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'female', 'male'],
            'Age': [22, 38, 26, 35, 35, np.nan, 54, 2, 27, 14],
            'SibSp': [1, 1, 0, 1, 0, 0, 0, 3, 1, 0],
            'Parch': [0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
            'Ticket': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
            'Fare': [7.25, 71.28, 7.92, 53.1, 8.05, 8.45, 51.86, 21.07, 24.0, 7.85],
            'Cabin': [np.nan, 'C85', np.nan, 'C123', np.nan, np.nan, 'E46', np.nan, np.nan, np.nan],
            'Embarked': ['S', 'C', 'S', 'S', 'S', 'Q', 'S', 'S', 'S', 'S']}
    df = pd.DataFrame(data)


print("Original DataFrame:")
print(df.head())
print("-" * 30)


# Question 5: Label Encoding vs One-Hot Encoding for 'Sex'
print("Question 5: Label Encoding vs One-Hot Encoding for 'Sex'")

# Label Encoding
label_encoder = LabelEncoder()
df['Sex_LabelEncoded'] = label_encoder.fit_transform(df['Sex'])
print("\nDataFrame with Label Encoded 'Sex':")
print(df[['Sex', 'Sex_LabelEncoded']].head())

# One-Hot Encoding (using pandas get_dummies)
df_onehot = pd.get_dummies(df, columns=['Sex'], prefix='Sex_OneHot', drop_first=True) # drop_first=True to avoid multicollinearity
print("\nDataFrame with One-Hot Encoded 'Sex':")
print(df_onehot[['Sex_OneHot_male']].head()) # Note: 'female' would be 0 when 'male' is 1 and vice versa

# Clean up added columns for the next questions
df = df.drop(columns=['Sex_LabelEncoded'])
df_onehot = df_onehot.drop(columns=['Sex_OneHot_male']) # Revert one-hot for subsequent steps

print("-" * 30)


# Question 6: Combining Feature Scaling Techniques
print("Question 6: Combining Feature Scaling Techniques")

# Use the 'Fare' column for demonstration
# Handle potential missing values in 'Fare' if any (though 'Fare' usually has very few)
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Reshape for scikit-learn scalers
fare = df[['Fare']]

# Min-Max Scaling
minmax_scaler = MinMaxScaler()
df['Fare_MinMaxScaled'] = minmax_scaler.fit_transform(fare)
print("\nOriginal Fare vs Min-Max Scaled Fare:")
print(df[['Fare', 'Fare_MinMaxScaled']].head())

# Standardization (Z-score scaling)
standard_scaler = StandardScaler()
df['Fare_StandardScaled'] = standard_scaler.fit_transform(fare)
print("\nOriginal Fare vs Standard Scaled Fare:")
print(df[['Fare', 'Fare_StandardScaled']].head())

print("\nExplanation: Min-Max scaling scales data to a fixed range (usually [0, 1]), while Standardization scales data to have a mean of 0 and a standard deviation of 1. The outputs show the different distributions of the scaled values compared to the original.")

# Clean up added columns
df = df.drop(columns=['Fare_MinMaxScaled', 'Fare_StandardScaled'])

print("-" * 30)


# Question 7: Handling Multiple Categorical Features
print("Question 7: Handling Multiple Categorical Features ('Sex', 'Embarked')")

# Handle missing values in 'Embarked'
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Select categorical columns
categorical_cols = ['Sex', 'Embarked']

# Use ColumnTransformer for One-Hot Encoding multiple columns
# remainder='passthrough' keeps other columns
# sparse_output=False returns a dense numpy array
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

ct = ColumnTransformer(
    transformers=[('onehot', onehot_encoder, categorical_cols)],
    remainder='passthrough'
)

# Fit and transform the data
df_encoded_multiple = ct.fit_transform(df)

# Get the feature names after encoding
# This requires accessing the encoder within the ColumnTransformer
# For sparse_output=False, it's easier to convert back to DataFrame for readability
# Create a list of new column names
onehot_feature_names = ct.named_transformers_['onehot'].get_feature_names_in(categorical_cols)
new_column_names = list(onehot_feature_names) + [col for col in df.columns if col not in categorical_cols]

df_encoded_multiple = pd.DataFrame(df_encoded_multiple, columns=new_column_names, index=df.index)

print("\nDataFrame with multiple categorical features ('Sex', 'Embarked') One-Hot Encoded:")
# Display relevant columns
encoded_cols_display = [col for col in df_encoded_multiple.columns if col.startswith('Sex_') or col.startswith('Embarked_')]
print(df_encoded_multiple[encoded_cols_display].head())

print("-" * 30)


# Question 8: Ordinal Encoding for Ranked Categories ('Pclass')
print("Question 8: Ordinal Encoding for Ranked Categories ('Pclass')")

# 'Pclass' is 1st, 2nd, 3rd. We can assume 1st > 2nd > 3rd.
# Define the order of categories explicitly
pclass_order = [[1, 2, 3]] # Pclass values as a list within a list for the encoder

ordinal_encoder = OrdinalEncoder(categories=pclass_order)

# Reshape for scikit-learn encoder
pclass = df[['Pclass']]

df['Pclass_OrdinalEncoded'] = ordinal_encoder.fit_transform(pclass)

print("\nOriginal Pclass vs Ordinal Encoded Pclass:")
print(df[['Pclass', 'Pclass_OrdinalEncoded']].head())

print("\nExplanation: Pclass values (1, 2, 3) are encoded preserving their rank, where 1 < 2 < 3 in the encoded output.")

# Clean up added column
df = df.drop(columns=['Pclass_OrdinalEncoded'])

print("-" * 30)


# Question 9: Impact of Scaling on Different Algorithms
print("Question 9: Impact of Scaling on Different Algorithms (Decision Tree vs SVM)")

# Select features and target
# Using numerical features: 'Age', 'Fare', 'SibSp', 'Parch'
# Using categorical features: 'Sex', 'Embarked', 'Pclass'
# Target: 'Survived'

features = ['Age', 'Fare', 'SibSp', 'Parch', 'Sex', 'Embarked', 'Pclass']
target = 'Survived'

X = df[features].copy()
y = df[target].copy()

# Handle missing values
X['Age'] = X['Age'].fillna(X['Age'].median())
X['Fare'] = X['Fare'].fillna(X['Fare'].median())
X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0]) # Already done, but good practice

# Define preprocessing steps for numerical and categorical features
numerical_features = ['Age', 'Fare', 'SibSp', 'Parch']
categorical_features = ['Sex', 'Embarked', 'Pclass']

# Create preprocessors
# Preprocessor for unscaled numerical data
preprocessor_unscaled = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features), # Keep numerical data as is
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Preprocessor for scaled numerical data
preprocessor_scaled = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features), # Scale numerical data
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Train models ---

# Decision Tree on Unscaled Data
pipeline_dt_unscaled = Pipeline(steps=[('preprocessor', preprocessor_unscaled),
                                       ('classifier', DecisionTreeClassifier(random_state=42))])

pipeline_dt_unscaled.fit(X_train, y_train)
y_pred_dt_unscaled = pipeline_dt_unscaled.predict(X_test)
accuracy_dt_unscaled = accuracy_score(y_test, y_pred_dt_unscaled)
print(f"\nAccuracy (Decision Tree, Unscaled Numerical Features): {accuracy_dt_unscaled:.4f}")

# Decision Tree on Scaled Data
pipeline_dt_scaled = Pipeline(steps=[('preprocessor', preprocessor_scaled),
                                     ('classifier', DecisionTreeClassifier(random_state=42))])

pipeline_dt_scaled.fit(X_train, y_train)
y_pred_dt_scaled = pipeline_dt_scaled.predict(X_test)
accuracy_dt_scaled = accuracy_score(y_test, y_pred_dt_scaled)
print(f"Accuracy (Decision Tree, Scaled Numerical Features): {accuracy_dt_scaled:.4f}")

# SVM on Unscaled Data
# SVM is sensitive to the scale of features, so expect poor performance without scaling
# Use probability=True for predict_proba if needed, but not required for basic accuracy
pipeline_svm_unscaled = Pipeline(steps=[('preprocessor', preprocessor_unscaled),
                                        ('classifier', SVC(gamma='auto', random_state=42))])

# Note: SVM can take longer to train
try:
    pipeline_svm_unscaled.fit(X_train, y_train)
    y_pred_svm_unscaled = pipeline_svm_unscaled.predict(X_test)
    accuracy_svm_unscaled = accuracy_score(y_test, y_pred_svm_unscaled)
    print(f"Accuracy (SVM, Unscaled Numerical Features): {accuracy_svm_unscaled:.4f}")
except Exception as e:
    print(f"Could not train SVM on unscaled data (may be due to large feature values): {e}")
    accuracy_svm_unscaled = 'N/A'


# SVM on Scaled Data
pipeline_svm_scaled = Pipeline(steps=[('preprocessor', preprocessor_scaled),
                                      ('classifier', SVC(gamma='auto', random_state=42))])

pipeline_svm_scaled.fit(X_train, y_train)
y_pred_svm_scaled = pipeline_svm_scaled.predict(X_test)
accuracy_svm_scaled = accuracy_score(y_test, y_pred_svm_scaled)
print(f"Accuracy (SVM, Scaled Numerical Features): {accuracy_svm_scaled:.4f}")

print("\nExplanation:")
print(f"- Decision Trees are generally less affected by feature scaling because they make decisions based on feature thresholds, not distances.")
print(f"- SVMs, which use distance calculations (e.g., in the kernel), are highly sensitive to the scale of features. Scaling typically improves their performance.")
print(f"- The results ({accuracy_dt_unscaled:.4f} vs {accuracy_dt_scaled:.4f} for DT; {accuracy_svm_unscaled} vs {accuracy_svm_scaled:.4f} for SVM) demonstrate this impact.")

print("-" * 30)


# Question 10: Custom Transformations for Categorical Features (Frequency Encoding Example)
print("Question 10: Custom Transformations for Categorical Features (Frequency Encoding Example)")

# Demonstrating a simple custom frequency encoding function
# This is more useful for high cardinality features, but we'll apply it to 'Embarked' for demonstration

def frequency_encode(df, column):
    """
    Applies frequency encoding to a specified column in a DataFrame.
    """
    # Calculate frequency of each category
    freq_map = df[column].value_counts(normalize=True).to_dict()
    # Map frequencies to the column
    df[f'{column}_FreqEncoded'] = df[column].map(freq_map)
    return df

# Apply custom frequency encoding to 'Embarked'
df_freq_encoded = frequency_encode(df.copy(), 'Embarked') # Use a copy to not modify the original df

print("\nDataFrame with 'Embarked' Frequency Encoded:")
print(df_freq_encoded[['Embarked', 'Embarked_FreqEncoded']].head())

# Example of how it might look for a feature like 'Ticket' (if used, needs cleaning/handling)
# print("\nApplying Frequency Encoding to 'Ticket' (Example - requires handling non-numeric):")
# df_freq_encoded_ticket = frequency_encode(df.copy(), 'Ticket')
# print(df_freq_encoded_ticket[['Ticket', 'Ticket_FreqEncoded']].head())
# print("Note: Frequency encoding on 'Ticket' results in many unique values having low frequencies.")


print("\nExplanation: Frequency encoding replaces each category with its frequency (or proportion) in the dataset. This can be useful for high-cardinality features by reducing the number of unique values to a single numerical feature.")

print("-" * 30)


titanic.csv not found. Please download it from Kaggle or another source.
Original DataFrame:
   PassengerId  Survived  Pclass Name     Sex   Age  SibSp  Parch Ticket  \
0            1         0       3    A    male  22.0      1      0      A   
1            2         1       1    B  female  38.0      1      0      B   
2            3         1       3    C  female  26.0      0      0      C   
3            4         1       1    D  female  35.0      1      0      D   
4            5         0       3    E    male  35.0      0      0      E   

    Fare Cabin Embarked  
0   7.25   NaN        S  
1  71.28   C85        C  
2   7.92   NaN        S  
3  53.10  C123        S  
4   8.05   NaN        S  
------------------------------
Question 5: Label Encoding vs One-Hot Encoding for 'Sex'

DataFrame with Label Encoded 'Sex':
      Sex  Sex_LabelEncoded
0    male                 1
1  female                 0
2  female                 0
3  female                 0
4    male                 1



AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names_in'