In [None]:
# Step 1: Install and Import Libraries
!pip install pandas numpy scikit-learn -q

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.exceptions import DataConversionWarning
import warnings

# Silence only scikit-learn's DataConversionWarning category; warnings of
# every other category are still shown.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


In [None]:
# Step 2: Load and Split the Dataset
# The CSV is read directly from a GitHub raw URL, so this cell needs network access.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Split into train and validation sets: 20% of the rows go to validation,
# the split is stratified on the 'Survived' column, and random_state is
# fixed at 42 so re-running the cell gives the same partition.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Survived'])


In [None]:
# Step 3: Define the Pipeline Functions
def clean_data(df):
    """Drop identifier-like columns and impute missing Age / Embarked values.

    Parameters
    ----------
    df : pandas.DataFrame
        Passenger table. Must contain ``Age`` and ``Embarked``. The columns
        ``PassengerId``, ``Ticket`` and ``Cabin`` are removed when present.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. Missing ``Age`` values are
        replaced by the median and missing ``Embarked`` values by the most
        frequent value, both computed from the frame that was passed in.
    """
    df = df.drop(columns=["PassengerId", "Ticket", "Cabin"], errors="ignore").copy()

    # NOTE(review): the fill values come from whichever frame is passed, so a
    # validation split is imputed with its own statistics, not the training ones.
    df["Age"] = df["Age"].fillna(df["Age"].median())

    # mode() is an empty Series when every Embarked value is missing; indexing
    # it with [0] would raise, so impute only when a mode actually exists.
    embarked_mode = df["Embarked"].mode()
    if not embarked_mode.empty:
        df["Embarked"] = df["Embarked"].fillna(embarked_mode.iloc[0])
    return df

def engineer_features(df):
    """Derive model features from a cleaned Titanic passenger frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name``, ``SibSp``, ``Parch``, ``Fare``, ``Age``,
        ``Sex`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with ``FamilySize``,
        ``IsAlone``, ``FareBin`` and ``AgeBin`` added, ``Name``/``SibSp``/
        ``Parch`` removed, and ``Sex``/``Embarked``/``Title`` one-hot encoded
        with the first level of each dropped.
    """
    df = df.copy()

    # Title extraction: the text between the comma and the first period.
    # An optional leading "the " is skipped so that "the Countess" is captured
    # as "Countess" and therefore matches the rare-title list below.
    df["Title"] = df["Name"].str.extract(r",\s*(?:the\s+)?([^\.]+)\.", expand=False)
    rare_titles = ["Lady","Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"]
    df["Title"] = df["Title"].replace(rare_titles, "Rare")

    # Family size & is alone (the passenger counts as one member)
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Fare quartile codes. NOTE(review): the quartile edges are recomputed from
    # whichever frame is passed in, so they can differ between splits.
    df["FareBin"] = pd.qcut(df["Fare"].fillna(0), 4, labels=False)
    # include_lowest keeps an age of exactly 0 in the first bin instead of NaN.
    df["AgeBin"] = pd.cut(df["Age"], bins=[0,12,20,40,60,100], labels=False,
                          include_lowest=True)

    # Drop unused columns
    df = df.drop(columns=["Name","SibSp","Parch"])
    # One-hot encode
    df = pd.get_dummies(df, columns=["Sex","Embarked","Title"], drop_first=True)
    return df

def validate_data(df):
    """Run sanity checks on a prepared frame and collect the problems found.

    Two checks are performed: that no column contains null values, and that
    every required feature column is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list of str
        One message per failed check; an empty list when both checks pass.
    """
    required = {"Survived","Pclass","Age","Fare","FamilySize","IsAlone","FareBin","AgeBin"}
    problems = []

    # Null check: count nulls per column and report only the offending columns.
    nulls_per_column = df.isnull().sum()
    if nulls_per_column.any():
        offenders = nulls_per_column[nulls_per_column > 0]
        problems.append(f"Null values found:\n{offenders}")

    # Schema check: any required column absent from the frame is reported.
    absent = required.difference(df.columns)
    if absent:
        problems.append(f"Missing columns: {absent}")

    return problems

In [None]:
# Step 4: Execute the Pipeline
# Clean and feature-engineer training data
train_clean = clean_data(train_df)
train_feat  = engineer_features(train_clean)
train_errors = validate_data(train_feat)
print("Train validation errors:", train_errors or "None")

# Clean and feature-engineer validation data
val_clean = clean_data(val_df)
val_feat  = engineer_features(val_clean)

# engineer_features one-hot encodes each split on its own, so a category that
# occurs in only one split produces a column the other split lacks. Align the
# validation frame to the training columns: columns seen only in training are
# added as all-zero indicators (cast to the training dtype), and columns seen
# only in validation are dropped.
train_only_cols = train_feat.columns.difference(val_feat.columns)
val_feat = val_feat.reindex(columns=train_feat.columns, fill_value=0)
val_feat = val_feat.astype({name: train_feat[name].dtype for name in train_only_cols})

val_errors = validate_data(val_feat)
print("Validation validation errors:", val_errors or "None")

Train validation errors: None
Validation validation errors: None


In [None]:
# Step 5: Save Prepared Data
# Each prepared split is written to its own CSV file without the index column.
for _frame, _csv_path in (
    (train_feat, "titanic_train_prepared.csv"),
    (val_feat, "titanic_val_prepared.csv"),
):
    _frame.to_csv(_csv_path, index=False)


In [None]:
# NOTE(review): this cell re-defines validate_data with the same checks as the
# definition in the "Step 3" cell; the later definition shadows the earlier one.
def validate_data(df):
    """Return a list of data-quality messages for a prepared frame.

    Reports columns that contain nulls and required feature columns that are
    absent. The list is empty when nothing is wrong.
    """
    column_nulls = df.isnull().sum()
    absent_columns = {"Survived","Pclass","Age","Fare","FamilySize","IsAlone","FareBin","AgeBin"} - set(df.columns)

    report = []
    if column_nulls.any():
        # Only columns with at least one null are shown in the message.
        report.append(f"Null values found:\n{column_nulls[column_nulls>0]}")
    if absent_columns:
        report.append(f"Missing columns: {absent_columns}")
    return report


 Diabetes Risk Prediction Pipeline

In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the data
# The CSV is read directly from a GitHub raw URL, so this cell needs network access.
url = "https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv"
df = pd.read_csv(url)

# Preview the first rows; as the cell's last expression it is displayed.
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
def clean_data(df):
    """Replace zero readings with column medians and drop duplicate rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Glucose``, ``BloodPressure``, ``SkinThickness``,
        ``Insulin`` and ``BMI``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. In the five columns above,
        zeros are treated as missing and filled with the median of the
        remaining values. Exact duplicate rows are then removed.
    """
    df = df.copy()

    # These columns shouldn't have zero values
    zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

    for col in zero_cols:
        # Treat 0 as missing so it does not pull the median down.
        as_missing = df[col].replace(0, np.nan)
        # Assign the filled Series back to the frame. Calling
        # fillna(inplace=True) on df[col] acts on an intermediate object and,
        # per the pandas warning, stops updating df in pandas 3.0.
        df[col] = as_missing.fillna(as_missing.median())

    # Drop duplicate rows (after imputation, as before)
    return df.drop_duplicates()


In [None]:
def engineer_features(df):
    """Add BMI category, age bin and a glucose-insulin interaction feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``BMI``, ``Age``, ``Glucose`` and ``Insulin``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with three extra columns:
        ``BMI_Category`` (categorical), ``AgeBin`` (categorical) and
        ``Glucose_Insulin`` (product of the two readings).
    """
    df = df.copy()

    # BMI categories; intervals are closed on the right, e.g. (18.5, 24.9].
    df["BMI_Category"] = pd.cut(df["BMI"],
                                bins=[0, 18.5, 24.9, 29.9, np.inf],
                                labels=["Underweight", "Normal", "Overweight", "Obese"])

    # Age binning by decade, left-closed: [20, 30) -> "20s", and so on.
    # The top bin is open-ended ([70, inf)) so that ages of 80 and above are
    # kept under the existing "70s" label instead of silently becoming NaN.
    # The label set itself is unchanged. Ages below 20 still fall outside
    # every bin and become NaN.
    df["AgeBin"] = pd.cut(df["Age"],
                          bins=[20, 30, 40, 50, 60, 70, np.inf],
                          labels=["20s", "30s", "40s", "50s", "60s", "70s"],
                          right=False)

    # Interaction feature
    df["Glucose_Insulin"] = df["Glucose"] * df["Insulin"]

    return df


In [None]:
def validate_data(df):
    """Check an engineered diabetes frame for nulls and unknown category labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``BMI_Category`` and ``AgeBin``.

    Returns
    -------
    list of str
        One message per failed check; an empty list when everything passes.
    """
    errors = []

    # 1. Null check (covers every column, including the two categoricals)
    nulls = df.isnull().sum()
    if nulls.any():
        errors.append(f"Nulls found in:\n{nulls[nulls > 0]}")

    # 2. Validate that only known labels occur
    bmi_expected = {"Underweight", "Normal", "Overweight", "Obese"}
    agebin_expected = {"20s", "30s", "40s", "50s", "60s", "70s"}

    # Missing values are dropped before both label checks: they are already
    # reported by the null check above, and a NaN is not an "unexpected label".
    if not set(df["BMI_Category"].dropna().unique()) <= bmi_expected:
        errors.append("Unexpected values in BMI_Category")

    if not set(df["AgeBin"].dropna().unique()) <= agebin_expected:
        errors.append("Unexpected values in AgeBin")

    return errors


In [None]:
# Load data again
# Re-reads the CSV from `url` (assigned in the load cell of this section) so
# the pipeline starts from a freshly loaded frame.
df_raw = pd.read_csv(url)

# Apply steps: clean -> engineer features -> validate.
# validate_data returns a list of messages (empty when no check failed).
df_clean = clean_data(df_raw)
df_feat = engineer_features(df_clean)
validation_errors = validate_data(df_feat)

# Show results ("None" is printed when the error list is empty)
print("✅ Validation Errors:", validation_errors or "None")
df_feat.head()


✅ Validation Errors: ['Nulls found in:\nAgeBin    1\ndtype: int64']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_Category,AgeBin,Glucose_Insulin
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1,Obese,50s,18500.0
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0,Overweight,30s,10625.0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1,Normal,30s,22875.0
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0,Overweight,20s,8366.0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1,Obese,30s,23016.0


Customer Churn Prediction Pipeline

In [None]:
import pandas as pd
import numpy as np


In [None]:
import pandas as pd

# Corrected and working URL
# The CSV is read from a GitHub raw URL, so this cell needs network access.
url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"

# Load dataset
df = pd.read_csv(url)

# Preview the first nine rows as plain text
print(df.head(9))


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   
5  9305-CDSKC  Female              0      No         No       8          Yes   
6  1452-KIOVK    Male              0      No        Yes      22          Yes   
7  6713-OKOMC  Female              0      No         No      10           No   
8  7892-POOKP  Female              0     Yes         No      28          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            

In [None]:
def clean_data(df):
    """Make TotalCharges numeric, impute its gaps and drop the customer id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``TotalCharges`` and ``customerID``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified. ``TotalCharges`` is numeric
        with unparseable entries replaced by the column median, and
        ``customerID`` is removed.
    """
    df = df.copy()

    # Convert TotalCharges to numeric; entries that cannot be parsed become NaN
    total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')

    # Impute missing TotalCharges with the median and assign the result back.
    # Calling fillna(inplace=True) on df['TotalCharges'] acts on an
    # intermediate object and, per the pandas warning, stops updating df in
    # pandas 3.0.
    df['TotalCharges'] = total_charges.fillna(total_charges.median())

    # Drop customerID
    return df.drop(columns=['customerID'])


In [None]:
def engineer_features(df):
    """Add tenure group and average monthly charge, then one-hot encode.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tenure``, ``TotalCharges``, ``Contract`` and
        ``PaymentMethod``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with ``tenure_group`` and
        ``avg_charges_per_month`` added, and ``Contract``/``PaymentMethod``
        replaced by indicator columns (first level of each dropped).
    """
    df = df.copy()

    # Create tenure_group. Intervals are right-closed; include_lowest=True
    # makes the first one [0, 12] so that tenure == 0 lands in '0-12' instead
    # of falling outside every bin and becoming NaN.
    bins = [0, 12, 24, 36, 48, 60, 72]
    labels = ['0-12', '13-24', '25-36', '37-48', '49-60', '61-72']
    df['tenure_group'] = pd.cut(df['tenure'], bins=bins, labels=labels,
                                right=True, include_lowest=True)

    # Average charge per month of tenure, vectorised. Rows without positive
    # tenure get 0; masking the denominator first means no division by zero
    # is ever evaluated.
    has_tenure = df['tenure'] > 0
    per_month = df['TotalCharges'] / df['tenure'].where(has_tenure)
    df['avg_charges_per_month'] = per_month.where(has_tenure, 0.0)

    # One-hot encoding for categorical columns
    df = pd.get_dummies(df, columns=['Contract', 'PaymentMethod'], drop_first=True)

    return df


In [None]:
def validate_data(df):
    """Check the engineered churn frame and return a list of problems.

    Two checks are made: ``avg_charges_per_month`` must not contain infinite
    values, and every tenure-group label must occur at least once in
    ``tenure_group``.

    Returns
    -------
    list of str
        One message per failed check; empty when both pass.
    """
    required_groups = {'0-12', '13-24', '25-36', '37-48', '49-60', '61-72'}
    problems = []

    # Infinite averages would indicate an unguarded division by zero.
    if np.isinf(df['avg_charges_per_month']).any():
        problems.append("Infinite values found in avg_charges_per_month.")

    # Compare labels as plain strings, ignoring missing entries.
    observed_groups = {str(label) for label in df['tenure_group'].dropna().unique()}
    absent_groups = required_groups - observed_groups
    if absent_groups:
        problems.append(f"Missing tenure_group labels: {absent_groups}")

    return problems


In [None]:
# Load raw data
# Re-reads the CSV from `url` (assigned in the load cell of this section) so
# the pipeline starts from a freshly loaded frame.
df_raw = pd.read_csv(url)

# Apply pipeline steps: clean -> engineer features -> validate.
# validate_data returns a list of messages (empty when no check failed).
df_clean = clean_data(df_raw)
df_feat = engineer_features(df_clean)
validation_errors = validate_data(df_feat)

# Show results ("None" is printed when the error list is empty)
print("✅ Validation Errors:", validation_errors or "None")
df_feat.head()


✅ Validation Errors: None


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,MonthlyCharges,TotalCharges,Churn,tenure_group,avg_charges_per_month,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,29.85,29.85,No,0-12,29.85,False,False,False,True,False
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,...,56.95,1889.5,No,25-36,55.573529,True,False,False,False,True
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,...,53.85,108.15,Yes,0-12,54.075,False,False,False,False,True
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,...,42.3,1840.75,No,37-48,40.905556,True,False,False,False,False
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,...,70.7,151.65,Yes,0-12,75.825,False,False,False,True,False


House Price Modeling Pipeline

In [None]:
import pandas as pd
import numpy as np

# Load dataset (read from a GitHub raw URL, so this cell needs network access)
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df = pd.read_csv(url)

# Check for nulls: per-column count of missing values
print(df.isnull().sum())

# Impute total_bedrooms with its median and assign the result back.
# Calling fillna(inplace=True) on df['total_bedrooms'] acts on an intermediate
# object and, per the pandas warning, stops updating df in pandas 3.0.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

# List the distinct ocean_proximity categories (inspection only; nothing is dropped)
print("Categories in ocean_proximity:", df['ocean_proximity'].unique())
# (No numerical outliers to drop here; keeping categorical encoding for later use)


longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64
Categories in ocean_proximity: ['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)


In [None]:
# Ratio features, written as: new column -> (numerator column, denominator column).
_ratio_specs = {
    'rooms_per_household': ('total_rooms', 'households'),
    'bedrooms_per_room': ('total_bedrooms', 'total_rooms'),
    'population_per_household': ('population', 'households'),
}
for _feature, (_numerator, _denominator) in _ratio_specs.items():
    df[_feature] = df[_numerator].div(df[_denominator])

# Quartile code (0-3) of median_income
df['income_bin'] = pd.qcut(df['median_income'], q=4, labels=False)

# log(1 + x) transform of median_house_value
df['log_median_house_value'] = np.log1p(df['median_house_value'])


In [None]:
# Ensure ratio features are finite and strictly positive
ratio_cols = ['rooms_per_household', 'bedrooms_per_room', 'population_per_household']
for col in ratio_cols:
    # A zero denominator produces +inf, which would pass a plain "> 0" test,
    # and NaN is not a "non-positive" value, so finiteness is checked first.
    assert np.isfinite(df[col]).all(), f"{col} contains non-finite values!"
    assert (df[col] > 0).all(), f"{col} contains non-positive values!"

# Check log transform values are finite
assert np.isfinite(df['log_median_house_value']).all(), "Log-transformed values contain non-finite entries!"
