In [None]:
# Step 1: Install and Import Libraries

!pip install pandas numpy -q

import pandas as pd
import numpy as np


In [None]:
# Step 2: Load the Titanic Dataset

# Read the CSV straight from its URL (requires network access).
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)
# Last expression of the cell: shows the first five rows.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Step 3: Data Cleaning
# 1. Detect Missing Values and Data Types

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
df.info()
# Number of missing values per column.
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int6

In [None]:
# 2. Drop Irrelevant Columns

# Rebind df instead of dropping in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=["PassengerId","Ticket","Cabin"], errors="ignore")


In [None]:
# Impute Missing Values

# Assign the filled Series back to the column. Calling fillna(inplace=True) on
# df[col] operates on an intermediate object: pandas warns about it today and
# it will no longer update df in pandas 3.0.

# Age: fill with median
df["Age"] = df["Age"].fillna(df["Age"].median())
# Embarked: fill with mode
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)


In [None]:
# Handle Inconsistencies & Duplicates

# Keep rows with a non-negative fare, then drop exact duplicate rows.
# Chaining the two steps rebinds df to a fresh frame instead of mutating a
# filtered slice in place, which can trigger SettingWithCopyWarning.
df = df[df["Fare"] >= 0].drop_duplicates()

In [None]:
# Step 4: Feature Engineering
# 1.Title Extraction from Name

# The title is the text between the comma and the first period in Name.
df["Title"] = df["Name"].str.extract(r",\s*([^\.]+)\.")
# Collapse uncommon titles into "Rare". "the Countess" is listed next to
# "Countess" because the pattern above keeps the leading article, which
# otherwise leaves a separate "the Countess" category behind.
df["Title"] = df["Title"].replace(["Lady","Countess","the Countess","Capt","Col","Don","Dr","Major","Rev","Sir","Jonkheer","Dona"], "Rare")


In [None]:
# Family Size & IsAlone Flag

# Family size counts the passenger plus siblings/spouses and parents/children.
df["FamilySize"] = 1 + df["SibSp"] + df["Parch"]
# 1 when the family consists of the passenger only, 0 otherwise.
df["IsAlone"] = np.where(df["FamilySize"].eq(1), 1, 0)


In [None]:
# One-Hot Encoding Categorical Features

# Replace each listed column with indicator columns; drop_first omits the
# first level of every encoded feature.
df_final = pd.get_dummies(
    df,
    columns=["Sex", "Embarked", "Title"],
    drop_first=True,
)


In [None]:
def clean_data(df):
    """Return a cleaned copy of the Titanic frame.

    Steps, in order:
      1. Drop PassengerId, Ticket and Cabin when they are present.
      2. Fill missing Age with the column median.
      3. Fill missing Embarked with the most frequent value.
      4. Keep rows whose Fare is non-negative and drop duplicate rows.

    The frame passed in is left unmodified.
    """
    unused_columns = ["PassengerId", "Ticket", "Cabin"]
    # errors="ignore" lets the function run on frames where the columns are already gone.
    cleaned = df.drop(columns=unused_columns, errors="ignore").copy()

    # Direct assignment (no inplace fillna on a column selection).
    cleaned["Age"] = cleaned["Age"].fillna(cleaned["Age"].median())
    cleaned["Embarked"] = cleaned["Embarked"].fillna(cleaned["Embarked"].mode()[0])

    has_valid_fare = cleaned["Fare"] >= 0
    return cleaned[has_valid_fare].drop_duplicates()

def engineer_features(df):
    """Add title, family and binned features, then one-hot encode categoricals.

    Args:
        df: DataFrame containing the Name, SibSp, Parch, Fare, Age, Sex and
            Embarked columns.

    Returns:
        A new DataFrame with Title-derived indicator columns, FamilySize,
        IsAlone, FareBin and AgeBin added and Sex/Embarked/Title replaced by
        indicator columns. The input frame is not modified.
    """
    df = df.copy()
    # The title is the text between the comma and the first period of Name.
    df["Title"] = df["Name"].str.extract(r",\s*([^\.]+)\.")
    # "the Countess" is listed next to "Countess" because the pattern keeps the
    # leading article; without it that title is not grouped into "Rare" and
    # ends up as its own indicator column.
    rare_titles = ["Lady","Countess","the Countess","Capt","Col","Don","Dr",
                   "Major","Rev","Sir","Jonkheer","Dona"]
    df["Title"] = df["Title"].replace(rare_titles, "Rare")
    # Family size (passenger included) and alone flag
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = np.where(df["FamilySize"] == 1, 1, 0)
    # Binning: fare quartiles and fixed age ranges, both as integer codes
    df["FareBin"] = pd.qcut(df["Fare"], 4, labels=False)
    df["AgeBin"]  = pd.cut(df["Age"], bins=[0,12,20,40,60,100], labels=False)
    # One-hot encoding; drop_first omits the first level of each feature
    df = pd.get_dummies(df, columns=["Sex","Embarked","Title"], drop_first=True)
    return df

# Run pipeline
# clean_data returns a cleaned copy of df; engineer_features then adds the
# derived columns and one-hot encodes Sex, Embarked and Title.
df_clean = clean_data(df)
df_prepared = engineer_features(df_clean)
# Last expression of the cell: shows the first five prepared rows.
df_prepared.head()


Unnamed: 0,Survived,Pclass,Name,Age,SibSp,Parch,Fare,FamilySize,IsAlone,FareBin,...,Embarked_Q,Embarked_S,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rare,Title_the Countess
0,0,3,"Braund, Mr. Owen Harris",22.0,1,0,7.25,2,0,0,...,False,True,False,False,False,True,False,False,False,False
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,71.2833,2,0,3,...,False,False,False,False,False,False,True,False,False,False
2,1,3,"Heikkinen, Miss. Laina",26.0,0,0,7.925,1,1,1,...,False,True,True,False,False,False,False,False,False,False
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,53.1,2,0,3,...,False,True,False,False,False,False,True,False,False,False
4,0,3,"Allen, Mr. William Henry",35.0,0,0,8.05,1,1,1,...,False,True,False,False,False,True,False,False,False,False


In [None]:
# Step 6: Save the Prepared Dataset
# index=False leaves the row index out of the written file.
df_prepared.to_csv("titanic_prepared.csv", index=False)


Housing Prices Pipeline

In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [None]:
# Step 2: Load Dataset
# Read the housing CSV straight from its URL (requires network access).
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
housing = pd.read_csv(url)
# Last expression of the cell: shows the first five rows.
housing.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [None]:
# Check data info
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line.
housing.info()

# Check missing values
print("\nMissing Values:\n", housing.isnull().sum())

# Describe stats for outlier detection
print("\nStatistical Summary:\n", housing.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None

Missing Values:
 longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value  

In [None]:
# Fill missing total_bedrooms with the column median. The result is assigned
# back to the column: fillna(inplace=True) on housing[col] acts on an
# intermediate object, triggers a warning and stops working in pandas 3.0.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(housing["total_bedrooms"].median())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing["total_bedrooms"].fillna(housing["total_bedrooms"].median(), inplace=True)


In [None]:
# Derive three ratio features from the raw counts; each one is computed from
# the original columns only, and they are appended in this order.
housing = housing.assign(
    rooms_per_household=housing["total_rooms"] / housing["households"],
    bedrooms_per_room=housing["total_bedrooms"] / housing["total_rooms"],
    population_per_household=housing["population"] / housing["households"],
)


In [None]:
# One-hot encode ocean_proximity. The membership check makes the cell safe to
# re-run: after the first run the original column has been replaced by its
# indicator columns, and encoding it again would raise a KeyError.
if "ocean_proximity" in housing.columns:
    housing = pd.get_dummies(housing, columns=["ocean_proximity"], drop_first=True)


In [None]:
# Standardize the float64/int64 columns in place of their raw values;
# columns of any other dtype are left untouched.
scaler = StandardScaler()
numeric_cols = housing.select_dtypes(include=["float64", "int64"]).columns
housing[numeric_cols] = scaler.fit_transform(housing[numeric_cols])


In [None]:
def clean_housing_data(df):
    """Return a copy of ``df`` with missing ``total_bedrooms`` filled.

    Missing values are replaced with the median of the column. The frame
    passed in is left unmodified.
    """
    cleaned = df.copy()
    bedrooms = cleaned["total_bedrooms"]
    # Direct assignment (no inplace fillna on a column selection).
    cleaned["total_bedrooms"] = bedrooms.fillna(bedrooms.median())
    return cleaned

def engineer_housing_features(df):
    """Add per-household ratio features and one-hot encode ``ocean_proximity``.

    Args:
        df: DataFrame with total_rooms, total_bedrooms, population and
            households columns; ocean_proximity is optional.

    Returns:
        A new DataFrame with rooms_per_household, bedrooms_per_room and
        population_per_household added. When ocean_proximity is present it is
        replaced by indicator columns (first level dropped). The input frame
        is not modified.
    """
    df = df.copy()
    df["rooms_per_household"] = df["total_rooms"] / df["households"]
    df["bedrooms_per_room"] = df["total_bedrooms"] / df["total_rooms"]
    df["population_per_household"] = df["population"] / df["households"]
    # Encode only when the column exists, so an already-encoded frame does not
    # raise a KeyError. This matches the later definition of this function in
    # the notebook.
    if "ocean_proximity" in df.columns:
        df = pd.get_dummies(df, columns=["ocean_proximity"], drop_first=True)
    return df

def scale_numeric_features(df):
    """Return a copy of ``df`` with its numeric columns standardized.

    Columns of dtype float64, int64 and bool are selected, so indicator
    columns stored as booleans are scaled as well. The selected columns are
    replaced by the output of ``StandardScaler.fit_transform``; the frame
    passed in is left unmodified.
    """
    scaled = df.copy()
    selected = scaled.select_dtypes(include=["float64", "int64", "bool"]).columns
    scaled[selected] = StandardScaler().fit_transform(scaled[selected])
    return scaled

In [None]:
def engineer_housing_features(df):
    """Add ratio features and, when possible, one-hot encode ``ocean_proximity``.

    Works on a copy of ``df``. Adds rooms_per_household, bedrooms_per_room and
    population_per_household. If ocean_proximity is present it is replaced by
    indicator columns with the first level dropped; otherwise the frame is
    returned with the ratio features only.
    """
    out = df.copy()
    households = out["households"]
    rooms = out["total_rooms"]

    out["rooms_per_household"] = rooms / households
    out["bedrooms_per_room"] = out["total_bedrooms"] / rooms
    out["population_per_household"] = out["population"] / households

    # Nothing to encode when the categorical column is absent.
    if "ocean_proximity" not in out.columns:
        return out

    return pd.get_dummies(out, columns=["ocean_proximity"], drop_first=True)


In [None]:
# Step-by-step with fresh data
# Re-read the CSV so the pipeline starts from the raw file rather than from
# the `housing` frame modified by the earlier cells.
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
housing = pd.read_csv(url)

# Each stage takes a frame and returns a new one: impute -> add features -> scale.
df_clean = clean_housing_data(housing)
df_feat = engineer_housing_features(df_clean)
df_final = scale_numeric_features(df_feat)

print("✅ Final shape:", df_final.shape)
# Last expression of the cell: shows the first five rows of the result.
df_final.head()


✅ Final shape: (20640, 16)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,2.129631,0.628559,-1.029988,-0.049597,-0.681889,-0.015566,2.830742,-0.384466
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,1.314156,0.327041,-0.888897,-0.092512,-0.681889,-0.015566,2.830742,-0.384466
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,1.258693,1.15562,-1.291686,-0.025843,-0.681889,-0.015566,2.830742,-0.384466
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,1.1651,0.156966,-0.449613,-0.050329,-0.681889,-0.015566,2.830742,-0.384466
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,1.1729,0.344711,-0.639087,-0.085616,-0.681889,-0.015566,2.830742,-0.384466


Credit Card Transactions

In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the Excel workbook (.xls) from its URL (requires network access)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
df = pd.read_excel(url, skiprows=1)  # Skip the file's first row; the next row supplies the column names
df.columns = df.columns.str.lower().str.replace(" ", "_")  # Lowercase headers, spaces -> underscores

# Last expression of the cell: shows the first five rows.
df.head()

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [None]:
def clean_data(df):
    """Return a cleaned copy of the credit-card frame.

    Steps, in order:
      1. Normalize column names (lowercase, spaces replaced by underscores).
      2. Coerce every ``bill_amt*`` / ``pay_amt*`` column to numeric; values
         that cannot be parsed become NaN.
      3. Clip negative amounts in those columns to 0.
      4. Drop every row that still contains a missing value.

    The frame passed in is left unmodified.
    """
    cleaned = df.copy()

    # Normalize headers here too, in case a raw file is passed in.
    cleaned.columns = cleaned.columns.str.lower().str.replace(" ", "_")

    # Amount columns are identified by prefix; bills first, then payments.
    amount_cols = [name for name in cleaned.columns if name.startswith("bill_amt")]
    amount_cols += [name for name in cleaned.columns if name.startswith("pay_amt")]

    # Coerce to numeric, then replace negatives with 0
    # (business logic; alternative: np.nan + imputation).
    as_numbers = cleaned[amount_cols].apply(pd.to_numeric, errors='coerce')
    cleaned[amount_cols] = as_numbers
    cleaned[amount_cols] = cleaned[amount_cols].clip(lower=0)

    # Remove rows with any remaining nulls.
    cleaned.dropna(inplace=True)
    return cleaned


In [None]:
def engineer_features(df):
    """Derive summary features for the credit-card frame and one-hot encode categoricals.

    Works on a copy of ``df`` and adds:
      * ``avg_bill_amt`` / ``avg_pay_amt``: row-wise means of bill_amt1..6 and
        pay_amt1..6.
      * ``pay_bill_ratio``: average payment divided by average bill, or 0 when
        the average bill is not positive.
      * ``age_group``: age bucketed into left-closed bins
        [0, 20), [20, 30), ..., [60, 100).

    Finally sex, education, marriage and age_group are replaced by indicator
    columns with the first level of each dropped.
    """
    out = df.copy()

    # Six monthly bill / payment columns.
    months = range(1, 7)
    bill_names = [f"bill_amt{m}" for m in months]
    pay_names = [f"pay_amt{m}" for m in months]

    out["avg_bill_amt"] = out[bill_names].mean(axis=1)
    out["avg_pay_amt"] = out[pay_names].mean(axis=1)

    # Ratio is forced to 0 wherever the average bill is zero or negative.
    has_bill = out["avg_bill_amt"] > 0
    ratio = out["avg_pay_amt"] / out["avg_bill_amt"]
    out["pay_bill_ratio"] = np.where(has_bill, ratio, 0)

    # Age decades; right=False makes each bin include its lower edge.
    age_edges = [0, 20, 30, 40, 50, 60, 100]
    age_labels = ["<20", "20s", "30s", "40s", "50s", "60+"]
    out["age_group"] = pd.cut(out["age"], bins=age_edges, labels=age_labels, right=False)

    categorical = ["sex", "education", "marriage", "age_group"]
    return pd.get_dummies(out, columns=categorical, drop_first=True)


In [None]:
import pandas as pd

# URL of the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"

# Read the Excel file, skipping its first row so the next row supplies the
# column names. No encoding argument is passed.
df_raw = pd.read_excel(url, skiprows=1)

# Clean column names: lowercase, spaces -> underscores, then collapse double underscores
df_raw.columns = df_raw.columns.str.lower().str.replace(" ", "_").str.replace("__", "_")

# Last expression of the cell: shows the first five rows.
df_raw.head()

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
