<a href="https://colab.research.google.com/github/borisk8180-glitch/Bootcamp25/blob/main/Di-bootcamp/week3/day2/XP/XPw3d3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary library
import pandas as pd

# STEP 1: Load the Titanic dataset
# Colab-only: the upload helper comes from google.colab, so this cell needs a Colab runtime.
from google.colab import files
files.upload()
# NOTE(review): the return value of files.upload() is not used; this read assumes
# the uploaded file is named exactly "train.csv".
df = pd.read_csv("train.csv")

In [None]:
# STEP 2: Report the row count of the raw frame before de-duplication
rows_before = df.shape[0]
print("Number of rows before duplicate removal:", rows_before)

In [None]:
# STEP 3: Flag duplicate rows
# `duplicates` is a boolean Series; True marks a row that pandas considers a duplicate
duplicates = df.duplicated()

# Summing the mask counts the True entries
n_duplicates = duplicates.sum()
print("Number of duplicate rows found:", n_duplicates)

In [None]:
# STEP 4: Remove duplicate rows
# Keep only the rows that are not flagged as duplicates
is_duplicate = df.duplicated()
df_cleaned = df.loc[~is_duplicate]

In [None]:
# STEP 5: Confirm the row count after de-duplication
rows_after = df_cleaned.shape[0]
print("Number of rows after duplicate removal:", rows_after)

# Optional: write the de-duplicated frame to disk (row index is not written)
cleaned_path = "titanic_cleaned.csv"
df_cleaned.to_csv(cleaned_path, index=False)

print("Duplicate removal complete. Cleaned dataset saved as 'titanic_cleaned.csv'.")


In [None]:
from sklearn.impute import SimpleImputer

# # STEP 1: Load the Titanic dataset
# df = pd.read_csv("train.csv")

# STEP 2: Count missing values in every column
missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)



In [None]:
# -------------------------------------------------------------------------
# STRATEGIES FOR HANDLING MISSING DATA
# -------------------------------------------------------------------------

# STRATEGY A: Removal
# Keep only the rows whose 'Embarked' value is present
has_embarked = df['Embarked'].notna()
df_removed = df[has_embarked]
print("\nAfter removing rows with missing 'Embarked':", len(df_removed))

In [None]:
# STRATEGY B: Constant fill
# Build a new frame in which every missing 'Cabin' is replaced by the string 'Unknown'
df_filled_constant = df.assign(Cabin=df['Cabin'].fillna("Unknown"))
cabin_missing = df_filled_constant['Cabin'].isnull().sum()
print("\nNumber of missing 'Cabin' after fillna:", cabin_missing)

In [None]:
# STRATEGY C: Statistical fill (imputation)
# Replace missing 'Age' values with the median of the observed ages
age_median = df['Age'].median()
df_filled_stat = df.copy()
df_filled_stat['Age'] = df_filled_stat['Age'].fillna(age_median)
age_missing = df_filled_stat['Age'].isnull().sum()
print("\nNumber of missing 'Age' after median imputation:", age_missing)

In [None]:
# STRATEGY D: scikit-learn SimpleImputer
# Replace missing 'Fare' values with the column mean
df_imputed = df.copy()
imputer = SimpleImputer(strategy='mean')
# fit_transform takes a 2-D input, hence the double brackets; flatten the single
# output column before storing it back
fare_imputed = imputer.fit_transform(df_imputed[['Fare']])
df_imputed['Fare'] = fare_imputed.ravel()
fare_missing = df_imputed['Fare'].isnull().sum()
print("\nNumber of missing 'Fare' after SimpleImputer:", fare_missing)

In [None]:
# -------------------------------------------------------------------------
# OPTIONAL: Save processed datasets for comparison
# -------------------------------------------------------------------------
# Output filename -> frame produced by the matching strategy above
comparison_outputs = {
    "titanic_removed.csv": df_removed,
    "titanic_filled_constant.csv": df_filled_constant,
    "titanic_filled_stat.csv": df_filled_stat,
    "titanic_imputed.csv": df_imputed,
}
for out_name, frame in comparison_outputs.items():
    frame.to_csv(out_name, index=False)

print("\nCleaned datasets saved for comparison.")

In [None]:
# -------------------------------------------------------------------------
# FEATURE ENGINEERING
# -------------------------------------------------------------------------

# STEP 2: Family size
# Siblings/spouses aboard (SibSp) + parents/children aboard (Parch) + the passenger
relatives_aboard = df['SibSp'].add(df['Parch'])
df['FamilySize'] = relatives_aboard.add(1)

In [None]:
# STEP 3: Extract Title from Name
# Example: "Braund, Mr. Owen Harris" -> "Mr"
# The title is the text between the first ',' and the following '.'.
# An optional leading "the " is skipped so that "Rothes, the Countess. of ..."
# yields "Countess" instead of "the Countess"; otherwise the 'Countess' entry in
# the rare-title list below would never match.
# expand=False makes str.extract return a Series for the single capture group.
df['Title'] = (
    df['Name']
    .str.extract(r',\s*(?:the\s+)?([^\.]+)\.', expand=False)
    .str.strip()
)

# Optional: Simplify rare titles into common groups
df['Title'] = df['Title'].replace(
    ['Lady', 'Countess','Capt', 'Col','Don', 'Dr',
     'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
# Map French / alternative spellings onto their common equivalents
df['Title'] = df['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})

print("\nUnique Titles extracted:", df['Title'].unique())



In [None]:
# -------------------------------------------------------------------------
# ENCODING CATEGORICAL VARIABLES
# -------------------------------------------------------------------------
# LabelEncoder must be imported before first use; without this import the cell
# raises NameError when the notebook is run top to bottom on a fresh kernel.
from sklearn.preprocessing import LabelEncoder

# STEP 4: Encode 'Title' using Label Encoding (good for ordinal or few categories)
label_encoder = LabelEncoder()
df['Title_Label'] = label_encoder.fit_transform(df['Title'])

# STEP 5: Encode 'Sex' using Label Encoding (binary category: male/female)
# Note: this refits the same encoder, so afterwards label_encoder.classes_
# describes 'Sex' only.
df['Sex_Label'] = label_encoder.fit_transform(df['Sex'])

# STEP 6: One-Hot Encode 'Embarked' (multiple unordered categories)
# get_dummies replaces 'Embarked' with the Embarked_* columns; the guard keeps the
# cell re-runnable once the original column has been consumed.
if 'Embarked' in df.columns:
    df = pd.get_dummies(df, columns=['Embarked'], prefix='Embarked')

In [None]:
# -------------------------------------------------------------------------
# CHECK RESULTS
# -------------------------------------------------------------------------
print("\nNew features added: FamilySize, Title, Title_Label, Sex_Label")
print("One-hot encoded columns for Embarked created.")
print("\nPreview of dataset with new features:")
# Show the source columns next to the features derived from them
preview_cols = ['Name', 'FamilySize', 'Title', 'Title_Label', 'Sex', 'Sex_Label']
feature_preview = df[preview_cols].head()
print(feature_preview)

In [None]:
# -------------------------------------------------------------------------
# OPTIONAL: Save updated dataset
# -------------------------------------------------------------------------
# Write the frame with the engineered columns to disk (row index is not written)
feature_engineered_path = "titanic_feature_engineered.csv"
df.to_csv(feature_engineered_path, index=False)
print("\nFeature-engineered dataset saved as 'titanic_feature_engineered.csv'.")

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Choose numeric columns for outlier analysis
# The plotting, detection and capping loops in the cells below iterate over this list.
numeric_cols = ['Age', 'Fare']

In [None]:
# -------------------------------------------------------------------------
# STEP 2: Visualize distributions with boxplots and histograms
# -------------------------------------------------------------------------
# One figure per column: boxplot in the left panel, histogram with KDE in the right
for col in numeric_cols:
    fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize=(10,4))

    sns.boxplot(y=df[col], ax=ax_box)
    ax_box.set_title(f'Boxplot of {col}')

    sns.histplot(df[col], bins=30, kde=True, ax=ax_hist)
    ax_hist.set_title(f'Histogram of {col}')

    fig.tight_layout()
    plt.show()

In [None]:
# -------------------------------------------------------------------------
# STEP 3: Detect outliers using IQR and Z-score methods
# -------------------------------------------------------------------------

def detect_outliers_iqr(data, column):
    """Return the rows of `data` whose `column` value lies outside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, computed from the 25th and
    75th percentiles of `data[column]`. Rows whose value is missing compare False
    against both fences and are therefore not returned.
    """
    values = data[column]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    is_outlier = values.lt(low_fence) | values.gt(high_fence)
    return data[is_outlier]

def detect_outliers_zscore(data, column, threshold=3):
    """Return the rows of `data` whose `column` z-score exceeds `threshold` in magnitude.

    Missing values are dropped before the z-scores are computed, and the surviving
    index labels are used to select the flagged rows from `data`.
    """
    observed = data[column].dropna()
    abs_z = np.abs(stats.zscore(observed))
    flagged_labels = observed.index[abs_z > threshold]
    return data.loc[flagged_labels]

# Report how many rows each detection method flags, per numeric column
for col in numeric_cols:
    print(f"\n--- {col} ---")
    outliers_iqr = detect_outliers_iqr(df, col)
    outliers_zscore = detect_outliers_zscore(df, col)
    n_iqr, n_zscore = len(outliers_iqr), len(outliers_zscore)
    print("IQR Outliers:", n_iqr)
    print("Z-score Outliers:", n_zscore)

In [None]:
# -------------------------------------------------------------------------
# STEP 4: Handle outliers
# -------------------------------------------------------------------------

# (a) Quantile capping: values above the column's 0.98 quantile are set to it
df_capped = df.copy()
for col in numeric_cols:
    upper_cap = df_capped[col].quantile(0.98)
    df_capped[col] = df_capped[col].clip(upper=upper_cap)

# (b) Log transformation of the skewed 'Fare' column
# log1p computes log(1 + x), so a fare of zero maps to zero
df_log = df.assign(Fare=np.log1p(df['Fare']))

# (c) Row removal: drop the rows flagged as 'Age' outliers by the IQR rule
age_outliers = detect_outliers_iqr(df, 'Age').index
df_removed = df.drop(index=age_outliers)


In [None]:

# -------------------------------------------------------------------------
# STEP 5: Compare datasets before and after treatment
# -------------------------------------------------------------------------
def compare_distributions(original, modified, column, title_suffix):
    """Overlay histograms of `column` from two DataFrames in a single figure.

    `original` is drawn in blue with the legend label 'Original'; `modified` is
    drawn in red and labelled with `title_suffix`, which is also used in the
    figure title. The figure is shown immediately; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(12,5))

    # (frame, colour, legend label) for each layer, drawn in this order
    layers = (
        (original, 'blue', 'Original'),
        (modified, 'red', title_suffix),
    )
    for frame, colour, legend_label in layers:
        sns.histplot(frame[column], bins=30, color=colour,
                     label=legend_label, alpha=0.5, ax=ax)

    ax.legend()
    ax.set_title(f"{column} Distribution: Original vs {title_suffix}")
    plt.show()

# Compare 'Fare' and 'Age' against each treated version of the data
treatments = [
    (df_capped, 'Fare', 'Capped at 0.98 Quantile'),
    (df_log, 'Fare', 'Log Transformed'),
    (df_removed, 'Age', 'Outliers Removed'),
]
for treated_frame, treated_col, treatment_name in treatments:
    compare_distributions(df, treated_frame, treated_col, treatment_name)


In [None]:

# -------------------------------------------------------------------------
# STEP 6: Explore quantile thresholds
# -------------------------------------------------------------------------
print("\nExploring quantile thresholds for Fare:")
# Print the 'Fare' value at each candidate capping threshold
for q in (0.98, 0.99):
    print(f"{q} quantile:", df['Fare'].quantile(q))

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# STEP 1: Load the feature-engineered Titanic dataset.
# "titanic_cleaned.csv" was written right after duplicate removal, before the
# 'FamilySize' and 'Title' columns were created, so loading it makes the scaling
# cells below fail with KeyError. "titanic_feature_engineered.csv" is the file
# saved after those columns were added.
df = pd.read_csv("titanic_feature_engineered.csv")  # <-- replace with your treated dataset



In [None]:
# STEP 2: Select numerical features for scaling
# Every column named here must exist in df; the plotting loop in the next cell
# indexes df by each name.
numeric_cols = ['Age', 'Fare', 'FamilySize']  # Example set; add more if needed

In [None]:
# -------------------------------------------------------------------------
# STEP 3: Visualize distributions before scaling
# -------------------------------------------------------------------------
# One histogram (with KDE overlay) per selected numeric column
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.histplot(df[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col} (Before Scaling)')
    plt.show()

In [None]:
# -------------------------------------------------------------------------
# STEP 4: Apply different scalers
# -------------------------------------------------------------------------

# Initialize scalers
standard_scaler = StandardScaler()   # Good for normally distributed features
minmax_scaler = MinMaxScaler()       # Good for skewed or bounded features

# Example decision:
# - 'Age' is roughly normal → StandardScaler
# - 'Fare' is skewed → MinMaxScaler
# - 'FamilySize' is small bounded integers → MinMaxScaler
df_scaled = df.copy()

# Apply StandardScaler to Age
# The scaler returns an (n, 1) array; flatten it before storing it as a column.
df_scaled['Age_scaled'] = standard_scaler.fit_transform(df[['Age']]).ravel()

# Apply MinMaxScaler to Fare and FamilySize in a single fit.
# MinMaxScaler scales each column independently, so the values match two separate
# fits. Fitting once keeps the parameters for both columns; calling fit_transform
# a second time on the same scaler would discard the 'Fare' parameters.
minmax_cols = ['Fare', 'FamilySize']
minmax_scaled = minmax_scaler.fit_transform(df[minmax_cols])
df_scaled['Fare_scaled'] = minmax_scaled[:, 0]
df_scaled['FamilySize_scaled'] = minmax_scaled[:, 1]

In [None]:
# -------------------------------------------------------------------------
# STEP 5: Visualize distributions after scaling
# -------------------------------------------------------------------------
scaled_cols = ['Age_scaled', 'Fare_scaled', 'FamilySize_scaled']
for col in scaled_cols:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.histplot(df_scaled[col], bins=30, kde=True, color='red', ax=ax)
    ax.set_title(f'Distribution of {col} (After Scaling)')
    plt.show()

# -------------------------------------------------------------------------
# STEP 6: Check results
# -------------------------------------------------------------------------
print("\nPreview of scaled features:")
# Interleave each raw column with its scaled counterpart for side-by-side reading
preview_cols = [
    name
    for raw in ('Age', 'Fare', 'FamilySize')
    for name in (raw, f'{raw}_scaled')
]
print(df_scaled[preview_cols].head())

# Optional: write the frame including the *_scaled columns (row index is not written)
scaled_path = "titanic_scaled.csv"
df_scaled.to_csv(scaled_path, index=False)
print("\nScaled dataset saved as 'titanic_scaled.csv'.")

In [None]:
# Import necessary libraries
from sklearn.preprocessing import LabelEncoder

# -------------------------------------------------------------------------
# STEP 2: Identify categorical columns
# -------------------------------------------------------------------------
# Columns stored with the 'object' dtype are treated as categorical here
object_frame = df.select_dtypes(include=['object'])
categorical_cols = list(object_frame.columns)
print("Categorical columns found:", categorical_cols)

# The list printed above depends on which file was loaded into df,
# e.g. it may contain 'Sex', 'Embarked', 'Title', 'Cabin'.

In [None]:
# -------------------------------------------------------------------------
# STEP 3: Decide encoding strategy
# -------------------------------------------------------------------------
# - Nominal variables (unordered categories) → One-Hot Encoding
#   Examples: Sex, Embarked, Title
# - Ordinal variables (ordered categories, if any) → Label Encoding
#   Example: a column like 'Education' = Primary < Secondary < Tertiary

# For this Titanic dataset, we assume all are nominal (no natural order).

# -------------------------------------------------------------------------
# STEP 4: One-Hot Encoding for nominal variables
# -------------------------------------------------------------------------
# get_dummies() raises KeyError if any requested column is missing, and which of
# these columns exist depends on the file loaded into df ('Title' is an engineered
# column; 'Embarked' disappears once it has been one-hot encoded). Encode only the
# columns that are present and report the rest instead of failing.
nominal_candidates = ['Sex', 'Embarked', 'Title']
nominal_cols = [col for col in nominal_candidates if col in df.columns]
skipped_cols = [col for col in nominal_candidates if col not in df.columns]
if skipped_cols:
    print("Columns not present in df, skipped for one-hot encoding:", skipped_cols)

# Use get_dummies() to create dummy variables
df_encoded = pd.get_dummies(df, columns=nominal_cols, drop_first=True)

# drop_first=True removes one category to avoid multicollinearity
# Example: for 'Sex', only 'Sex_male' remains (0=female, 1=male)



In [None]:
# -------------------------------------------------------------------------
# STEP 5: Label Encoding (if there were ordinal features)
# -------------------------------------------------------------------------
# Placeholder: runs only when df actually has a 'ClassLevel' column
ordinal_col = 'ClassLevel'
if ordinal_col in df.columns:
    label_encoder = LabelEncoder()
    df_encoded[ordinal_col] = label_encoder.fit_transform(df[ordinal_col])



In [None]:
# -------------------------------------------------------------------------
# STEP 6: Verify encoding
# -------------------------------------------------------------------------
print("\nPreview of encoded dataset:")
print(df_encoded.head())

print("\nEncoded columns now include:")
# Keep every column whose name contains one of the dummy-column prefixes
dummy_markers = ['Sex_', 'Embarked_', 'Title_']
encoded_names = []
for col in df_encoded.columns:
    if any(marker in col for marker in dummy_markers):
        encoded_names.append(col)
print(encoded_names)



In [None]:
# -------------------------------------------------------------------------
# STEP 7: Save final dataset
# -------------------------------------------------------------------------
# Write the encoded frame to disk (row index is not written)
encoded_path = "titanic_encoded.csv"
df_encoded.to_csv(encoded_path, index=False)
print("\nFinal encoded dataset saved as 'titanic_encoded.csv'.")


In [None]:
# Import necessary libraries
import pandas as pd

# STEP 1: Load the de-duplicated dataset written earlier in this notebook
df = pd.read_csv("titanic_cleaned.csv")  # replace with your processed dataset

# -------------------------------------------------------------------------
# STEP 2: Create Age Groups using pd.cut()
# -------------------------------------------------------------------------
# Bin edges and the label attached to each consecutive pair of edges
age_bins = [0, 12, 18, 60, 100]               # child, teen, adult, senior
age_labels = ['Child', 'Teen', 'Adult', 'Senior']

# right=False makes each interval closed on the left and open on the right,
# so 0 ≤ age < 12 is Child, 12 ≤ age < 18 is Teen, and so on
age_group = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)
df['AgeGroup'] = age_group

print("\nUnique age groups created:", df['AgeGroup'].unique())

# -------------------------------------------------------------------------
# STEP 3: One-Hot Encode AgeGroup
# -------------------------------------------------------------------------
# Replaces 'AgeGroup' with one indicator column per group:
# AgeGroup_Child, AgeGroup_Teen, AgeGroup_Adult, AgeGroup_Senior
df_encoded = pd.get_dummies(df, columns=['AgeGroup'], prefix='AgeGroup')

# -------------------------------------------------------------------------
# STEP 4: Verify new columns
# -------------------------------------------------------------------------
print("\nPreview of dataset with AgeGroup encoding:")
agegroup_preview_cols = [
    'Age',
    'AgeGroup_Child',
    'AgeGroup_Teen',
    'AgeGroup_Adult',
    'AgeGroup_Senior',
]
print(df_encoded[agegroup_preview_cols].head(10))

# -------------------------------------------------------------------------
# STEP 5: Save updated dataset
# -------------------------------------------------------------------------
agegroup_path = "titanic_agegroup_encoded.csv"
df_encoded.to_csv(agegroup_path, index=False)
print("\nDataset with AgeGroup feature saved as 'titanic_agegroup_encoded.csv'.")
