In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output

# --- 1. Load the Dataset ---
# The assignment's 'income_evaluation.csv' is the dataset commonly known as the
# Adult Income dataset. The raw file is read without a header row, so the
# column names are supplied explicitly through `names=`.
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'gender',
                'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
try:
    data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    # The regex separator absorbs the whitespace around each comma (this needs
    # the python parsing engine); "?" entries are read as missing values.
    df = pd.read_csv(data_url, names=column_names, sep=r'\s*,\s*', engine='python', na_values="?")
except Exception as exc:
    # Deliberate best-effort fallback so the rest of the script can still run
    # offline. The cause is reported so a silent switch to six toy rows is
    # never mistaken for a real result.
    print("WARNING: Could not load full dataset. Using synthetic data for demonstration.")
    print(f"Reason: {type(exc).__name__}: {exc}")
    data = {
        'age': [39, 50, 38, 53, 28, 37],
        'workclass': ['State-gov', 'Self-emp-not-inc', 'Private', 'Private', 'Private', 'Private'],
        'education': ['Bachelors', 'Bachelors', 'HS-grad', '11th', 'Bachelors', 'Masters'],
        'hours-per-week': [40, 13, 40, 40, 40, 60],
        'income': ['<=50K', '<=50K', '<=50K', '<=50K', '>50K', '>50K']
    }
    df = pd.DataFrame(data)

# Remove leading/trailing spaces from categorical columns
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip()


# --- 2. Exploratory Data Analysis (EDA) and Preprocessing ---

print("--- Data Structure and First Rows ---")
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("-" * 50)

# Check Target Distribution
print("\n--- Target Variable Distribution (income) ---")
# Relative frequency of each income class (binary target: >50K vs <=50K).
print(df['income'].value_counts(normalize=True))

# Handle Missing Values: drop rows with any missing value for simplicity.
# In a real scenario, imputation (e.g., mode/median) is usually preferable.
df.dropna(inplace=True)

# One-hot encode every non-numeric column (e.g. 'workclass', 'gender').
# drop_first=True removes one level per column, so the two-valued 'income'
# column collapses into the single indicator column 'income_>50K'.
df = pd.get_dummies(df, drop_first=True)

# Separate Features (X) and Target (y)
# After encoding, the raw 'income' column no longer exists, so the indicator
# column is the only valid target. Fail with a clear message if it is absent
# (for example, when the data contains only one income class).
target_column = 'income_>50K'
if target_column not in df.columns:
    raise ValueError(
        f"Expected encoded target column {target_column!r} after one-hot encoding; "
        "the 'income' column must contain both '<=50K' and '>50K'."
    )
X = df.drop(target_column, axis=1)
y = df[target_column]

# --- 3. Divide Dataset into Training and Testing Sets ---
# Hold out 30% of the rows so the model is evaluated on data it was not fit on.
# The target is imbalanced, so the split is stratified on y: both partitions
# keep the same class proportions, and the test set cannot end up containing a
# single class (which would break the two-class report printed later).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,      # 30% of data for testing
    random_state=42,    # Ensures a fixed, reproducible split
    stratify=y          # Preserve the income class ratio in both partitions
)

print("\n--- Data Split Information ---")
print(f"Training Samples: {len(X_train)}")
print(f"Testing Samples: {len(X_test)}")
print("-" * 50)

# --- 4. Standardize Features (optional for tree-based models) ---
# Tree ensembles split on thresholds and are largely insensitive to feature
# scale, so this step is not required for gradient boosting; it is kept so the
# downstream code keeps receiving the same *_scaled frames.
# The scaler is fit on the training partition only and then applied to the
# test partition, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Rebuild DataFrames with the original column names AND row index so the scaled
# features stay label-aligned with y_train / y_test.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)


# --- 5. Implement and Train the Gradient Boosting Classifier (GBC) ---
print("--- Training Gradient Boosting Classifier ---")
# Gradient boosting adds trees one after another, each new tree fitted to
# reduce the errors left by the ensemble built so far.
#   random_state  - fixes the model's internal randomness for reproducibility.
#   max_depth     - caps the depth of each individual tree.
#   learning_rate - shrinks each tree's contribution; lower values need more trees.
#   n_estimators  - number of boosting stages (trees).
# fit() returns the estimator itself, so construction and training are chained.
gb_classifier = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
).fit(X_train_scaled, y_train)

# --- 6. Predict and Evaluate Model Performance ---
# Label the held-out test rows with the trained classifier.
y_pred = gb_classifier.predict(X_test_scaled)

# Accuracy: share of test rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("\n--- Model Evaluation Results ---")
print(f"Accuracy Score: {accuracy:.4f} ({accuracy * 100:.2f}%)")

# Per-class precision, recall and F1-score, labelled with the income brackets.
income_labels = ['<=50K', '>50K']
report_text = classification_report(y_test, y_pred, target_names=income_labels)
print("\nClassification Report:")
print(report_text)
print("-" * 50)

--- Data Structure and First Rows ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   gender          32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None

First 5 rows:
   age         workclass  fnlwgt  e