# **Step 1: Initial Setup and Data Loading**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the UCI "Adult" table straight from its URL. The file is parsed with
# header=None, so the column labels are supplied explicitly, and the " ?"
# token is treated as a missing value (NaN).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
data = pd.read_csv(
    url,
    header=None,
    names=columns,
    na_values=" ?",
)

# Preview the first five rows
data.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# **Step 2: Data Preprocessing**



In [2]:
# Separate the 'income' label from the predictor columns
y = data['income']
X = data.drop(columns='income')

# Column groups; each group is routed to its own preprocessing branch below
numeric_features = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]
categorical_features = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Numeric branch: fill gaps with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode (categories unseen at fit time are ignored at transform time)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both branches into a single column-wise transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# NOTE(review): the preprocessor is fitted on every row here, i.e. before the
# train/test split performed later in the notebook, so the imputation and
# scaling statistics also reflect rows that end up in the test set.
X = preprocessor.fit_transform(X)


# **Apply Unsupervised Learning Techniques and a Validation Technique**

In [8]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Dimensionality reduction with PCA.
# The ColumnTransformer output can be either a SciPy sparse matrix or a dense
# array (it depends on the density of the stacked result), while PCA needs a
# dense input. Densify only when the object actually offers .toarray(), so the
# cell does not crash with AttributeError on a dense result.
X_dense = X.toarray() if hasattr(X, 'toarray') else np.asarray(X)

# random_state pins the projection in case a randomized SVD solver is chosen.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_dense)

# K-Means clustering.
# n_init is spelled out because its default differs between scikit-learn
# releases; fixing it keeps the clustering (and the score below) comparable
# across environments and avoids the deprecation warning on older versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# Silhouette score: mean cohesion/separation of the clusters in PCA space
silhouette_avg = silhouette_score(X_pca, clusters)
print(f'Silhouette Score: {silhouette_avg}')


Silhouette Score: 0.3548837823033352


# **Feature Selection**

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Use a Random Forest's feature importances for feature selection.
# SelectFromModel (prefit defaults to False) clones the estimator and fits the
# clone inside fit_transform, so fitting `clf` separately beforehand would only
# train the same forest a second time. The forest that was actually used for
# the selection is available afterwards as `sfm.estimator_`.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
sfm = SelectFromModel(clf, threshold=0.01)

# Keep only the columns whose importance reaches the 0.01 threshold
X_selected = sfm.fit_transform(X, y)

# Check selected features
print(f'Selected Features Shape: {X_selected.shape}')


Selected Features Shape: (32561, 14)


# **Encode Target Variable (y)**

In [10]:
from sklearn.preprocessing import LabelEncoder

# Convert the 'income' labels into integer class codes
le = LabelEncoder()
le.fit(y)
y = le.transform(y)  # ' <=50K' becomes 0 and ' >50K' becomes 1

# Sanity check: list the distinct codes now present in the target
print(f"Unique values in 'y' after encoding: {np.unique(y)}")


Unique values in 'y' after encoding: [0 1]


# **Applying Five Supervised Learning Classifiers (Including Boosting)**
## Evaluating Classifier Performance with Metrics
(Accuracy, Precision, Recall, F1-Score, and Confusion Matrix)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split data (use transformed data if feature selection is applied)
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

# Initialize classifiers.
# Estimators that accept a seed are pinned to random_state=42, matching the
# split above, so the reported metrics are reproducible from run to run
# (an unseeded decision tree can change between executions).
# max_iter is raised for logistic regression so the solver is not cut off at
# the default iteration cap before it converges.
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(random_state=42)
}

# Train and evaluate each classifier on the same train/test split
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Print results for each classifier
    print(f'---{name}---')
    print(f'Accuracy: {accuracy_score(y_test, y_pred):.4f}')
    print(f'Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}')
    print(f'Classification Report:\n{classification_report(y_test, y_pred)}')
    print('-' * 50)


---Logistic Regression---
Accuracy: 0.8462
Confusion Matrix:
[[6960  495]
 [1007 1307]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90      7455
           1       0.73      0.56      0.64      2314

    accuracy                           0.85      9769
   macro avg       0.80      0.75      0.77      9769
weighted avg       0.84      0.85      0.84      9769

--------------------------------------------------
---Decision Tree---
Accuracy: 0.8127
Confusion Matrix:
[[6482  973]
 [ 857 1457]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      7455
           1       0.60      0.63      0.61      2314

    accuracy                           0.81      9769
   macro avg       0.74      0.75      0.75      9769
weighted avg       0.82      0.81      0.81      9769

--------------------------------------------------
---SVM---
Accuracy: 0.8519
C

# **Part (vi) – Manual Feature Extraction and Achieving High Accuracy**

**Load and Inspect the Dataset**

In [12]:
from sklearn.datasets import load_iris
import pandas as pd

# Fetch the bundled Iris data.
# NOTE: this rebinds `data`, which earlier in the notebook held the Adult
# DataFrame.
data = load_iris()

# Integer class code -> species name
species_names = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

# One row per flower: the measurement columns plus a readable species label
df = pd.DataFrame(data.data, columns=data.feature_names)
df['species'] = pd.Series(data.target).map(species_names)

# Display first few rows
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


**Manual Feature Extraction**

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Hand-picked feature pair used for the "manual" experiment
manual_columns = ['sepal length (cm)', 'petal length (cm)']
X = df[manual_columns]
y = df['species']

# Hold out 30% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize using statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Linear-kernel Support Vector Machine
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Score the held-out predictions
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 100.00%


# **Part (vii) – Automated Feature Extraction**

**Principal Component Analysis (PCA)**

In [14]:
from sklearn.decomposition import PCA

# Apply PCA to reduce to 2 components.
# PCA is meant to compress the full measurement set, so feed it all four Iris
# columns. Reusing `X` here would pass only the two hand-picked columns from
# the manual-selection cell, and a 2-component PCA of 2 columns merely centers
# and rotates them instead of extracting anything new.
X_all = df.drop(columns='species')

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_all)

# Train-Test Split for PCA-transformed data
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

# Apply classifier (Support Vector Machine)
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy after PCA
accuracy_pca = accuracy_score(y_test, y_pred)
print(f'Accuracy after PCA: {accuracy_pca * 100:.2f}%')


Accuracy after PCA: 100.00%


**Recursive Feature Elimination (RFE)**

In [15]:
from sklearn.feature_selection import RFE

# Apply RFE (Recursive Feature Elimination).
# RFE has to start from all four Iris measurements to have anything to
# eliminate. Reusing `X` here would pass only the two hand-picked columns from
# the manual-selection cell, and keeping 2 of 2 features removes nothing.
X_all = df.drop(columns='species')

rfe = RFE(estimator=SVC(kernel='linear'), n_features_to_select=2)
X_rfe = rfe.fit_transform(X_all, y)

# Train-Test Split for RFE-transformed data
X_train, X_test, y_train, y_test = train_test_split(X_rfe, y, test_size=0.3, random_state=42)

# Apply classifier (Support Vector Machine)
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy after RFE
accuracy_rfe = accuracy_score(y_test, y_pred)
print(f'Accuracy after RFE: {accuracy_rfe * 100:.2f}%')


Accuracy after RFE: 100.00%
