<a href="https://colab.research.google.com/github/bhaskarba82/dice/blob/master/ML_Classification_Assignment_Adult_Income.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Classification Assignment
## Dataset: Adult Income (UCI)

This notebook implements 6 classification models on the Adult Income dataset and evaluates them using:
- Accuracy
- AUC Score
- Precision
- Recall
- F1 Score
- Matthews Correlation Coefficient (MCC)


In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


## Load Dataset

In [3]:
# Load dataset (Ensure adult.csv is in the same directory)

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

columns = [
    "age","workclass","fnlwgt","education","education-num",
    "marital-status","occupation","relationship","race",
    "sex","capital-gain","capital-loss","hours-per-week","native-country","income"
]

data = pd.read_csv(url, names=columns, na_values=" ?")
data.head()

#data = pd.read_csv('adult.csv')
#data.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Data Preprocessing

In [4]:
# Handle missing values
data = data.replace('?', np.nan)
data = data.dropna()

# Encode target variable
le = LabelEncoder()
data['income'] = le.fit_transform(data['income'])

# One-hot encoding
data = pd.get_dummies(data, drop_first=True)

X = data.drop('income', axis=1)
y = data['income']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


## Model Training and Evaluation

In [5]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_prob),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'MCC': matthews_corrcoef(y_test, y_pred)
    }

results_df = pd.DataFrame(results).T
results_df


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1 Score,MCC
Logistic Regression,0.852975,0.907738,0.756994,0.618954,0.68105,0.591786
Decision Tree,0.812531,0.757259,0.626667,0.645098,0.635749,0.509668
KNN,0.813194,0.831738,0.653465,0.560784,0.603588,0.484737
Naive Bayes,0.450191,0.69494,0.310899,0.960131,0.469704,0.250291
Random Forest,0.85082,0.902255,0.736486,0.641176,0.685535,0.590791
XGBoost,0.870214,0.927999,0.781038,0.678431,0.726128,0.644366


## Conclusion
Compare all six models based on the evaluation metrics above.
Typically, Random Forest and XGBoost perform better due to ensemble learning techniques.
