<a href="https://colab.research.google.com/github/a-mufasa/CSCE-4143-Practice-Project/blob/ahmed-2/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSCE 4143 Practice Project

## Download the Data

#### Get the data into a Python object for both parts and inspect the data for each column's data type and values

In [134]:
import pandas as pd
import numpy as np

# Create Train & Test DataFrames from Adult Dataset files
data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' 
test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

# Part 1 DataFrames
train_df1 = pd.read_csv(data, header=None, na_values=" ?")
test_df1 = pd.read_csv(test, skiprows=1, na_values=" ?")

# Add columns to Train DataFrames using adult.names column descriptions
columns = [
  'age',
  'workclass',
  'fnlwgt',
  'education',
  'education-num',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'capital-gain',
  'capital-loss',
  'hours-per-week',
  'native-country',
  'income',
 ]

train_df1.columns = columns
test_df1.columns = columns

# Part 2-4 DataFrames
train_df2 = train_df1
test_df2 = test_df1

# Print datatypes & training DataFrame
print(train_df1.dtypes, end='\n----------------------------')
train_df1

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object
----------------------------

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Function for metrics

In [135]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def metrics(y_test, predictions):
  cm = confusion_matrix(y_test, predictions)
  FP = cm.sum(axis=0) - np.diag(cm)  
  FN = cm.sum(axis=1) - np.diag(cm)
  TP = np.diag(cm)
  TN = cm.sum() - (FP + FN + TP)

  TPR = dict(zip(['<=50K', '>50k'], (TP/(TP+FN))))
  FPR = dict(zip(['<=50K', '>50k'], (FP/(FP+TN))))

  print(f"Accuracy = {accuracy_score(y_test, predictions)}\n")
  print(f"TP rate = {TPR}\n")
  print(f"FP rate = {FPR}\n")
  print(classification_report(y_test, predictions))


## 1)  Decision Tree and Bayesian Classifier

### Data Preparation

#### Remove records with unknown ( ?) values and continuous attributes from both train and test data sets

In [136]:
# Drop rows with null values
train_df1.dropna(inplace = True)
train_df1.dropna(inplace = True)

# Drop continuous attributes
continuous_attributes = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

train_df1 = train_df1.drop(continuous_attributes, axis = 1)
test_df1 = test_df1.drop(continuous_attributes, axis = 1)

# Fix income column of test_df (remove unnecessary '.')
test_df1['income'] = test_df1['income'].str.replace(".","", regex=False)

# Print training DataFrame
train_df1

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


#### One-hot encode columns

In [137]:
# Function to replace encodable features with their encoded column
def oneHotEncode(df, feature):
  dummies = pd.get_dummies(df[feature])
  new_df = pd.concat([df, dummies], axis=1).drop(feature, axis=1)
  return(new_df)

# Columns to be one-hot encoded (multi-domain categorical attribute)
encodable_columns = [
  'workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native-country',
]

train_df1 = oneHotEncode(train_df1, encodable_columns)
test_df1 = oneHotEncode(test_df1, encodable_columns)

#### Split DataFrames into features & target

In [138]:
# Inner join to only include columns that exist in both train & test (removes 1 from train)
train_df1, test_df1 = train_df1.align(test_df1, join='inner', axis=1)

# X and Y columns for later evaluation
target = 'income'

X_train = train_df1.loc[:, train_df1.columns != target]
Y_train = train_df1[target]

X_test = test_df1.loc[:, test_df1.columns != target]
Y_test = test_df1[target]

### Decision Tree Classifier (Single Tree)

In [139]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
predictions = dt.predict(X_test)

metrics(Y_test, predictions)

Accuracy = 0.8172604422604423

TP rate = {'<=50K': 0.8976998552356442, '>50k': 0.5572022880915236}

FP rate = {'<=50K': 0.44279771190847633, '>50k': 0.1023001447643558}

              precision    recall  f1-score   support

       <=50K       0.87      0.90      0.88     12434
        >50K       0.63      0.56      0.59      3846

    accuracy                           0.82     16280
   macro avg       0.75      0.73      0.74     16280
weighted avg       0.81      0.82      0.81     16280



### Naïve Bayesian Classifier 

In [140]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, Y_train)
predictions = nb.predict(X_test)

metrics(Y_test, predictions)

Accuracy = 0.562960687960688

TP rate = {'<=50K': 0.45311243364967024, '>50k': 0.9180967238689548}

FP rate = {'<=50K': 0.08190327613104524, '>50k': 0.5468875663503298}

              precision    recall  f1-score   support

       <=50K       0.95      0.45      0.61     12434
        >50K       0.34      0.92      0.50      3846

    accuracy                           0.56     16280
   macro avg       0.64      0.69      0.56     16280
weighted avg       0.80      0.56      0.59     16280



## 2) K-Means, kNN, SVM, and Neural Network 

### Data Preparation

#### Remove records with unknown ( ?) values

In [141]:
# Drop rows with null values
train_df2.dropna(inplace = True)
test_df2.dropna(inplace = True)

# Fix income column of test_df (remove unnecessary '.')
test_df2['income'] = test_df2['income'].str.replace(".","", regex=False)

# Print training DataFrame
train_df2

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


#### One-hot encode columns

In [142]:
# Function to replace encodable features with their encoded column
def oneHotEncode(df, feature):
  dummies = pd.get_dummies(df[feature])
  new_df = pd.concat([df, dummies], axis=1).drop(feature, axis=1)
  return(new_df)

# Columns to be one-hot encoded (multi-domain categorical attribute)
encodable_columns = [
  'workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native-country',
]

train_df2 = oneHotEncode(train_df2, encodable_columns)
test_df2 = oneHotEncode(test_df2, encodable_columns)

#### Transform numerical attributes to binary using their mean values

In [143]:
# Function to replace numerical features with their binary feature
def numerical_to_binary(df, features):
  for feature in features:
    mean = df[feature].mean()
    df.loc[(df[feature] < mean), feature] = 0
    df.loc[(df[feature] >= mean), feature] = 1

# Columns to be converted to binary
numerical_columns = [
    'age', 
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
numerical_to_binary(train_df2, numerical_columns)
numerical_to_binary(test_df2, numerical_columns)

#### Split DataFrames into features & target

In [144]:
# Inner join to only include columns that exist in both train & test (removes 1 from train)
train_df2, test_df2 = train_df2.align(test_df2, join='inner', axis=1)

# X and Y columns for later evaluation
target = 'income'

X_train = train_df2.loc[:, train_df2.columns != target]
Y_train = train_df2[target]

X_test = test_df2.loc[:, test_df2.columns != target]
Y_test = test_df2[target]

### K-means Clustering Algorithm

In [145]:
from sklearn.cluster import KMeans

# K = 3, algorithm = lloyd
km3 = KMeans(n_clusters=3, random_state=0).fit(X_train, Y_train)
print(f"K=3 centroids: \n{km3.cluster_centers_}\n")

# K = 5, algorithm = lloyd
km5 = KMeans(n_clusters=5, random_state=0).fit(X_train, Y_train)
print(f"K=5 centroids: \n{km5.cluster_centers_}\n")

# K = 10, algorithm = lloyd
km10 = KMeans(n_clusters=10, random_state=0).fit(X_train, Y_train)
print(f"K=10 centroids: \n{km10.cluster_centers_}\n")

K=3 centroids: 
[[ 2.79105431e-01  4.78466454e-01  2.74249201e-01  5.38019169e-02
   3.46325879e-02  2.84984026e-01  2.70926518e-02  4.93290735e-02
   7.88370607e-01  2.41533546e-02  7.33546326e-02  3.71884984e-02
   5.11182109e-04  3.47603834e-02  4.89456869e-02  2.03194888e-02
   5.36741214e-03  1.16293930e-02  1.63578275e-02  1.82747604e-02
   3.11821086e-02  3.46325879e-02  1.52971246e-01  6.90095847e-03
   3.39808307e-01  3.64217252e-02  2.30031949e-03  1.21405751e-02
   2.27987220e-01  2.15335463e-01 -2.04914211e-17  4.98402556e-03
   2.31309904e-02  6.91884984e-01  4.66453674e-02  1.80191693e-02
   7.71884984e-02  7.66773163e-04  1.69329073e-01  8.85623003e-02
   4.69009585e-02  9.61022364e-02  7.01597444e-02  1.21022364e-01
   8.94568690e-04  1.00830671e-01  2.50479233e-02  1.08242812e-01
   2.84984026e-02  6.64536741e-02  1.14908083e-14  5.31246006e-01
   5.54632588e-02  3.17571885e-01  9.57188498e-02  1.57512892e-15
   1.13738019e-02  2.95207668e-02  1.01725240e-01  8.5623003

### KNN


#### Function for KNN fit with variable k

In [159]:
from sklearn.neighbors import KNeighborsClassifier

def knn(k, X_train, Y_train, X_test):
  knn = KNeighborsClassifier(n_neighbors=k)
  knn.fit(X_train, Y_train)
  predictions = knn.predict(X_test)
  return(predictions)

#### K = 3

In [160]:
metrics(Y_test, knn(3, X_train, Y_train, X_test))

Accuracy = 0.805830400424995

TP rate = {'<=50K': 0.897438154767145, '>50k': 0.5245945945945946}

FP rate = {'<=50K': 0.4754054054054054, '>50k': 0.10256184523285501}

              precision    recall  f1-score   support

       <=50K       0.85      0.90      0.87     11359
        >50K       0.62      0.52      0.57      3700

    accuracy                           0.81     15059
   macro avg       0.74      0.71      0.72     15059
weighted avg       0.80      0.81      0.80     15059



#### K = 5

In [161]:
metrics(Y_test, knn(5, X_train, Y_train, X_test))

Accuracy = 0.8221661464904708

TP rate = {'<=50K': 0.9108196144026763, '>50k': 0.55}

FP rate = {'<=50K': 0.45, '>50k': 0.0891803855973237}

              precision    recall  f1-score   support

       <=50K       0.86      0.91      0.89     11359
        >50K       0.67      0.55      0.60      3700

    accuracy                           0.82     15059
   macro avg       0.76      0.73      0.74     15059
weighted avg       0.81      0.82      0.82     15059



#### K = 10

In [162]:
metrics(Y_test, knn(10, X_train, Y_train, X_test))

Accuracy = 0.8284746663125041

TP rate = {'<=50K': 0.9318601989611761, '>50k': 0.5110810810810811}

FP rate = {'<=50K': 0.4889189189189189, '>50k': 0.06813980103882385}

              precision    recall  f1-score   support

       <=50K       0.85      0.93      0.89     11359
        >50K       0.71      0.51      0.59      3700

    accuracy                           0.83     15059
   macro avg       0.78      0.72      0.74     15059
weighted avg       0.82      0.83      0.82     15059



### SVM

In [167]:
from sklearn.svm import SVC

svc = SVC(kernel='linear')
svc.fit(X_train, Y_train)
predictions = svc.predict(X_test)

metrics(Y_test, predictions)

Accuracy = 0.8434158974699515

TP rate = {'<=50K': 0.925873756492649, '>50k': 0.5902702702702702}

FP rate = {'<=50K': 0.4097297297297297, '>50k': 0.074126243507351}

              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90     11359
        >50K       0.72      0.59      0.65      3700

    accuracy                           0.84     15059
   macro avg       0.80      0.76      0.77     15059
weighted avg       0.84      0.84      0.84     15059



### Neural Network

In [168]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(random_state=0, max_iter=300)
nn.fit(X_train, Y_train)
predictions = nn.predict(X_test)

metrics(Y_test, predictions)

Accuracy = 0.8201739823361445

TP rate = {'<=50K': 0.8889867065762831, '>50k': 0.6089189189189189}

FP rate = {'<=50K': 0.3910810810810811, '>50k': 0.11101329342371688}

              precision    recall  f1-score   support

       <=50K       0.87      0.89      0.88     11359
        >50K       0.64      0.61      0.62      3700

    accuracy                           0.82     15059
   macro avg       0.76      0.75      0.75     15059
weighted avg       0.82      0.82      0.82     15059

