<a href="https://colab.research.google.com/github/a-mufasa/CSCE-4143-Practice-Project/blob/ahmed-2/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSCE 4143 Practice Project

## Download the Data

#### Get the data into a Python object for both parts and inspect the data for each column's data type and values

In [59]:
import pandas as pd
import numpy as np

# Create Train & Test DataFrames from Adult Dataset files
data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' 
test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

# Part 1 DataFrames
train_df1 = pd.read_csv(data, header=None, na_values=" ?")
test_df1 = pd.read_csv(test, skiprows=1, na_values=" ?")

# Add columns to Train DataFrames using adult.names column descriptions
columns = [
  'age',
  'workclass',
  'fnlwgt',
  'education',
  'education-num',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'capital-gain',
  'capital-loss',
  'hours-per-week',
  'native-country',
  'income',
 ]

train_df1.columns = columns
test_df1.columns = columns

# Part 2-4 DataFrames
train_df2 = train_df1
test_df2 = train_df1

# Print datatypes & training DataFrame
print(train_df1.dtypes, end='\n----------------------------')
train_df1

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object
----------------------------

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## 1)  Decision Tree and Bayesian Classifier

### Data Preparation

#### Remove records with unknown ( ?) values and continuous attributes from both train and test data sets

In [60]:
# Drop rows with null values
train_df1.dropna(inplace = True)
train_df1.dropna(inplace = True)

# Drop continuous attributes
continuous_attributes = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

train_df1 = train_df1.drop(continuous_attributes, axis = 1)
test_df1 = test_df1.drop(continuous_attributes, axis = 1)

# Fix income column of test_df (remove unnecessary '.')
test_df1['income'] = test_df1['income'].str.replace(".","", regex=False)

# Print training DataFrame
train_df1

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


#### One-hot encode columns

In [61]:
# Function to replace encodable features with their encoded column
def oneHotEncode(df, feature):
  dummies = pd.get_dummies(df[feature])
  new_df = pd.concat([df, dummies], axis=1).drop(feature, axis=1)
  return(new_df)

# Columns to be one-hot encoded (multi-domain categorical attribute)
encodable_columns = [
  'workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native-country',
]

train_df1 = oneHotEncode(train_df1, encodable_columns)
test_df1 = oneHotEncode(test_df1, encodable_columns)

# Inner join to only include columns that exist in both train & test (removes 1 from train)
train_df1, test_df1 = train_df1.align(test_df1, join='inner', axis=1)

#### Split DataFrames into features & target

In [62]:
# X and Y columns for later evaluation
target = 'income'

X_train = train_df1.loc[:, train_df1.columns != target]
Y_train = train_df1[target]

X_test = test_df1.loc[:, test_df1.columns != target]
Y_test = test_df1[target]

#### Function for metrics

In [63]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def metrics(y_test, predictions):
  cm = confusion_matrix(y_test, predictions)
  FP = cm.sum(axis=0) - np.diag(cm)  
  FN = cm.sum(axis=1) - np.diag(cm)
  TP = np.diag(cm)
  TN = cm.sum() - (FP + FN + TP)

  TPR = dict(zip(['<=50K', '>50k'], (TP/(TP+FN))))
  FPR = dict(zip(['<=50K', '>50k'], (FP/(FP+TN))))

  print(f"Accuracy = {accuracy_score(y_test, predictions)}\n")
  print(f"TP rate = {TPR}\n")
  print(f"FP rate = {FPR}\n")
  print(classification_report(y_test, predictions))


### Decision Tree Classifier (Single Tree)

In [64]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
predictions = dt.predict(X_test)

metrics(Y_test, predictions)

Accuracy = 0.8167076167076167

TP rate = {'<=50K': 0.8976998552356442, '>50k': 0.5548621944877795}

FP rate = {'<=50K': 0.4451378055122205, '>50k': 0.1023001447643558}

              precision    recall  f1-score   support

       <=50K       0.87      0.90      0.88     12434
        >50K       0.63      0.55      0.59      3846

    accuracy                           0.82     16280
   macro avg       0.75      0.73      0.74     16280
weighted avg       0.81      0.82      0.81     16280



### Naïve Bayesian Classifier 

In [65]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train, Y_train)
predictions = nb.predict(X_test)

metrics(Y_test, predictions)

Accuracy = 0.562960687960688

TP rate = {'<=50K': 0.45311243364967024, '>50k': 0.9180967238689548}

FP rate = {'<=50K': 0.08190327613104524, '>50k': 0.5468875663503298}

              precision    recall  f1-score   support

       <=50K       0.95      0.45      0.61     12434
        >50K       0.34      0.92      0.50      3846

    accuracy                           0.56     16280
   macro avg       0.64      0.69      0.56     16280
weighted avg       0.80      0.56      0.59     16280



## 2) kNN, SVM, and Neural Network 

### Data Preparation

#### Remove records with unknown ( ?) values

In [66]:
# Drop rows with null values
train_df2.dropna(inplace = True)
test_df2.dropna(inplace = True)

# Fix income column of test_df (remove unnecessary '.')
test_df2['income'] = test_df2['income'].str.replace(".","", regex=False)

# Print training DataFrame
train_df2

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


#### One-hot encode columns

In [67]:
# Function to replace encodable features with their encoded column
def oneHotEncode(df, feature):
  dummies = pd.get_dummies(df[feature])
  new_df = pd.concat([df, dummies], axis=1).drop(feature, axis=1)
  return(new_df)

# Columns to be one-hot encoded (multi-domain categorical attribute)
encodable_columns = [
  'workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native-country',
]

train_df2 = oneHotEncode(train_df2, encodable_columns)
test_df2 = oneHotEncode(test_df2, encodable_columns)

# Inner join to only include columns that exist in both train & test (removes 1 from train)
train_df2, test_df2 = train_df2.align(test_df2, join='inner', axis=1)

#### Transform numerical attributes to binary using their mean values

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,2174,0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,0,0,40,>50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,0,0,40,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,0,0,20,<=50K,0,0,1,...,0,0,0,0,0,0,0,1,0,0


#### Split DataFrames into features & target

In [69]:
# X and Y columns for later evaluation
target = 'income'

X_train = train_df2.loc[:, train_df2.columns != target]
Y_train = train_df2[target]

X_test = test_df2.loc[:, test_df2.columns != target]
Y_test = test_df2[target]