<a href="https://colab.research.google.com/github/a-mufasa/CSCE-4143-Practice-Project/blob/ahmed/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CSCE 4143 Practice Project

## Download the Data

#### Get the data into a Python object and inspect the data for each column's data type and values

In [38]:
import pandas as pd
import numpy as np

data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data' 
test = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
# names = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names'

# Create Train & Test DataFrames from Adult Dataset files
train_df = pd.read_csv(data, header=None, na_values=" ?")
test_df = pd.read_csv(test, skiprows=1, na_values=" ?")

# Add columns to Train DataFrames using adult.names column descriptions
columns = [
  'age',
  'workclass',
  'fnlwgt',
  'education',
  'education-num',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'capital-gain',
  'capital-loss',
  'hours-per-week',
  'native-country',
  'income',
 ]

train_df.columns = columns
test_df.columns = columns

# Print datatypes & training DataFrame
print(train_df.dtypes, end='\n----------------------------')
train_df

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object
----------------------------

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## 1)  Decision Tree and Bayesian Classifier

### Pre-processing

#### Remove records with unknown ( ?) values and continuous attributes from both train and test data sets

In [39]:
# Drop rows with null values
train_df.dropna(inplace = True)
test_df.dropna(inplace = True)

# Drop continuous attributes
cont_attributes = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

train_df = train_df.drop(cont_attributes, axis = 1)
test_df = test_df.drop(cont_attributes, axis = 1)

# Fix income column of test_df (remove unnecessary '.')
test_df['income'] = test_df['income'].str.replace(".","", regex=False)

# Print training DataFrame
train_df

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


#### One-hot encode columns

In [40]:
# Function to replace encodable features with their encoded column
def oneHotEncode(df, feature):
  dummies = pd.get_dummies(df[feature])
  new_df = pd.concat([df, dummies], axis=1).drop(feature, axis=1)
  return(new_df)

# Columns to be one-hot encoded (all of our remaining features)
encodable_columns = [
  'workclass',
  'education',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'sex',
  'native-country',
]

train_df = oneHotEncode(train_df, encodable_columns)
test_df = oneHotEncode(test_df, encodable_columns)

# Inner join to only include columns that exist in both train & test (removes 1 from train)
train_df, test_df = train_df.align(test_df, join='inner', axis=1)

#### Split DataFrames into features & targets

In [41]:
# X and Y columns for later evaluation
target = 'income'

X_train = train_df.loc[:, train_df.columns != target]
Y_train = train_df[target]

X_test = test_df.loc[:, test_df.columns != target]
Y_test = test_df[target]

### Decision Tree Classifier (Single Tree)

In [42]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
predictions = dt.predict(X_test)

cm = confusion_matrix(Y_test, predictions)
FP = cm.sum(axis=0) - np.diag(cm)  
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

TPR = dict(zip(['<=50K', '>50k'], (TP/(TP+FN))))
FPR = dict(zip(['<=50K', '>50k'], (FP/(FP+TN))))

print(f"Accuracy = {accuracy_score(Y_test, predictions)}\n")
print(f"TP rate = {TPR}\n")
print(f"FP rate = {FPR}\n")
print(classification_report(Y_test, predictions))

Accuracy = 0.8114748655289196

TP rate = {'<=50K': 0.8934765384276785, '>50k': 0.5597297297297297}

FP rate = {'<=50K': 0.44027027027027027, '>50k': 0.10652346157232151}

              precision    recall  f1-score   support

       <=50K       0.86      0.89      0.88     11359
        >50K       0.63      0.56      0.59      3700

    accuracy                           0.81     15059
   macro avg       0.75      0.73      0.74     15059
weighted avg       0.81      0.81      0.81     15059



### Naïve Bayesian Classifier 

In [43]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

nb = GaussianNB()
nb.fit(X_train, Y_train)
predictions = nb.predict(X_test)

cm = confusion_matrix(Y_test, predictions)
FP = cm.sum(axis=0) - np.diag(cm)  
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

TPR = dict(zip(['<=50K', '>50k'], (TP/(TP+FN))))
FPR = dict(zip(['<=50K', '>50k'], (FP/(FP+TN))))

print(f"Accuracy = {accuracy_score(Y_test, predictions)}\n")
print(f"TP rate = {TPR}\n")
print(f"FP rate = {FPR}\n")
print(classification_report(Y_test, predictions))

Accuracy = 0.5593332890630188

TP rate = {'<=50K': 0.44211638348446164, '>50k': 0.9191891891891892}

FP rate = {'<=50K': 0.0808108108108108, '>50k': 0.5578836165155383}

              precision    recall  f1-score   support

       <=50K       0.94      0.44      0.60     11359
        >50K       0.35      0.92      0.51      3700

    accuracy                           0.56     15059
   macro avg       0.65      0.68      0.55     15059
weighted avg       0.80      0.56      0.58     15059



## 2) kNN, SVM, and Neural Network 