# CI6227 Data Mining – Assignment 1
### CI6227-2021-Assignment-1.2
### Ashish Narmen, G1901836L

In [1]:
import pandas
import numpy
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import time

### Load File

In [2]:
# Labels for the 15 fields of each record, in file order. The files are read
# with header=None, so these names are supplied to read_csv explicitly.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'result',
]

# Columns (a subset of column_names, including the 'result' label) that are
# label-encoded to integers before training.
category_column_names = [
    'workclass',
    'race',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'sex',
    'native-country',
    'result',
]

def load_data_file(file_name, column_names):
  """Read a header-less CSV file and trim whitespace from its text cells.

  Args:
    file_name: Path of the file (or any source accepted by pandas.read_csv).
    column_names: Labels to assign to the columns, in file order.

  Returns:
    A DataFrame in which every str cell has had leading and trailing
    whitespace removed; non-string cells are returned unchanged.
  """
  def strip_if_text(cell):
    return cell.strip() if isinstance(cell, str) else cell

  data_frame = pandas.read_csv(file_name, header=None, names=column_names)
  # DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map).
  # Series.map exists in old and new pandas alike, so map column by column.
  for col in data_frame.columns:
    # A numeric column cannot contain str cells, so there is nothing to strip.
    if not pandas.api.types.is_numeric_dtype(data_frame[col]):
      data_frame[col] = data_frame[col].map(strip_if_text)
  return data_frame

# Load both files with the same column labels.
# NOTE(review): the rest of the notebook assumes the 'result' labels in
# 'adult.test' are written the same way as in 'adult.data' and that the file
# has no extra non-data lines - confirm against the local copy.
test_data_frame = load_data_file('adult.test', column_names)
train_data_frame = load_data_file('adult.data', column_names)

### Data Pre-Processing

#### Find data entries with missing values

In [None]:
def find_invalid_values(data_frame):
  """Print, per column, the percentage of rows taken by each distinct value.

  NaN is counted as a value of its own (dropna=False). Nothing is returned;
  the output is meant to be read by eye to spot placeholder values.
  """
  total_rows = data_frame.shape[0]
  for column_name in data_frame.columns:
    occurrences = data_frame[column_name].value_counts(dropna = False)
    percentages = occurrences * 100/total_rows
    print(percentages, '\n')
# Inspect the training set only; no equivalent call for test_data_frame is
# present in this cell.
find_invalid_values(train_data_frame)

#### Results: percentage of invalid (`?`) values per column
Training Data Set

|Column | Percentage of invalid values|
|-------|-----------------------------|
|workclass| 5.638647 |
|occupation|5.660146|
|native-country|1.790486|

Test Data Set

|Column | Percentage of invalid values|
|-------|-----------------------------|
|workclass| 5.914870 |
|occupation|5.933296|
|native-country|1.682943|


#### Remove rows that contain invalid values

In [3]:
# Row counts of both frames as loaded, before any rows are dropped.
print('Row count - Training Data Set {}'.format(len(train_data_frame.index)))
print('Row count - Test Data Set {}'.format(len(test_data_frame.index)))

def remove_invalid_values_in_data_frame(data_frame):
  """Return a copy of data_frame without rows that hold '?' or NaN.

  Every cell equal to the string '?' is treated as missing, and any row with
  at least one missing cell is dropped. The frame passed in is left
  unmodified, so re-running the calling cell starts from the same input.

  Args:
    data_frame: DataFrame to clean.

  Returns:
    A new DataFrame containing only the complete rows (original index kept).
  """
  # numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
  # DataFrame.replace returns a new frame rather than writing into the input.
  without_markers = data_frame.replace('?', numpy.nan)
  return without_markers.dropna(axis=0, how='any')

# Replace both frames with their cleaned versions (rows containing '?' dropped).
train_data_frame = remove_invalid_values_in_data_frame(train_data_frame)
test_data_frame = remove_invalid_values_in_data_frame(test_data_frame)

# Row counts after cleaning, for comparison with the counts printed above.
print('Row count - Training Data Set {}'.format(train_data_frame.shape[0]))
print('Row count - Test Data Set {}'.format(test_data_frame.shape[0]))

Row count - Training Data Set 32561
Row count - Test Data Set 16281
Row count - Training Data Set 30162
Row count - Test Data Set 15060


#### Convert categorical attributes to numerical values

In [4]:
def convert_categorical_to_numerical(data_frame, categorical_column_names, encoders=None):
  """Return a copy of data_frame with the given columns label-encoded.

  Each listed column is replaced by the integer codes produced by a
  scikit-learn LabelEncoder. The input frame is not modified.

  Args:
    data_frame: DataFrame holding the columns to encode.
    categorical_column_names: Names of the columns to encode.
    encoders: Optional dict mapping column name -> fitted LabelEncoder.
      When given, a column already present in the dict is encoded with that
      encoder's transform() (which raises ValueError on labels it was not
      fitted on); a column not yet present gets a newly fitted encoder, which
      is stored in the dict. Passing the same dict for two frames therefore
      gives both the same value -> code mapping. When omitted, every column is
      fitted on this frame alone, so the codes produced for two different
      frames only agree if both contain the same set of values per column.

  Returns:
    A new DataFrame with the listed columns replaced by integer codes.
  """
  # Work on a copy so the caller's frame keeps its original string values.
  encoded_frame = data_frame.copy()
  for col in categorical_column_names:
    if encoders is not None and col in encoders:
      encoded_frame[col] = encoders[col].transform(encoded_frame[col])
    else:
      label_encoder = preprocessing.LabelEncoder()
      encoded_frame[col] = label_encoder.fit_transform(encoded_frame[col])
      if encoders is not None:
        encoders[col] = label_encoder
  return encoded_frame

# Encode the categorical columns of both frames.
# NOTE(review): the two calls are independent, so each frame gets its own
# value -> integer mapping. The codes line up between train and test only if
# every encoded column holds the same set of distinct values in both frames -
# verify, otherwise the classifiers see inconsistent feature codes.
test_data_frame = convert_categorical_to_numerical(test_data_frame, category_column_names)
train_data_frame = convert_categorical_to_numerical(train_data_frame, category_column_names)

### Classifier

In [5]:
def classify(classifier, X_train, Y_train, X_test, Y_test):
  """Fit classifier on the training data and print its test-set metrics.

  Prints, in order: the time taken by fit(), the accuracy in percent, the
  scikit-learn classification report and the confusion matrix.

  Args:
    classifier: Estimator exposing fit(X, y) and predict(X).
    X_train: Training features.
    Y_train: Training labels.
    X_test: Test features.
    Y_test: True test labels.

  Returns:
    None; all results are printed.
  """
  # perf_counter is a monotonic, high-resolution clock intended for measuring
  # intervals; time.time() follows the system clock and can jump.
  fit_started = time.perf_counter()
  classifier.fit(X_train, Y_train)
  fit_seconds = time.perf_counter() - fit_started
  print(f'{fit_seconds:.2f} seconds to train')

  y_pred = classifier.predict(X_test)

  accuracy = accuracy_score(Y_test,y_pred)*100
  print('Accuracy: {}'.format(accuracy))

  # Printed under the label 'Precision Value', but this is the full
  # classification report (precision, recall, f1-score, support).
  report = metrics.classification_report(Y_test,y_pred)
  print('Precision Value\n{}'.format(report))

  conf_matrix = confusion_matrix(Y_test, y_pred)
  print('Confusion Matrix:\n{}'.format(conf_matrix))

#### Decision Tree Classifier

In [7]:
def decision_tree():
  """Train and evaluate a depth-limited Gini decision tree.

  Reads the module-level train_data_frame / test_data_frame, taking columns
  0-13 as features and column 14 ('result') as the label, and hands them to
  classify(), which prints the metrics.
  """
  train_matrix = train_data_frame.values
  test_matrix = test_data_frame.values

  features_train, labels_train = train_matrix[:, 0:14], train_matrix[:, 14]
  features_test, labels_test = test_matrix[:, 0:14], test_matrix[:, 14]

  # random_state is fixed so repeated runs build the same tree.
  tree_model = DecisionTreeClassifier(
      criterion="gini",
      random_state=100,
      max_depth=5,
      min_samples_leaf=5,
  )
  classify(tree_model, features_train, labels_train, features_test, labels_test)

decision_tree()

0.05 seconds to train
Accuracy: 84.18990703851262
Precision Value
              precision    recall  f1-score   support

           0       0.86      0.95      0.90     11360
           1       0.77      0.51      0.61      3700

    accuracy                           0.84     15060
   macro avg       0.81      0.73      0.76     15060
weighted avg       0.83      0.84      0.83     15060

Confusion Matrix:
[[10794   566]
 [ 1815  1885]]


#### Naive Bayes Classifier

In [6]:
def naive_bayes():
  """Train and evaluate a Gaussian Naive Bayes classifier.

  Reads the module-level train_data_frame / test_data_frame, taking columns
  0-13 as features and column 14 ('result') as the label, and hands them to
  classify(), which prints the metrics.
  """
  train_matrix = train_data_frame.values
  test_matrix = test_data_frame.values

  features_train, labels_train = train_matrix[:, 0:14], train_matrix[:, 14]
  features_test, labels_test = test_matrix[:, 0:14], test_matrix[:, 14]

  bayes_model = GaussianNB()
  classify(bayes_model, features_train, labels_train, features_test, labels_test)

naive_bayes()

0.01 seconds to train
Accuracy: 78.85790172642763
Precision Value
              precision    recall  f1-score   support

           0       0.81      0.95      0.87     11360
           1       0.65      0.31      0.42      3700

    accuracy                           0.79     15060
   macro avg       0.73      0.63      0.64     15060
weighted avg       0.77      0.79      0.76     15060

Confusion Matrix:
[[10740   620]
 [ 2564  1136]]
