## CHEME 6880 Homework 2 
Net ID : AF626

### Problem 1 
Given the following data set on whether to approve credit card applications based on the
information of four features (student status, credit rating, available credit, and age group). Please
build a Decision Tree Classifier using Information Gain to choose the best attribute, and then use
the decision tree to predict the approval outcome of a new application from an unemployed, senior
applicant with excellent credit rating and high available credit. Please provide sufficient details on
your calculation process of building the classifier, step-by-step.

In [4]:
import pandas as pd
# Read problem set 1 data
data_1_3 = pd.read_csv('data/data_1-3.csv')
data_1_3

Unnamed: 0,Employment Status,Credit Rating,Available Credit,Age,Approve Application ?
0,Unemployed,Excellent,High,Young,No
1,Unemployed,Fair,High,Young,No
2,Unemployed,Excellent,High,Middle Age,Yes
3,Unemployed,Excellent,Medium,Senior,Yes
4,Employed,Excellent,Low,Senior,Yes
5,Employed,Fair,Low,Senior,No
6,Employed,Fair,Low,Middle Age,Yes
7,Unemployed,Excellent,Medium,Young,No
8,Employed,Excellent,Low,Young,Yes
9,Employed,Fair,Medium,Young,Yes


In [None]:
import numpy as np

class DecisionTreeClassifier:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def calculate_entropy(self, y):
        unique_classes, class_counts = np.unique(y, return_counts=True)
        probabilities = class_counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def calculate_information_gain(self, X, y, feature):
        initial_entropy = self.calculate_entropy(y)

        unique_values = np.unique(X[:, feature])
        new_entropy = 0

        for value in unique_values:
            subset_indices = np.where(X[:, feature] == value)
            subset_entropy = self.calculate_entropy(y[subset_indices])
            subset_weight = len(subset_indices[0]) / len(y)
            new_entropy += subset_weight * subset_entropy

        information_gain = initial_entropy - new_entropy
        return information_gain

    def find_best_split(self, X, y):
        num_features = X.shape[1]
        best_information_gain = 0
        best_feature = None

        for feature in range(num_features):
            information_gain = self.calculate_information_gain(X, y, feature)

            if information_gain > best_information_gain:
                best_information_gain = information_gain
                best_feature = feature

        return best_feature

    def fit(self, X, y, depth=0):
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return {'class': np.argmax(np.bincount(y))}

        best_feature = self.find_best_split(X, y)
        if best_feature is None:
            return {'class': np.argmax(np.bincount(y))}

        unique_values = np.unique(X[:, best_feature])
        node = {'feature': best_feature, 'values': unique_values, 'children': {}}

        for value in unique_values:
            subset_indices = np.where(X[:, best_feature] == value)
            subset_X, subset_y = X[subset_indices], y[subset_indices]
            node['children'][value] = self.fit(subset_X, subset_y, depth + 1)

        return node

    def predict_instance(self, instance, tree):
        if 'class' in tree:
            return tree['class']
        feature_value = instance[tree['feature']]
        if feature_value not in tree['children']:
            return 0  # Default to class 0 if the value is not present in training data
        return self.predict_instance(instance, tree['children'][feature_value])

    def predict(self, X, tree):
        return np.array([self.predict_instance(instance, tree) for instance in X])

# Example usage with the provided dataset
data = np.array([
    ['Unemployed', 'Excellent', 'High', 'Young', 'No'],
    # ... (add the rest of the data entries)
])

# Convert categorical variables to numerical
from sklearn.preprocessing import LabelEncoder
label_encoders = [LabelEncoder() for _ in range(data.shape[1] - 1)]
for i, encoder in enumerate(label_encoders):
    data[:, i] = encoder.fit_transform(data[:, i])

X = data[:, :-1].astype(np.int32)
y = (data[:, -1] == 'Yes').astype(np.int32)

# Create and train the Decision Tree
tree_classifier = DecisionTreeClassifier(max_depth=None)
tree = tree_classifier.fit(X, y)

# Make predictions
new_application = np.array(['Unemployed', 'Excellent', 'High', 'Senior'])
for i, encoder in enumerate(label_encoders):
    new_application[i] = encoder.transform([new_application[i]])[0]
new_application = new_application.astype(np.int32)

prediction = tree_classifier.predict_instance(new_application, tree)
print(f'Prediction for the new application: {"Yes" if prediction == 1 else "No"}')


### Problem 2
Build a Naïve Bayes classifier using the training data in Problem 1, and make prediction for the
approval outcome of the same new application from an unemployed, senior applicant with
excellent credit rating and high available credit. Please provide sufficient details on your
calculation process of building the classifier, step-by-step. 

##### Solution

In [1]:
# Import required libraries
import pandas as pd

In [4]:
# Import the required data as dataframe
data_2 = pd.DataFrame(pd.read_csv('data/data_1-3.csv'))
data_2

Unnamed: 0,Employment Status,Credit Rating,Available Credit,Age,Approve Application ?
0,Unemployed,Excellent,High,Young,No
1,Unemployed,Fair,High,Young,No
2,Unemployed,Excellent,High,Middle Age,Yes
3,Unemployed,Excellent,Medium,Senior,Yes
4,Employed,Excellent,Low,Senior,Yes
5,Employed,Fair,Low,Senior,No
6,Employed,Fair,Low,Middle Age,Yes
7,Unemployed,Excellent,Medium,Young,No
8,Employed,Excellent,Low,Young,Yes
9,Employed,Fair,Medium,Young,Yes


In [6]:
# Next we calculate the overall prior probabilites
total_instances = len(data_2)
approve_yes_count = data_2[data_2['Approve Application ?'] == 'Yes'].shape[0]
approve_no_count = total_instances - approve_yes_count

p_approve_yes = approve_yes_count / total_instances
p_approve_no = approve_no_count / total_instances

print("Prior Probability of Approve = Yes:", p_approve_yes)
print("Prior Probability of Approve = No:", p_approve_no)


Prior Probability of Approve = Yes: 0.5789473684210527
Prior Probability of Approve = No: 0.42105263157894735


In [8]:
# Next we will calculate all the conditional probabilites for each combination
# Define a function to calculate conditional probabilities
def calculate_conditional_probability(feature, value, approve_value):
    count_feature_and_approve_value = data_2[(data_2[feature] == value) & (data_2['Approve Application ?'] == approve_value)].shape[0]
    count_approve_value = data_2[data_2['Approve Application ?'] == approve_value].shape[0]
    return count_feature_and_approve_value / count_approve_value

# Calculate conditional probabilities for each feature
conditional_probs = {}

for feature in data_2.columns[:-1]:
    for value in data_2[feature].unique():
        for approve_value in data_2['Approve Application ?'].unique():
            key = f"{feature} = {value} | Approve = {approve_value}"
            conditional_probs[key] = calculate_conditional_probability(feature, value, approve_value)

# Display conditional probabilities
for key, value in conditional_probs.items():
    print(key, ":", value)

Employment Status = Unemployed | Approve = No : 0.625
Employment Status = Unemployed | Approve = Yes : 0.36363636363636365
Employment Status = Employed | Approve = No : 0.375
Employment Status = Employed | Approve = Yes : 0.6363636363636364
Credit Rating = Excellent | Approve = No : 0.25
Credit Rating = Excellent | Approve = Yes : 0.6363636363636364
Credit Rating = Fair | Approve = No : 0.75
Credit Rating = Fair | Approve = Yes : 0.36363636363636365
Available Credit = High | Approve = No : 0.25
Available Credit = High | Approve = Yes : 0.2727272727272727
Available Credit = Medium | Approve = No : 0.5
Available Credit = Medium | Approve = Yes : 0.45454545454545453
Available Credit = Low | Approve = No : 0.25
Available Credit = Low | Approve = Yes : 0.2727272727272727
Age = Young | Approve = No : 0.5
Age = Young | Approve = Yes : 0.18181818181818182
Age = Middle Age | Approve = No : 0.0
Age = Middle Age | Approve = Yes : 0.5454545454545454
Age = Senior | Approve = No : 0.5
Age = Senior |

Now that we have the prior probabilites and the conditional probabilities for each case we can go ahead with the prediction for our new case

In [12]:
# Define a function to make predictions
def predict_approval(application):
    # Calculate probability for Approve = Yes
    p_approve_yes_given_application = p_approve_yes
    for feature, value in application.items():
        key = f"{feature} = {value} | Approve = Yes"
        if key in conditional_probs:
            p_approve_yes_given_application = p_approve_yes_given_application * conditional_probs[key]

    # Calculate probability for Approve = No
    p_approve_no_given_application = p_approve_no
    for feature, value in application.items():
        key = f"{feature} = {value} | Approve = No"
        if key in conditional_probs:
            p_approve_no_given_application = p_approve_no_given_application * conditional_probs[key]

    # Normalize probabilities
    total_probability = p_approve_yes_given_application + p_approve_no_given_application
    p_approve_yes_given_application = p_approve_yes_given_application / total_probability
    p_approve_no_given_application = p_approve_no_given_application / total_probability

    return {"Approve = Yes": p_approve_yes_given_application, "Approve = No": p_approve_no_given_application}

# New application data
new_application = {
    'Employment Status': 'Unemployed',
    'Credit Rating': 'Excellent',
    'Available Credit': 'High',
    'Age': 'Senior'
}

# Make prediction
prediction = predict_approval(new_application)
print("Prediction:", prediction)


Prediction: {'Approve = Yes': 0.5478633059311094, 'Approve = No': 0.45213669406889057}


From the above we can see that for an unemployed, senior applicant with excellent credit rating and high available credit, as per Naïve Bayes classifier, there is around 54 % chance that the application will get approved but there is a 45 % chance that the application might get rejected