In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import io
import warnings
warnings.filterwarnings('ignore')


# --- 1. Load the Dataset ---
# The 14-row table is embedded as CSV text so the script needs no external file.
data = """
ID,Age,Income,Gender,Marital Status,Buys
1,<21,High,Male,Single,No
2,<21,High,Male,Married,No
3,21-35,High,Male,Single,Yes
4,>35,Medium,Male,Single,Yes
5,>35,Low,Female,Single,Yes
6,>35,Low,Female,Married,No
7,21-35,Low,Female,Married,Yes
8,<21,Medium,Male,Single,No
9,<21,Low,Female,Married,Yes
10,>35,Medium,Female,Single,Yes
11,<21,Medium,Female,Married,Yes
12,21-35,Medium,Male,Married,Yes
13,21-35,High,Female,Single,Yes
14,>35,Medium,Male,Married,No
"""
# The ID column only numbers the rows, so it is dropped right after parsing.
df = pd.read_csv(io.StringIO(data)).drop(columns='ID')

# Separate Features (X) and Target (y)
# X is every column except the last one (Age, Income, Gender, Marital Status).
# .copy() makes X an independent frame: the feature columns are overwritten
# during encoding, and assigning into a bare slice of df is the pattern that
# pandas reports as SettingWithCopyWarning.
X = df.iloc[:, :-1].copy()
# y is the last column (Buys), the value the model has to predict.
y = df.iloc[:, -1]

# --- 2. Encode Categorical Features (Required for scikit-learn) ---
# scikit-learn's decision trees need numeric input, so every string category is
# mapped to an integer. LabelEncoder numbers the categories in sorted order
# (e.g. Age: '21-35' -> 0, '<21' -> 1, '>35' -> 2). The fitted encoder of each
# column is kept so that new samples can be encoded with the very same mapping.
feature_encoders = {}
encoded_columns = {}
for column in X.columns:
    le = LabelEncoder()
    encoded_columns[column] = le.fit_transform(X[column])
    feature_encoders[column] = le

# Build a fresh encoded frame rather than assigning into X column by column:
# X may be a slice of df, and writing into such a slice is what triggers
# pandas' SettingWithCopyWarning.
X = pd.DataFrame(encoded_columns, index=X.index)

le_y = LabelEncoder()
y = le_y.fit_transform(y)  # Encodes 'No' and 'Yes' into 0 and 1

# --- 3. Divide Dataset into Training and Testing Sets ---
# train_test_split randomly partitions the rows so the model can be evaluated
# on samples it has not seen during training.
# X_train, y_train: data used to build the model.
# X_test, y_test: held-out data used to test the model.
# With only 14 rows the split cannot be exactly 70/30: test_size=0.3 yields
# 5 test rows and 9 training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,      # requested test fraction (5 of the 14 samples)
    random_state=42,    # fixed seed so the split is reproducible
)

print("--- Dataset Split Information (70/30) ---")
print(f"Training Samples: {len(X_train)}")
print(f"Testing Samples: {len(X_test)}")
print("-" * 50)

# --- 4. Train the Decision Tree Model ---
# The classifier is created with default settings (Gini impurity as the split
# criterion); random_state makes tie-breaking between equal splits repeatable.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Train the model using ONLY the training data.
dt_classifier.fit(X_train, y_train)

# --- 5. Evaluate the Model on the Test Set ---
# Predict the outcomes for the unseen test features.
y_pred = dt_classifier.predict(X_test)

# Accuracy: the proportion of correct predictions on the test set.
accuracy = accuracy_score(y_test, y_pred)

print("--- Model Evaluation (on Test Set) ---")
print(f"Test Set True Labels (y_test): {y_test}")
print(f"Test Set Predictions (y_pred): {y_pred}")
print(f"Accuracy on Test Set: {accuracy * 100:.2f}%")
print("-" * 50)

# --- 6. Predict for the specific Test Case from the Assignment ---
# Test Data: [Age < 21, Income = Low, Gender = Female, Marital Status = Married]
# The raw categories are pushed through the encoders fitted in step 2 instead
# of hand-written integer codes. Because the categories are numbered in sorted
# order, '<21' is 1 (not 0), so this sample encodes to [1, 1, 0, 0]; a
# hand-written [0, 1, 0, 0] would describe Age 21-35 instead.
raw_test_case = {
    'Age': '<21',
    'Income': 'Low',
    'Gender': 'Female',
    'Marital Status': 'Married',
}
# A one-row DataFrame with the training column names (in training order) keeps
# the features aligned with what the classifier was fitted on.
specific_test_case = pd.DataFrame(
    {
        column: feature_encoders[column].transform([raw_test_case[column]])
        for column in X.columns
    }
)
prediction_encoded = dt_classifier.predict(specific_test_case)
# Map the predicted class index back to its original 'Yes' / 'No' label.
prediction_label = le_y.inverse_transform(prediction_encoded)[0]

print("--- Assignment Test Case Prediction ---")
print("Input: Age < 21, Income Low, Gender Female, Marital Status Married")
print(f"Prediction (Buys): '{prediction_label}'")

--- Dataset Split Information (70/30) ---
Training Samples: 9
Testing Samples: 5
--------------------------------------------------
--- Model Evaluation (on Test Set) ---
Test Set True Labels (y_test): [1 1 0 1 0]
Test Set Predictions (y_pred): [1 0 1 1 1]
Accuracy on Test Set: 40.00%
--------------------------------------------------
--- Assignment Test Case Prediction ---
Input: Age < 21, Income Low, Gender Female, Marital Status Married
Prediction (Buys): 'Yes'
