In [1]:
#problem 1
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Load the iris measurements into labelled pandas containers
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target, name='target')

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit one tree per depth limit (1 through 5) and record its weighted-average
# precision / recall / F1 on the held-out samples
results = {'max_depth': [], 'precision': [], 'recall': [], 'f1': []}
# Maps each metric column in `results` to the key used by the classification report
report_keys = {'precision': 'precision', 'recall': 'recall', 'f1': 'f1-score'}

for depth in range(1, 6):
    model = DecisionTreeClassifier(
        max_depth=depth, min_samples_leaf=2, min_samples_split=5, random_state=42
    ).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # zero_division=0 makes an undefined metric (e.g. a class that is never
    # predicted) count as 0 instead of emitting a warning
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    weighted_avg = report['weighted avg']

    results['max_depth'].append(depth)
    for column, key in report_keys.items():
        results[column].append(weighted_avg[key])

# Show the collected metrics as a table
results_df = pd.DataFrame(results)
print(results_df)


   max_depth  precision    recall        f1
0          1   0.468333  0.633333  0.519540
1          2   0.969444  0.966667  0.966411
2          3   1.000000  1.000000  1.000000
3          4   1.000000  1.000000  1.000000
4          5   1.000000  1.000000  1.000000


In [2]:
#problem 2
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from collections import Counter

# 1. Load the dataset
# The file has no header row, so the column names are supplied here.
# Missing values are written as '?'; na_values turns them into NaN while
# parsing, so no separate replace step is needed afterwards.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
column_names = ['id', 'clump_thickness', 'uniformity_cell_size', 'uniformity_cell_shape',
                'marginal_adhesion', 'single_epithelial_cell_size', 'bare_nuclei',
                'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']
raw_df = pd.read_csv(url, names=column_names, na_values='?')

# 2. Data preprocessing
# Built as a single chain from raw_df, so re-running this step always starts
# from the unmodified download and never writes into an intermediate frame.
df = (
    raw_df
    # Drop rows containing a missing value
    .dropna()
    # A column holding NaN is parsed as float; convert it back to integers
    .astype({'bare_nuclei': int})
    # Convert 'class' to a binary label (2: benign -> 0, 4: malignant -> 1)
    .assign(**{'class': lambda frame: frame['class'].map({2: 0, 4: 1})})
    # Drop 'id' as it is irrelevant to classification
    .drop(columns='id')
)

# 3. Separate the features from the label and hold out 30% of the rows for testing
y = df['class']
X = df.drop(columns='class')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 4. Fit a shallow, Gini-based decision tree on the training rows
dtc = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=42,
)
dtc.fit(X_train, y_train)

# 5. Read the first split from node 0 of the fitted tree: which feature it
#    tests and the threshold it compares against
fitted_tree = dtc.tree_
first_split_feature_index = fitted_tree.feature[0]
first_split_threshold = fitted_tree.threshold[0]
first_split_feature_name = X.columns[first_split_feature_index]

print(f"First split feature: {first_split_feature_name}")
print(f"First split threshold: {first_split_threshold}")

# 6. Calculate Entropy, Gini, Misclassification Error, and Information Gain for the first split
def calculate_entropy(y):
    """Return the Shannon entropy, in bits, of the class labels in ``y``.

    Parameters
    ----------
    y : sized iterable of hashable labels
        The labels of one node / subset (a list or a pandas Series both work).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed class proportions ``p``.
        Returns 0.0 when ``y`` is empty or contains a single class.
    """
    counts = Counter(y)
    # With zero or one distinct label there is no uncertainty. Returning early
    # also avoids the negative zero that ``-(1.0 * log2(1.0))`` evaluates to,
    # which would otherwise be printed as "-0.0".
    if len(counts) <= 1:
        return 0.0
    total = len(y)
    probabilities = [count / total for count in counts.values()]
    entropy = -sum([p * np.log2(p) for p in probabilities])
    return entropy

def calculate_gini(y):
    """Return the Gini impurity of the class labels in ``y``.

    Parameters
    ----------
    y : sized iterable of hashable labels
        The labels of one node / subset (a list or a pandas Series both work).

    Returns
    -------
    float
        ``1 - sum(p ** 2)`` over the observed class proportions ``p``.
        Returns 0.0 for an empty ``y``.
    """
    counts = Counter(y)
    # An empty subset has no class proportions; without this guard the formula
    # would yield 1 - 0 = 1, reporting an empty node as maximally impure.
    if not counts:
        return 0.0
    total = len(y)
    probabilities = [count / total for count in counts.values()]
    gini = 1 - sum([p**2 for p in probabilities])
    return gini

def calculate_misclassification_error(y):
    """Return the misclassification error of the class labels in ``y``.

    This is the fraction of labels that do not belong to the most frequent
    class, i.e. the error made by always predicting the majority class.

    Parameters
    ----------
    y : sized iterable of hashable labels
        The labels of one node / subset (a list or a pandas Series both work).

    Returns
    -------
    float
        ``1 - (majority class count / len(y))``. Returns 0.0 for an empty ``y``.
    """
    counts = Counter(y)
    # max() raises ValueError on an empty sequence, so handle an empty subset
    # explicitly: with no samples there is nothing to misclassify.
    if not counts:
        return 0.0
    majority_class_count = max(counts.values())
    misclassification_error = 1 - (majority_class_count / len(y))
    return misclassification_error

def _impurity_triple(labels):
    """Return (entropy, Gini, misclassification error) for ``labels``."""
    return (
        calculate_entropy(labels),
        calculate_gini(labels),
        calculate_misclassification_error(labels),
    )


# Impurity of the whole training set, i.e. before the first split
entropy_before, gini_before, misclassification_error_before = _impurity_triple(y_train)

print(f"Entropy before split: {entropy_before}")
print(f"Gini before split: {gini_before}")
print(f"Misclassification error before split: {misclassification_error_before}")

# Partition the training rows on the first split's feature and threshold:
# values <= threshold go left, values > threshold go right
split_column = X_train[first_split_feature_name]
left_indices = split_column <= first_split_threshold
right_indices = split_column > first_split_threshold

X_left, X_right = X_train[left_indices], X_train[right_indices]
y_left, y_right = y_train[left_indices], y_train[right_indices]

# Impurity of each side after the first split
entropy_left, gini_left, misclassification_error_left = _impurity_triple(y_left)
entropy_right, gini_right, misclassification_error_right = _impurity_triple(y_right)

print(f"Entropy left after split: {entropy_left}")
print(f"Gini left after split: {gini_left}")
print(f"Misclassification error left after split: {misclassification_error_left}")

print(f"Entropy right after split: {entropy_right}")
print(f"Gini right after split: {gini_right}")
print(f"Misclassification error right after split: {misclassification_error_right}")

# Information gain = parent impurity minus the size-weighted impurity of the
# two sides; each weight is that side's share of the training rows
weight_left = len(y_left) / len(y_train)
weight_right = len(y_right) / len(y_train)

information_gain_entropy = entropy_before - (
    weight_left * entropy_left + weight_right * entropy_right
)
information_gain_gini = gini_before - (
    weight_left * gini_left + weight_right * gini_right
)
information_gain_misclassification_error = misclassification_error_before - (
    weight_left * misclassification_error_left
    + weight_right * misclassification_error_right
)

print(f"Information Gain (Entropy): {information_gain_entropy}")
print(f"Information Gain (Gini): {information_gain_gini}")
print(f"Information Gain (Misclassification Error): {information_gain_misclassification_error}")


First split feature: uniformity_cell_size
First split threshold: 3.5
Entropy before split: 0.9217431888789798
Gini before split: 0.44674462982090646
Misclassification error before split: 0.33682008368200833
Entropy left after split: 0.3313056130577884
Gini left after split: 0.11451516954193952
Misclassification error left after split: 0.060975609756097615
Entropy right after split: 0.32744491915447627
Gini right after split: 0.11280000000000001
Misclassification error right after split: 0.06000000000000005
Information Gain (Entropy): 0.5916490906444065
Information Gain (Gini): 0.3327676933988224
Information Gain (Misclassification Error): 0.27615062761506265


In [3]:
#problem 3
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from ucimlrepo import fetch_ucirepo

# 1. Load the dataset (using ucimlrepo)
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)
dataset = breast_cancer_wisconsin_diagnostic.data
X = dataset.features

# Flatten the targets to a 1-D array and encode them numerically:
# 'M' becomes 1, every other label becomes 0
y = np.where(np.ravel(dataset.targets) == 'M', 1, 0)

# 2. Hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 3. Feature scaling (StandardScaler)
# The scaler is fitted on the training rows only and then applied to both sets
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Train the original data model: a depth-2 tree on all scaled features
tree_params = dict(max_depth=2, min_samples_leaf=2, min_samples_split=5, random_state=42)
dtc_original = DecisionTreeClassifier(**tree_params)
dtc_original.fit(X_train_scaled, y_train)
y_pred_original = dtc_original.predict(X_test_scaled)

# Score the test-set predictions with each metric in turn
precision_original, recall_original, f1_original = (
    metric(y_test, y_pred_original)
    for metric in (precision_score, recall_score, f1_score)
)

print("Original Data:")
print(f"Precision: {precision_original}")
print(f"Recall: {recall_original}")
print(f"F1 Score: {f1_original}")

# 5. PCA dimensionality reduction
# PCA is fitted once on the scaled training data; the first k columns of the
# transformed arrays are then used as the first k principal components.
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Order in which the evaluation results are unpacked into named variables
RESULT_KEYS = ('model', 'y_pred', 'precision', 'recall', 'f1',
               'cm', 'FP', 'TP', 'FPR', 'TPR')


def fit_and_score_tree(X_train_k, X_test_k, y_train, y_test):
    """Fit a depth-2 decision tree on ``X_train_k`` and score it on ``X_test_k``.

    Parameters
    ----------
    X_train_k, X_test_k : 2-D arrays
        Training / test features (here: the leading PCA columns).
    y_train, y_test : 1-D arrays of 0/1 labels
        Training / test targets, with 1 as the positive class.

    Returns
    -------
    dict
        Keys as listed in ``RESULT_KEYS``: the fitted model, its test
        predictions, precision, recall, F1, the 2x2 confusion matrix, the
        false-positive and true-positive counts, and the false-positive and
        true-positive rates (a rate is 0 when its denominator is 0).
    """
    model = DecisionTreeClassifier(
        max_depth=2, min_samples_leaf=2, min_samples_split=5, random_state=42
    )
    model.fit(X_train_k, y_train)
    y_pred = model.predict(X_test_k)

    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent from
    # both y_test and y_pred, so the unpacking below always works.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    negatives = fp + tn
    positives = tp + fn

    return {
        'model': model,
        'y_pred': y_pred,
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'cm': cm,
        'FP': fp,
        'TP': tp,
        'FPR': fp / negatives if negatives != 0 else 0,
        'TPR': tp / positives if positives != 0 else 0,
    }


def print_pca_report(n_components, metrics):
    """Print the metrics returned by ``fit_and_score_tree`` for ``n_components`` components."""
    plural = "" if n_components == 1 else "s"
    print(f"\nPCA with {n_components} component{plural}:")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"F1 Score: {metrics['f1']}")
    print(f"Confusion Matrix:\n{metrics['cm']}")
    print(f"FP: {metrics['FP']}")
    print(f"TP: {metrics['TP']}")
    print(f"FPR: {metrics['FPR']}")
    print(f"TPR: {metrics['TPR']}")


# 6. Use 1 principal component
X_train_pca_1 = X_train_pca[:, :1]
X_test_pca_1 = X_test_pca[:, :1]

metrics_pca_1 = fit_and_score_tree(X_train_pca_1, X_test_pca_1, y_train, y_test)
(dtc_pca_1, y_pred_pca_1, precision_pca_1, recall_pca_1, f1_pca_1,
 cm_pca_1, FP_pca_1, TP_pca_1, FPR_pca_1, TPR_pca_1) = (
    metrics_pca_1[key] for key in RESULT_KEYS
)
print_pca_report(1, metrics_pca_1)

# 7. Use 2 principal components
X_train_pca_2 = X_train_pca[:, :2]
X_test_pca_2 = X_test_pca[:, :2]

metrics_pca_2 = fit_and_score_tree(X_train_pca_2, X_test_pca_2, y_train, y_test)
(dtc_pca_2, y_pred_pca_2, precision_pca_2, recall_pca_2, f1_pca_2,
 cm_pca_2, FP_pca_2, TP_pca_2, FPR_pca_2, TPR_pca_2) = (
    metrics_pca_2[key] for key in RESULT_KEYS
)
print_pca_report(2, metrics_pca_2)

# 8. Compare the results: F1 of the baseline tree next to both PCA variants
print("\nComparison:")
print(f"Original F1: {f1_original}, PCA 1 F1: {f1_pca_1}, PCA 2 F1: {f1_pca_2}")

# 9. Determine if using continuous data is beneficial
# PCA counts as beneficial when at least one PCA variant beats the baseline F1
pca_beats_original = any(
    pca_f1 > f1_original for pca_f1 in (f1_pca_1, f1_pca_2)
)
verdict = (
    "Using continuous data with PCA is beneficial in this case."
    if pca_beats_original
    else "Using continuous data with PCA is NOT beneficial in this case."
)
print(verdict)


Original Data:
Precision: 0.9047619047619048
Recall: 0.9047619047619048
F1 Score: 0.9047619047619048

PCA with 1 component:
Precision: 0.8787878787878788
Recall: 0.9206349206349206
F1 Score: 0.8992248062015504
Confusion Matrix:
[[100   8]
 [  5  58]]
FP: 8
TP: 58
FPR: 0.07407407407407407
TPR: 0.9206349206349206

PCA with 2 components:
Precision: 0.9152542372881356
Recall: 0.8571428571428571
F1 Score: 0.8852459016393442
Confusion Matrix:
[[103   5]
 [  9  54]]
FP: 5
TP: 54
FPR: 0.046296296296296294
TPR: 0.8571428571428571

Comparison:
Original F1: 0.9047619047619048, PCA 1 F1: 0.8992248062015504, PCA 2 F1: 0.8852459016393442
Using continuous data with PCA is NOT beneficial in this case.
