In [1]:
from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from collections import Counter

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
df = pd.read_csv("KnowledgeBasewithDomain.csv")

In [3]:
group_counts = df['Domain'].value_counts()
print(group_counts)

Health and Medicine        59
Biology                    32
Physics and Chemistry      23
Other                      16
Business                   16
Social Science             12
Computer Science           11
Games                       3
Climate and Environment     1
Name: Domain, dtype: int64


In [4]:
df_raw = df[['Completeness', 'Conciseness', 'cor.mean', 'cov.mean', 'eigenvalues.mean', 'g_mean.mean', 'h_mean.mean', 'iq_range.mean', 
 'kurtosis.mean', 'mad.mean', 'max.mean', 'mean.mean', 'median.mean', 'min.mean', 'nr_cor_attr', 'nr_norm', 'nr_outliers', 'range.mean', 'sd.mean', 'skewness.mean', 'sparsity.mean',
 't_mean.mean', 'var.mean', 'ClassImbRatio', 'ClassOverlapPerc', 'OutlierPerc', 'attr_to_inst', 'inst_to_attr', 'nr_attr', 'nr_bin', 'nr_inst', 'nr_num','attr_conc.mean', 
 'attr_ent.mean', 'LabelIssuesPerc','nUnique', 'ena', 'snr.mean', 'cEntropy', 'Domain', 'FeatureAlgo']]


In [10]:
def evaluate_decision_tree(domain, df, random_state=2):
    df = df.copy()
    df.fillna(0, inplace=True)

    df = df[df['Domain'] == domain]
    df.drop('Domain', axis=1, inplace=True)

    df['labels_tuple'] = df.iloc[:, -1].apply(tuple)

    tuple_counts = df['labels_tuple'].value_counts()

    valid_tuples = tuple_counts[tuple_counts >= 6].index
    df_filtered = df[df['labels_tuple'].isin(valid_tuples)].copy()
    df_filtered = df_filtered.drop('labels_tuple', axis=1)

    df = df_filtered

    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)  
    
    kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
    
    smote = SMOTE(random_state=33)

    X_resampled, y_resampled = smote.fit_resample(X, y)

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=random_state)    
    tree_classifier = DecisionTreeClassifier(random_state=random_state)
    
    cross_val_scores = cross_val_score(tree_classifier, X_train, y_train, cv=kf, scoring='accuracy')
    error_rates = 1 - cross_val_scores
    
    tree_classifier.fit(X_train, y_train)
    y_pred = tree_classifier.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    return {
        'Cross-Validation Accuracy': cross_val_scores.mean(),
        'Test Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Confusion Matrix': conf_matrix
    }


group_counts = df_raw['Domain'].value_counts()
df_raw.fillna(0,inplace=True)

top_2_domains = group_counts.nlargest(2).index.tolist()

for domain in top_2_domains:
    accuracy = evaluate_decision_tree(domain, df_raw)
    print(f"Accuracy for domain {domain}: {accuracy}")


Accuracy for domain Health and Medicine: {'Cross-Validation Accuracy': 0.6566666666666666, 'Test Accuracy': 0.6, 'Precision': 0.6723214285714285, 'Recall': 0.6, 'F1 Score': 0.5903846153846153, 'Confusion Matrix': array([[3, 1, 1],
       [5, 4, 1],
       [0, 0, 5]])}
Accuracy for domain Biology: {'Cross-Validation Accuracy': 0.8107142857142857, 'Test Accuracy': 0.9, 'Precision': 0.9142857142857143, 'Recall': 0.9, 'F1 Score': 0.8871794871794872, 'Confusion Matrix': array([[2, 0, 0],
       [0, 1, 1],
       [0, 0, 6]])}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
