In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import VarianceThreshold

# Load the training table and discard every row that contains a missing value.
data = pd.read_csv('train.csv').dropna()

# Separate the label from the predictors; 'id' is removed together with
# 'target' so neither ends up in the feature matrix.
y = data['target']
X = data.drop(columns=['id', 'target'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified on y -- confirm that is intended
# if the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the variance filter on the training rows only, then apply the very same
# column mask to both partitions so the held-out rows never influence which
# features are kept.
threshold = 0.1
selector = VarianceThreshold(threshold=threshold).fit(X_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Depth- and leaf-size-limited decision tree, trained on the retained columns.
clf = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=4,
    min_samples_split=8,
    random_state=42,
).fit(X_train_selected, y_train)

# Hard class predictions for the held-out rows.
predictions = clf.predict(X_test_selected)

# Mean accuracy of the tree on the held-out rows.
accuracy = clf.score(X_test_selected, y_test)
print('Accuracy:', accuracy)

# Column 1 of predict_proba is used as the score for ROC AUC.
# NOTE(review): assumes a binary target whose positive class is the second
# entry of clf.classes_ -- confirm against the data.
y_pred_proba = clf.predict_proba(X_test_selected)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print('AUC Score:', auc)

# Report the names of the columns that the variance filter kept.
selected_features = X.columns[selector.get_support()]
print('selected features:', selected_features)

Accuracy: 0.986575
AUC Score: 0.864371076483164
selected features: Index(['cat_0', 'cat_1', 'cat_2', 'cat_3', 'cat_4', 'cat_5', 'cat_6', 'cat_7',
       'cat_9', 'cat_10', 'cat_11', 'cat_12', 'cat_13', 'cat_14', 'cat_15',
       'cat_18', 'cat_19', 'num_0', 'num_1', 'num_2', 'num_3', 'num_4',
       'num_5', 'num_6', 'num_7', 'num_8', 'num_9', 'num_10', 'num_11',
       'num_12', 'num_13', 'num_15', 'num_16', 'num_17', 'num_18', 'num_19',
       'num_20', 'num_21', 'num_22', 'num_23', 'num_25', 'num_26', 'num_27',
       'num_28', 'num_29', 'num_30', 'num_31', 'num_32', 'num_33', 'num_34',
       'num_35', 'num_36'],
      dtype='object')
