In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Load the labelled training rows and the unlabelled rows that will be scored.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Features are every training column except the row identifier and the label.
y = train['exit_status']
X = train.drop(['response_id', 'exit_status'], axis=1)

# Turn the label values into integer codes. The fitted encoder is kept around
# so predicted codes can be mapped back to the original labels afterwards.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Preprocessing
# Route every feature column to exactly one branch based on its dtype.
# Selecting on the generic 'number' family and its complement (instead of a
# hard-coded list such as int64/float64/object) matters because
# ColumnTransformer's default remainder='drop' silently discards any column
# that is not listed in one of its transformers; with fixed dtype lists a
# bool, category, int32 or float32 column would vanish from the model input
# without any warning.
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

# Categorical branch: fill missing entries with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes categories that were not seen
# during fit encode as all zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical branch: fill missing entries with the column mean, then
# standardise each column.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combined preprocessing step applied to the raw feature frame.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Define parameter grids for hyperparameter tuning
# Every key is written as '<step>__<parameter>' with the step name
# 'classifier', so the grids address the estimator step of a pipeline rather
# than a bare estimator.
_tree_grid = {
    'classifier__max_depth': [5, 10, 15, None],
    'classifier__min_samples_split': [2, 5, 10],
}

_knn_grid = {
    'classifier__n_neighbors': [3, 5, 7, 9],
    'classifier__weights': ['uniform', 'distance'],
}

_xgb_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 6, 10],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
}

_gbm_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7],
}

# Search space per candidate, keyed by the model's display name.
param_grids = {
    'Decision Tree': _tree_grid,
    'K-Nearest Neighbors': _knn_grid,
    'XGBoost': _xgb_grid,
    'Gradient Boosting': _gbm_grid,
}

# Models
# Candidate estimators, registered under the same display names that key
# param_grids. A fixed random_state is passed to every constructor that is
# given one here so repeated runs use the same seed.
models = {}
models['Decision Tree'] = DecisionTreeClassifier(random_state=42)
models['K-Nearest Neighbors'] = KNeighborsClassifier()
# NOTE(review): use_label_encoder is reported as deprecated by recent XGBoost
# releases - confirm against the installed version before removing it.
models['XGBoost'] = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)

# Hold out a stratified 20% of the labelled rows as a validation set. The grid
# searches below cross-validate on the remaining 80% only, so the held-out
# rows are never seen while hyperparameters are being chosen.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Results per model name: hold-out F1 score and the tuned pipeline.
model_scores, best_models = {}, {}

# Seeded, shuffled 5-fold stratified splitter shared by every grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tune each candidate with an exhaustive grid search, then score the winning
# configuration on the hold-out split.
for model_name in models:
    model = models[model_name]

    # Preprocessing is part of the pipeline, so it is fitted together with
    # the classifier on whatever rows the pipeline is trained on.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])

    # scoring='f1' here and f1_score below both use their binary defaults,
    # i.e. the F1 of the class encoded as 1.
    # NOTE(review): confirm that this is the class the task cares about.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model_name],
        scoring='f1',
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Keep the best pipeline found by the search for this model.
    best_models[model_name] = best_model = grid_search.best_estimator_

    # Evaluate on the rows that took no part in the search.
    y_pred = best_model.predict(X_val)
    model_scores[model_name] = f1 = f1_score(y_val, y_pred)

    print(f"Model: {model_name}")
    print("Best F1 Score on Validation Set:", f1)
    print("Best Hyperparameters:", grid_search.best_params_)
    print(classification_report(y_val, y_pred, target_names=label_encoder.classes_))
    print("-" * 40)

# Pick the model with the highest hold-out F1 (the first one wins on a tie).
best_model_name = max(model_scores.items(), key=lambda entry: entry[1])[0]
final_model = best_models[best_model_name]

# Refit the chosen pipeline on every labelled row (training + validation).
# This fits the object stored in best_models in place.
final_model.fit(X, y_encoded)

# Score the test rows and map the predicted codes back to the original labels.
X_test = test.drop(['response_id'], axis=1)
predictions = label_encoder.inverse_transform(final_model.predict(X_test))

# Write one prediction per test row, keyed by its identifier.
submission = pd.DataFrame({'response_id': test['response_id'], 'exit_status': predictions})
submission.to_csv('submission.csv', index=False)

# The score printed here is the hold-out F1 measured before the refit above.
print(f"Best Model: {best_model_name} with F1 Score: {model_scores[best_model_name]}")
print("Submission file created: submission.csv")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Model: Decision Tree
Best F1 Score on Validation Set: 0.7154730920997397
Best Hyperparameters: {'classifier__max_depth': 5, 'classifier__min_samples_split': 2}
              precision    recall  f1-score   support

        Left       0.68      0.75      0.72      5670
      Stayed       0.75      0.68      0.72      6253

    accuracy                           0.72     11923
   macro avg       0.72      0.72      0.72     11923
weighted avg       0.72      0.72      0.72     11923

----------------------------------------
Fitting 5 folds for each of 8 candidates, totalling 40 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Model: K-Nearest Neighbors
Best F1 Score on Validation Set: 0.699790626509905
Best Hyperparameters: {'classifier__n_neighbors': 9, 'classifier__weights': 'uniform'}
              precision    recall  f1-score   support

        Left       0.67      0.68      0.67      5670
      Stayed       0.70      0.69      0.70      6253

    accuracy                           0.69     11923
   macro avg       0.69      0.69      0.69     11923
weighted avg       0.69      0.69      0.69     11923

----------------------------------------
Fitting 5 folds for each of 27 candidates, totalling 135 fits
