In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier


# Load the competition CSVs and replace every missing value with 0 in one step.
# NOTE: the paths are absolute and specific to the author's machine.
# fillna(0) is applied to all columns, so any text column that had gaps now
# holds the integer 0 alongside its string values.
train = pd.read_csv(
    '/Users/maryamakbarpour/Projects/Kaggle/Exploring_Mental_Health_Data/data/train.csv'
).fillna(0)
test = pd.read_csv(
    '/Users/maryamakbarpour/Projects/Kaggle/Exploring_Mental_Health_Data/data/test.csv'
).fillna(0)

# Reduce memory usage (optimized types)
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to smaller dtypes, in place.

    Integer columns are downcast with ``downcast='integer'`` so they stay
    integers (identifiers and class labels are not turned into floats).
    Floating-point columns are downcast with ``downcast='float'``, which
    goes no lower than float32. Non-numeric columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be shrunk. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-chaining convenience.
    """
    # Integers first: keep them integral instead of converting to float32.
    for col in df.select_dtypes(include=[np.integer]).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')

    # Floats: pandas only downcasts when the values survive the conversion
    # within its tolerance; otherwise the column keeps its dtype.
    for col in df.select_dtypes(include=[np.floating]).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    return df

# Shrink the numeric dtypes of both frames
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

# Prepare features and target
# NOTE(review): every non-target column is used as a feature, which presumably
# includes the 'id' column — consider dropping it. TODO confirm.
X = train.drop(columns=['Depression'])
y = train['Depression']

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Cast categorical columns to str ONCE, on the full feature frame and on the
# test frame alike. Missing values were filled with the integer 0, so without
# the cast on `test` the encoder would see 0 (int) at prediction time but "0"
# (str) at fit time and treat it as an unknown category. Casting before the
# split also avoids assigning into the frames returned by train_test_split.
categorical_as_str = {col: str for col in categorical_cols}
X = X.astype(categorical_as_str)
test = test.astype(categorical_as_str)

# Preprocessing for categorical features (One-Hot Encoding)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'  # keep the other columns unchanged
)

# Pipeline: one-hot encode the categorical columns, then fit a CatBoost classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(silent=True))
])

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Evaluate on validation set (probability of the positive class; binary target assumed)
y_pred_proba = pipeline.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# Predict on test data
y_test_pred_proba = pipeline.predict_proba(test)[:, 1]  # Get probabilities

# Numeric downcasting may have turned integer ids into floats; write them back
# as integers so the CSV contains e.g. 140700 rather than 140700.0.
submission_ids = test['id']
if submission_ids.dtype.kind == 'f' and (submission_ids % 1 == 0).all():
    submission_ids = submission_ids.astype('int64')

# Binarize predictions at 0.5 threshold and attach the ids (positionally aligned
# with the rows of `test`)
df = pd.DataFrame({
    'id': submission_ids.to_numpy(),
    'Depression': (y_test_pred_proba > 0.5).astype(int),
})

# Save the predictions along with 'id' to CSV
df[['id', 'Depression']].to_csv('/Users/maryamakbarpour/Projects/Kaggle/Exploring_Mental_Health_Data/LightAutoML_simple_predictions.csv', index=False)


ROC AUC: 0.9770
