In [12]:
# Import core libraries
import pandas as pd
import numpy as np

# Import preprocessing tools
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Import machine learning model and metrics
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import RandomizedSearchCV

# Import warnings to handle potential deprication warnings
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load the dataset
def load_data(file_path):
    data = pd.read_csv(file_path)
    return data

# Step 1: Categorize Sleep Duration if it exists
def categorize_sleep_duration(value):
    value = str(value).lower().replace('hours', '').strip()
    if any(keyword in value for keyword in ['less than', '1-2', '2-3', '3-4', '4-5']):
        return 'Short Sleep'
    elif any(keyword in value for keyword in ['5-6', '6-7', '6-8', '7-8']):
        return 'Adequate Sleep'
    elif any(keyword in value for keyword in ['more than', '8-9', '9-10', '10-11']):
        return 'Long Sleep'
    try:
        numeric_values = [float(x) for x in value.split() if x.replace('.', '', 1).isdigit()]
        if numeric_values:
            avg_hours = sum(numeric_values) / len(numeric_values)
            if avg_hours < 5:
                return 'Short Sleep'
            elif 5 <= avg_hours <= 8:
                return 'Adequate Sleep'
            elif avg_hours > 8:
                return 'Long Sleep'
    except ValueError:
        return np.nan
    return np.nan

# Step 2: Categorize `Dietary Habits` if it exists
def categorize_dietary_habits(value):
    value = str(value).lower().strip()
    if 'healthy' in value or 'balanced' in value or 'nutritious' in value:
        return 'Healthy'
    elif 'moderate' in value:
        return 'Moderate'
    elif 'unhealthy' in value or 'junk' in value or 'fast food' in value:
        return 'Unhealthy'
    return np.nan

# Apply custom categorizations to the dataset, only if the columns exist
def apply_custom_categorizations(data):
    if 'Sleep Duration' in data.columns:
        data['Sleep Duration'] = data['Sleep Duration'].apply(categorize_sleep_duration)
    if 'Dietary Habits' in data.columns:
        data['Dietary Habits'] = data['Dietary Habits'].apply(categorize_dietary_habits)
    return data

# Step 3: Impute and Clean Data
def impute_and_clean_data(data):
    # Drop irrelevant columns
    data.drop(columns=['id', 'Name', 'City', 'Profession', 'Degree'], errors='ignore', inplace=True)
    
    # Handle columns based on `Working Professional or Student` status, if available
    if 'Working Professional or Student' in data.columns:
        data['Study Satisfaction'] = data.apply(lambda row: row['Study Satisfaction'] if row['Working Professional or Student'] == 'Student' else 0, axis=1)
        data['CGPA'] = data.apply(lambda row: row['CGPA'] if row['Working Professional or Student'] == 'Student' else 0, axis=1)
        data['Job Satisfaction'] = data.apply(lambda row: row['Job Satisfaction'] if row['Working Professional or Student'] == 'Working Professional' else 0, axis=1)
        data['Work Pressure'] = data.apply(lambda row: row['Work Pressure'] if row['Working Professional or Student'] == 'Working Professional' else 0, axis=1)
    
    # Fill numeric NaNs with the median
    numeric_columns = data.select_dtypes(include=['number']).columns
    data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())
    
    return data

# Step 4: One-Hot Encoding
def one_hot_encode(data, categorical_columns):
    # Filter only columns that exist in the data
    existing_categorical_columns = [col for col in categorical_columns if col in data.columns]
    
    if existing_categorical_columns:
        encoder = OneHotEncoder(handle_unknown='ignore', drop='first')
        encoded_data = encoder.fit_transform(data[existing_categorical_columns])
        encoded_df = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(existing_categorical_columns))
        data = pd.concat([data.drop(columns=existing_categorical_columns), encoded_df], axis=1)
    else:
        print("No categorical columns found for encoding.")
    
    return data

# Step 5: Full Preprocessing Pipeline
def preprocess_data(file_path, categorical_columns):
    data = load_data(file_path)
    data = apply_custom_categorizations(data)
    data = impute_and_clean_data(data)
    data = one_hot_encode(data, categorical_columns)
    return data

# Apply the preprocessing function to the training and test datasets
categorical_columns = ['Gender', 'Working Professional or Student', 'Have you ever had suicidal thoughts ?', 
                       'Family History of Mental Illness', 'Sleep Duration', 'Dietary Habits']

# Example usage:
train_data = preprocess_data('depression_detection_train.csv', categorical_columns)
test_data = preprocess_data('depression_prediction_test.csv', categorical_columns)

In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Separate features and target variable from the training data
X_train = train_data.drop(columns=['Depression'], errors='ignore')
y_train = train_data['Depression']

# Define the model using the original parameters that performed well
xgb_model = XGBClassifier(
    subsample=0.9,
    n_estimators=300,
    max_depth=5,  # Adjust to previous best setting if known
    learning_rate=0.1,  # Adjust if your previous best learning rate is different
    gamma=0.0,  # Use the original gamma setting
    colsample_bytree=0.8,  # Use the previous colsample_bytree if known
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Evaluate on training data
y_train_pred = xgb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)
print("Training Classification Report:")
print(classification_report(y_train, y_train_pred))

# Load the raw test data to retain the `id` column
test_data_raw = pd.read_csv('depression_prediction_test.csv')

# Preprocess the test data separately, ensuring we do not drop 'id' during preprocessing
test_data_preprocessed = preprocess_data('depression_prediction_test.csv', categorical_columns)

# Generate predictions on the preprocessed test data
test_predictions = xgb_model.predict(test_data_preprocessed)

# Prepare the output for submission using 'id' from the raw test data
submission = pd.DataFrame({
    'id': test_data_raw['id'],  # Use the 'id' column from the raw test data
    'Depression': test_predictions
})

# Save the submission file
submission.to_csv('submission_xgboost_v2.csv', index=False)

Training Accuracy: 0.9425657427149965
Training Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97    115133
           1       0.86      0.82      0.84     25567

    accuracy                           0.94    140700
   macro avg       0.91      0.90      0.90    140700
weighted avg       0.94      0.94      0.94    140700



In [15]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [1, 3, 5, 7],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [1, 1.5, 2, 3]
}

# Initialize the XGBClassifier
xgb_model = XGBClassifier(random_state=42)

# Set up RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,  # Adjust this number to search more/less combinations
    scoring='accuracy',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Perform the search on the training data
random_search.fit(X_train, y_train)

# Print the best parameters and the best score achieved
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)

# Train the model with the best parameters found
best_xgb_model = random_search.best_estimator_
best_xgb_model.fit(X_train, y_train)

# Evaluate on the training data with the best model
y_train_pred = best_xgb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy with Best Model:", train_accuracy)
print("Training Classification Report with Best Model:")
print(classification_report(y_train, y_train_pred))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 400, 'min_child_weight': 7, 'max_depth': 3, 'learning_rate': 0.15, 'gamma': 0.2, 'colsample_bytree': 0.6}
Best Cross-Validation Accuracy: 0.9376830135039089
Training Accuracy with Best Model: 0.9387135749822317
Training Classification Report with Best Model:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96    115133
           1       0.84      0.81      0.83     25567

    accuracy                           0.94    140700
   macro avg       0.90      0.89      0.90    140700
weighted avg       0.94      0.94      0.94    140700

