In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [None]:
#global variables

pd.set_option('display.max_columns', None)
SEED = 42

FOLDS = 5

FILEPATH = '../data/'

In [None]:
test_df = pd.read_csv(f'{FILEPATH}test.csv')
train_df = pd.read_csv(f'{FILEPATH}train.csv')

In [None]:
def analyze_dataframe(df):
    """
    Print and display a quick structural summary of a pandas DataFrame.

    The report covers, in order: ``df.info()``, head, tail, transposed
    ``describe()``, null counts per column, number of duplicated rows,
    unique counts per column, the shape, and the column index.

    Relies on ``display`` being available, as it is in a notebook kernel.

    Parameters:
    df (pandas.DataFrame): The input DataFrame to analyze.

    Returns:
    None
    """
    underline = "______________________"

    def _heading(title):
        # Every section starts with its title followed by the same underline
        print(title)
        print(underline)

    _heading("DataFrame Information:")
    # info() writes its report itself and returns None, so it is called
    # directly; passing its result to display() would render a stray "None".
    df.info(verbose=True, show_counts=True)
    print("\n")

    # Sections rendered through display(). Each entry is a callable so the
    # value is only computed once its heading has been printed.
    displayed_sections = [
        ("DataFrame Head:", df.head),
        ("DataFrame Tail:", df.tail),
        ("DataFrame Description:", lambda: df.describe().T),
        ("Number of Null Values:", lambda: df.isnull().sum()),
        ("Number of Duplicated Rows:", lambda: df.duplicated().sum()),
        ("Number of Unique Values:", df.nunique),
    ]
    for title, compute in displayed_sections:
        _heading(title)
        display(compute())
        print("\n")

    _heading("DataFrame Shape:")
    print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("\n")

    _heading("DataFrame Columns:")
    display(df.columns)
    

# Summarise the training frame loaded above (the name `train` is never defined)
analyze_dataframe(train_df)


In [None]:
def preprocess_data(df, num_features, scaler, fit=True):
    """
    Return a preprocessed copy of ``df``: 'id' dropped, numeric columns scaled.

    The input frame is left untouched, so the cell calling this function can
    be re-run without scaling the same data twice.

    Parameters:
    df (pandas.DataFrame): Input frame; must contain an 'id' column.
    num_features (list[str]): Columns to scale. An empty list skips scaling.
    scaler: Object exposing ``fit_transform`` and ``transform``
        (for example a scikit-learn scaler).
    fit (bool): When True (default) the scaler is fitted on ``df`` before
        transforming. Pass False to reuse a scaler that was already fitted
        on the training data, e.g. when preprocessing the test set.

    Returns:
    pandas.DataFrame: A new frame without the 'id' column.

    Raises:
    KeyError: If 'id' or any of ``num_features`` is missing from ``df``.
    """
    # drop() returns a new frame; the explicit copy guarantees that the
    # column assignment below can never write through to the caller's data.
    processed = df.drop(columns=['id']).copy()

    if num_features:  # nothing to scale when the list is empty
        scale = scaler.fit_transform if fit else scaler.transform
        processed[num_features] = scale(processed[num_features])

    return processed

In [None]:
# Column-name lists consumed by the preprocessing / CatBoost cells.
# NOTE(review): these names do not appear among the defect columns used by
# the executed cells below; presumably carried over from another dataset —
# confirm they are still needed.

# Columns treated as categorical
cat_features = [
    "Gender", "family_history_with_overweight",
    "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS",
]

# Columns treated as numeric (candidates for scaling)
num_features = [
    "Age", "Height", "Weight",
    "FCVC", "NCP", "CH2O", "FAF", "TUE",
]


In [None]:



# Target columns of this dataset: one indicator column per defect type
target_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

X = train_df.drop(columns=['id'] + target_cols)
y = train_df[target_cols]

# StratifiedKFold and the MultiClass loss both need a single label per row,
# so the indicator columns are collapsed into one class column: a row with no
# flagged defect becomes 'No_Defect', and a row with several flags takes the
# first flagged column in target_cols order.
# NOTE(review): assumes the indicators are 0/1 — confirm against the data.
y_class = y.idxmax(axis=1).where(y.sum(axis=1) > 0, 'No_Defect')

skf = StratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=True)

catboost_model = CatBoostClassifier(
    loss_function='MultiClass',
    verbose=100,
    random_seed=SEED
)

# CatBoost needs to know which columns are categorical. Positions are taken
# from X (the frame actually passed to fit), not from train_df, whose extra
# columns would shift the indices.
categorical_features_indices = [X.columns.get_loc(c) for c in cat_features if c in X.columns]

# Perform stratified k-fold cross-validation
for fold, (train_index, val_index) in enumerate(skf.split(X, y_class)):
    print(f"Fold: {fold+1}")
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y_class.iloc[train_index], y_class.iloc[val_index]

    # Fit the model; the validation fold drives early stopping
    catboost_model.fit(
        X_train, y_train,
        cat_features=categorical_features_indices,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50
    )

    # Flatten the predicted labels so they line up with the 1-D y_val
    y_pred_val = catboost_model.predict(X_val).ravel()

    # Calculate accuracy
    accuracy_val = accuracy_score(y_val, y_pred_val)
    print(f'Validation Accuracy: {accuracy_val}')
    print("\n")


In [16]:

# Define the defect categories (one binary target column each)
defect_categories = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Split the training data into features and targets
X = train_df.drop(columns=['id'] + defect_categories)
y = train_df[defect_categories]

# Fold count and seed come from the config cell so every CV run in the
# notebook uses the same settings
n_splits = FOLDS
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# Store the average AUC scores for each defect category
average_auc_scores = []

# One independent binary model per defect category
for category in defect_categories:
    print(f"Training model for {category}...")

    # Select the target column once instead of re-slicing it on every fold
    target = y[category]

    # Store the AUC scores for each fold
    fold_auc_scores = []

    # Stratify on the current category so each fold keeps its positive rate
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, target)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        # Train the CatBoost classifier
        model = CatBoostClassifier(verbose=False, random_seed=SEED)
        model.fit(X_train, y_train)

        # AUC is computed from the predicted probability of the positive class
        y_pred = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        fold_auc_scores.append(auc)
        print(f"  AUC for fold {fold + 1}: {auc}")

    # Calculate the average AUC across all folds for the current defect category
    average_auc = np.mean(fold_auc_scores)
    average_auc_scores.append(average_auc)
    print(f"Average AUC for {category}: {average_auc}\n")

# Calculate the overall average AUC score across all defect categories
overall_average_auc = np.mean(average_auc_scores)
print(f"Overall Average AUC Score: {overall_average_auc}")

Training model for Pastry...
  AUC for fold 1: 0.872908943594219
  AUC for fold 2: 0.8687953112280058
  AUC for fold 3: 0.8540275632591118
  AUC for fold 4: 0.8661090351633611
  AUC for fold 5: 0.8570619622169879
Average AUC for Pastry: 0.8637805630923371

Training model for Z_Scratch...
  AUC for fold 1: 0.960234354322562
  AUC for fold 2: 0.9603281922956618
  AUC for fold 3: 0.9589928057553956
  AUC for fold 4: 0.9531483843025915
  AUC for fold 5: 0.9577744617865438
Average AUC for Z_Scratch: 0.958095639692551

Training model for K_Scatch...
  AUC for fold 1: 0.9852297926317909
  AUC for fold 2: 0.9857675541038817
  AUC for fold 3: 0.9828057056728906
  AUC for fold 4: 0.9866851648724053
  AUC for fold 5: 0.9816239722731936
Average AUC for K_Scatch: 0.9844224379108324

Training model for Stains...
  AUC for fold 1: 0.9923804148440807
  AUC for fold 2: 0.9925520906824702
  AUC for fold 3: 0.9924298010441653
  AUC for fold 4: 0.9939160774472667
  AUC for fold 5: 0.990694915656362
Averag

In [None]:
import matplotlib.pyplot as plt

# Importances are returned in the order of the columns the model was fitted
# on, so the labels are read from the fitted model itself. Deriving them from
# train_df would include columns (such as 'id') that were never features.
feature_importance = catboost_model.get_feature_importance()
feature_names = np.array(catboost_model.feature_names_)

# Sort the feature importance and feature names in ascending order
sorted_indices = feature_importance.argsort()
sorted_feature_importance = feature_importance[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

plt.figure(figsize=(10, 6))
plt.barh(sorted_feature_names, sorted_feature_importance)
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('CatBoost Feature Importance (Ascending Order)')
plt.show()


# Submission

In [None]:

# Build the feature frame without overwriting test_df: the later submission
# cell still reads test_df['id']. No scaling is applied because no training
# cell in this notebook fits the model on scaled features.
test_features = test_df.drop(columns=['id'])
y_pred_test = catboost_model.predict(test_features)

# NOTE(review): the 'NObeyesdad' column name is carried over unchanged —
# confirm it is the target name this submission is expected to use.
submission_df = pd.DataFrame({
    'id': test_df['id'],  # ids come from the loaded test set
    'NObeyesdad': y_pred_test.flatten()  # Flatten in case the output is in a single-column array
})

# Save the submission DataFrame to a CSV file
# (the final cell of the notebook writes to the same path and will replace it)
submission_df.to_csv('submission.csv', index=False)

# Display the first few rows of the submission DataFrame
display(submission_df.head())
display(submission_df.shape)



In [17]:
# Fit one binary classifier per defect category on the full training set
final_models = {}
for category in defect_categories:
    print(f"Training final model for {category}...")
    model = CatBoostClassifier(verbose=False, random_seed=SEED)
    model.fit(X, y[category])
    final_models[category] = model

# The feature frame is identical for every category, so 'id' is dropped once
# rather than inside the prediction loop
test_features = test_df.drop(columns=['id'])

# Make predictions on the test data: one positive-class probability column
# per defect category
test_predictions = pd.DataFrame({'id': test_df['id']})
for category in defect_categories:
    print(f"Predicting probabilities for {category}...")
    model = final_models[category]
    test_predictions[category] = model.predict_proba(test_features)[:, 1]

# Save the submission file
submission_file = 'submission.csv'
test_predictions.to_csv(submission_file, index=False)
print(f"\nSubmission file saved to: {submission_file}")

Training final model for Pastry...
Training final model for Z_Scratch...
Training final model for K_Scatch...
Training final model for Stains...
Training final model for Dirtiness...
Training final model for Bumps...
Training final model for Other_Faults...
Predicting probabilities for Pastry...
Predicting probabilities for Z_Scratch...
Predicting probabilities for K_Scatch...
Predicting probabilities for Stains...
Predicting probabilities for Dirtiness...
Predicting probabilities for Bumps...
Predicting probabilities for Other_Faults...

Submission file saved to: submission.csv
