In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from scipy import stats
from catboost import CatBoostClassifier, Pool

# Constants
ROOT_DIR = "data"
RANDOM_STATE = 110

# Load training data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

# Drop columns with only one unique value
train_data = train_data.loc[:, train_data.nunique() > 1]

# Replace 'OK' with NaN
train_data.replace("OK", np.nan, inplace=True)

# Drop duplicated columns
train_data = train_data.loc[:, ~train_data.T.duplicated()]

# Drop unneeded columns
drop_cols = [
    'Equipment_Fill1', 'Equipment_Fill2', 'Chamber Temp. Judge Value_AutoClave',
    'Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam'
]
train_data.drop(columns=drop_cols, inplace=True, errors='ignore')

# Convert specific columns to float
float_columns = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'
]
train_data[float_columns] = train_data[float_columns].astype(float)

# Feature/label split
X = train_data.drop('target', axis=1)
y = train_data['target']

# Visualize missing values
plt.figure(figsize=(12, 6))
sns.heatmap(train_data.isnull(), cbar=False, cmap='viridis')
plt.title("Missing Values Heatmap")
plt.tight_layout()
plt.savefig("missing_values_heatmap.png")
plt.close()

# Z-score normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Outlier removal using Z-score
z_scores = stats.zscore(X_scaled)
filtered_indices = (np.abs(z_scores) < 3).all(axis=1)
X_filtered = X_scaled[filtered_indices]
y_filtered = y[filtered_indices]

# SMOTE for imbalance handling
smote = SMOTE(random_state=RANDOM_STATE)
X_resampled, y_resampled = smote.fit_resample(X_filtered, y_filtered)

# Remove highly correlated features
X_df = pd.DataFrame(X_resampled, columns=X.columns)
correlation_matrix = X_df.corr()

# Correlation heatmap
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0)
plt.title("Feature Correlation Heatmap")
plt.tight_layout()
plt.savefig("correlation_heatmap.png")
plt.close()

high_corr_pairs = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            high_corr_pairs.add(correlation_matrix.columns[i])
X_df.drop(columns=high_corr_pairs, inplace=True)

# Combine features and target
train_data_final = pd.concat([X_df, pd.DataFrame(y_resampled, columns=['target'])], axis=1)

# Try converting columns to int where possible
features = []
for col in train_data_final.columns:
    try:
        train_data_final[col] = train_data_final[col].astype(int)
        features.append(col)
    except:
        continue

# Final training set
train_x = train_data_final[features]
train_y = train_data_final['target']

# GridSearchCV for CatBoost
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1],
    'iterations': [100, 200]
}
cat_model = CatBoostClassifier(random_state=RANDOM_STATE, verbose=0)
grid_search = GridSearchCV(cat_model, param_grid, cv=3, scoring='f1_macro')
grid_search.fit(train_x, train_y)

# Best model
model = grid_search.best_estimator_

# Feature Importance Visualization
feature_importances = model.get_feature_importance(Pool(train_x, label=train_y))
plt.figure(figsize=(12, 8))
sns.barplot(x=feature_importances, y=train_x.columns)
plt.title("CatBoost Feature Importances (Optimized)")
plt.xlabel("Importance")
plt.tight_layout()
plt.savefig("feature_importance_optimized.png")
plt.close()

# Predict on test set
# Ensure df_test_x is preloaded
for col in df_test_x.columns:
    try:
        df_test_x[col] = df_test_x[col].astype(int)
    except:
        continue

test_pred = model.predict(df_test_x)

# Prepare submission
submission = pd.read_csv("submission.csv")
submission["target"] = test_pred
submission.to_csv("submission-optimized.csv", index=False)