In [6]:
# ==========================
# 1. Import Library
# ==========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# ==========================
# 2. Load Dataset
# ==========================
df = pd.read_csv("train.csv")

# Misalnya target harga rumah
y = df["Tm"]   # atau ganti dengan "Tm" di dataset kamu
X = df.drop(columns=["Tm"])

# ==========================
# 3. Train / Valid Split
# ==========================
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================
# 4. Preprocessing
# ==========================
# Pisahkan kolom numerik & kategorikal
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

# Preprocessor: numerik → langsung, kategorikal → OneHot
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# ==========================
# 5. Buat Pipeline dengan XGBoost (FIXED)
# ==========================
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("xgb", XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        enable_categorical=True  # ADD THIS LINE
    ))
])

# ==========================
# 6. Training (FIXED - remove eval_set from fit method)
# ==========================
model.fit(X_train, y_train)

# ==========================
# 7. Evaluasi
# ==========================
y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
print(f"XGBoost MAE: {mae:.4f}")

XGBoost MAE: 34.6205


In [7]:
# ==========================
# 1. Import Library
# ==========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# ==========================
# 2. Load Dataset
# ==========================
df = pd.read_csv("train.csv")

# Misalnya target harga rumah
y = df["Tm"]
X = df.drop(columns=["Tm"])

# ==========================
# 3. Train / Valid Split
# ==========================
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================
# 4. Manual Preprocessing
# ==========================
# Pisahkan kolom numerik & kategorikal
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

# OneHot encoding untuk kategorikal
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat_encoded = encoder.fit_transform(X_train[cat_cols])
X_valid_cat_encoded = encoder.transform(X_valid[cat_cols])

# Gabungkan dengan numerik
X_train_processed = np.hstack([X_train[num_cols].values, X_train_cat_encoded])
X_valid_processed = np.hstack([X_valid[num_cols].values, X_valid_cat_encoded])

# ==========================
# 5. Training dengan XGBoost
# ==========================
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model.fit(
    X_train_processed, y_train,
    eval_set=[(X_valid_processed, y_valid)],
    verbose=100
)

# ==========================
# 6. Evaluasi
# ==========================
y_pred = model.predict(X_valid_processed)
mae = mean_absolute_error(y_valid, y_pred)
print(f"XGBoost MAE: {mae:.4f}")

[0]	validation_0-rmse:85.71075
[100]	validation_0-rmse:58.57734
[200]	validation_0-rmse:55.79374
[300]	validation_0-rmse:54.08731
[400]	validation_0-rmse:53.36911
[500]	validation_0-rmse:52.92361
[600]	validation_0-rmse:52.76553
[700]	validation_0-rmse:52.47890
[800]	validation_0-rmse:52.24709
[900]	validation_0-rmse:52.12436
[999]	validation_0-rmse:51.99552
XGBoost MAE: 34.8713


In [8]:
# ==========================
# 1. Import Library
# ==========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# ==========================
# 2. Load Dataset
# ==========================
df = pd.read_csv("train.csv")

y = df["Tm"]
X = df.drop(columns=["Tm"])

# ==========================
# 3. Train / Valid Split
# ==========================
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================
# 4. Preprocessing
# ==========================
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# ==========================
# 5. Preprocess data manually for eval_set
# ==========================
X_train_processed = preprocessor.fit_transform(X_train)
X_valid_processed = preprocessor.transform(X_valid)

# ==========================
# 6. Training dengan early stopping
# ==========================
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50
)

model.fit(
    X_train_processed, y_train,
    eval_set=[(X_valid_processed, y_valid)],
    verbose=100
)

# ==========================
# 7. Evaluasi
# ==========================
y_pred = model.predict(X_valid_processed)
mae = mean_absolute_error(y_valid, y_pred)
print(f"XGBoost MAE: {mae:.4f}")

[0]	validation_0-rmse:85.56426
[100]	validation_0-rmse:57.62781
[200]	validation_0-rmse:54.89076
[300]	validation_0-rmse:53.62912
[400]	validation_0-rmse:52.92441
[500]	validation_0-rmse:52.50955
[600]	validation_0-rmse:52.24292
[700]	validation_0-rmse:51.99659
[800]	validation_0-rmse:51.78125
[900]	validation_0-rmse:51.69864
[999]	validation_0-rmse:51.58549
XGBoost MAE: 34.6302


In [9]:
df_test = pd.read_csv("test.csv")

# Prediksi
y_pred = model.predict(df_test)

# Buat submission file
submission = pd.DataFrame({
    "id": df_test["id"],
    "Tm": y_pred
})

submission.to_csv("submission.csv", index=False)
print("File submission.csv berhasil dibuat!")


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:SMILES: object

In [None]:
# ==========================
# 1. Import Library
# ==========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# ==========================
# 2. Load Dataset
# ==========================
df = pd.read_csv("train.csv")
y = df["Tm"]
X = df.drop(columns=["Tm"])

# ==========================
# 3. Train / Valid Split
# ==========================
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================
# 4. Preprocessing
# ==========================
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# ==========================
# 5. Buat Pipeline dengan XGBoost
# ==========================
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("xgb", XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        enable_categorical=True
    ))
])

# ==========================
# 6. Training
# ==========================
model.fit(X_train, y_train)

# ==========================
# 7. Evaluasi
# ==========================
y_pred_valid = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred_valid)
print(f"XGBoost MAE: {mae:.4f}")

# ==========================
# 8. PREDIKSI DATA TEST
# ==========================
df_test = pd.read_csv("test.csv")
y_pred_test = model.predict(df_test)

# Buat submission file
submission = pd.DataFrame({
    "id": df_test["id"],
    "Tm": y_pred_test
})

submission.to_csv("submission.csv", index=False)
print("File submission.csv berhasil dibuat!")
print(f"Shape submission: {submission.shape}")

XGBoost MAE: 34.6205
File submission.csv berhasil dibuat!
Shape submission: (666, 2)


In [None]:
# ==========================
# 1. IMPORT LIBRARIES
# ==========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder, RobustScaler, QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from xgboost import XGBRegressor
from scipy import stats
from scipy.stats.mstats import winsorize
import warnings
warnings.filterwarnings('ignore')

# ==========================
# 2. LOAD & EXPLORE DATA
# ==========================
print("Loading data...")
df = pd.read_csv("train.csv")
print(f"Data shape: {df.shape}")

# Pastikan target exists
if 'Tm' not in df.columns:
    raise ValueError("Column 'Tm' not found in dataset!")

y = df["Tm"]
X = df.drop(columns=["Tm"])

print(f"Target stats - Min: {y.min():.2f}, Max: {y.max():.2f}, Mean: {y.mean():.2f}")

# ==========================
# 3. ADVANCED FEATURE ENGINEERING
# ==========================
print("Engineering features...")

# 3.1 Basic feature types
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print(f"Numeric columns: {len(num_cols)}")
print(f"Categorical columns: {len(cat_cols)}")

# 3.2 Create synthetic features (jika ada numeric columns)
if num_cols:
    # Polynomial features sederhana
    for i, col1 in enumerate(num_cols):
        for col2 in num_cols[i+1:]:
            X[f'{col1}_x_{col2}'] = X[col1] * X[col2]
            X[f'{col1}_div_{col2}'] = X[col1] / (X[col2] + 1e-8)  # avoid division by zero

# 3.3 Statistical features untuk categorical columns
for col in cat_cols:
    # Group statistical features
    group_stats = X.groupby(col)[num_cols].transform(['mean', 'std']).fillna(0)
    group_stats.columns = [f'{col}_{stat}_{num}' for num in num_cols for stat in ['mean', 'std']]
    X = pd.concat([X, group_stats], axis=1)

# Update columns setelah feature engineering
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print(f"After engineering - Numeric: {len(num_cols)}, Categorical: {len(cat_cols)}")

# ==========================
# 4. OUTLIER HANDLING
# ==========================
print("Handling outliers...")

# 4.1 Detect outliers menggunakan IQR method
outlier_mask = np.zeros(len(X), dtype=bool)

for col in num_cols:
    if X[col].nunique() > 5:  # Hanya untuk continuous variables
        Q1 = X[col].quantile(0.25)
        Q3 = X[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        col_outliers = (X[col] < lower_bound) | (X[col] > upper_bound)
        outlier_mask = outlier_mask | col_outliers

print(f"Found {outlier_mask.sum()} outliers ({outlier_mask.sum()/len(X)*100:.1f}%)")

# 4.2 Remove outliers
X_clean = X[~outlier_mask].copy()
y_clean = y[~outlier_mask].copy()

print(f"Data after outlier removal: {X_clean.shape}")

# ==========================
# 5. TARGET TRANSFORMATION
# ==========================
print("Transforming target...")

# Cek skewness target
skewness = stats.skew(y_clean)
print(f"Target skewness: {skewness:.3f}")

if abs(skewness) > 1:  # Jika highly skewed
    y_transformed = np.log1p(y_clean)
    use_log_transform = True
    print("Applied log transformation to target")
else:
    y_transformed = y_clean.copy()
    use_log_transform = False
    print("No transformation needed for target")

# ==========================
# 6. TRAIN-VALID SPLIT
# ==========================
X_train, X_valid, y_train, y_valid = train_test_split(
    X_clean, y_transformed, test_size=0.2, random_state=42, shuffle=True
)

print(f"Train shape: {X_train.shape}, Valid shape: {X_valid.shape}")

# ==========================
# 7. PREPROCESSING PIPELINE
# ==========================
print("Building preprocessing pipeline...")

# Preprocessor untuk numeric dan categorical
numeric_transformer = Pipeline(steps=[
    ('scaler', RobustScaler()),  # Robust terhadap outliers
    ('quantile', QuantileTransformer(output_distribution='normal'))  # Normal distribution
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

# ==========================
# 8. ADVANCED MODEL ENSEMBLE
# ==========================
print("Building ensemble model...")

# Base models
base_models = [
    ('xgb1', XGBRegressor(
        n_estimators=800,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )),
    ('xgb2', XGBRegressor(
        n_estimators=600,
        learning_rate=0.1,
        max_depth=4,
        subsample=0.9,
        colsample_bytree=0.7,
        random_state=123
    )),
    ('rf', RandomForestRegressor(
        n_estimators=200,
        max_depth=8,
        random_state=42,
        n_jobs=-1
    )),
    ('ridge', Ridge(alpha=1.0))
]

# Final meta-model
meta_model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    random_state=42
)

# Stacking ensemble
ensemble_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=True  # Gunakan original features juga
)

# Full pipeline dengan preprocessing
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('ensemble', ensemble_model)
])

# ==========================
# 9. TRAINING WITH CROSS-VALIDATION
# ==========================
print("Training model with cross-validation...")

# Cross-validation untuk evaluasi
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(full_pipeline, X_train, y_train, 
                           cv=kf, scoring='neg_mean_absolute_error')

print(f"Cross-validation MAE: {-cv_scores.mean():.4f} (+/- {-cv_scores.std() * 2:.4f})")

# Train final model pada seluruh training data
print("Training final model...")
full_pipeline.fit(X_train, y_train)

# ==========================
# 10. VALIDATION EVALUATION
# ==========================
print("Evaluating on validation set...")

# Predict on validation set
y_valid_pred = full_pipeline.predict(X_valid)

# Reverse transformation jika digunakan
if use_log_transform:
    y_valid_pred = np.expm1(y_valid_pred)
    y_valid_actual = np.expm1(y_valid)
else:
    y_valid_actual = y_valid

# Calculate MAE
val_mae = mean_absolute_error(y_valid_actual, y_valid_pred)
print(f"Validation MAE: {val_mae:.4f}")

'''
# ==========================
# 11. PREDICT ON TEST DATA
# ==========================
print("Making predictions on test data...")

# Load test data
df_test = pd.read_csv("test.csv")

# Apply same feature engineering ke test data
if num_cols:  # Hanya jika ada numeric columns original
    original_num_cols = [col for col in num_cols if col in df_test.columns and 
                        not any(x in col for x in ['_x_', '_div_', '_mean_', '_std_'])]
    
    for i, col1 in enumerate(original_num_cols):
        for col2 in original_num_cols[i+1:]:
            if col1 in df_test.columns and col2 in df_test.columns:
                df_test[f'{col1}_x_{col2}'] = df_test[col1] * df_test[col2]
                df_test[f'{col1}_div_{col2}'] = df_test[col1] / (df_test[col2] + 1e-8)

# Statistical features untuk test data
for col in cat_cols:
    if col in df_test.columns:
        # Gunakan stats dari training data untuk consistency
        for num_col in original_num_cols:
            if f'{col}_mean_{num_col}' in X_train.columns:
                # Untuk test data, kita perlu mapping yang konsisten
                # Di sini kita sederhanakan dengan mean global
                df_test[f'{col}_mean_{num_col}'] = X_train[num_col].mean()
                df_test[f'{col}_std_{num_col}'] = X_train[num_col].std()

# Predict
test_pred = full_pipeline.predict(df_test)

# Reverse transformation
if use_log_transform:
    test_pred = np.expm1(test_pred)

# ==========================
# 12. CREATE SUBMISSION FILE
# ==========================
print("Creating submission file...")

# Buat submission
submission = pd.DataFrame({
    "id": df_test["id"] if 'id' in df_test.columns else df_test.index,
    "Tm": test_pred
})

# Post-processing: clip predictions ke range yang reasonable
if 'Tm' in df.columns:
    tm_min, tm_max = df['Tm'].min(), df['Tm'].max()
    submission['Tm'] = submission['Tm'].clip(tm_min * 0.9, tm_max * 1.1)

submission.to_csv("submission_advanced.csv", index=False)

print("=" * 50)
print("FINAL RESULTS:")
print(f"Cross-validation MAE: {-cv_scores.mean():.4f}")
print(f"Validation MAE: {val_mae:.4f}")
print(f"Test predictions shape: {submission.shape}")
print("File saved: submission_advanced.csv")
print("=" * 50)

# ==========================
# 13. FEATURE IMPORTANCE (Optional)
# ==========================
# Untuk XGBoost feature importance
try:
    # Get feature names setelah preprocessing
    preprocessor.fit(X_train)
    feature_names = []
    
    # Numeric features
    feature_names.extend(num_cols)
    
    # Categorical features (setelah one-hot)
    if cat_cols:
        ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
        cat_features = ohe.get_feature_names_out(cat_cols)
        feature_names.extend(cat_features)
    
    # Get feature importance dari meta-model
    meta_model = full_pipeline.named_steps['ensemble'].final_estimator_
    
    if hasattr(meta_model, 'feature_importances_'):
        importance_df = pd.DataFrame({
            'feature': feature_names[:len(meta_model.feature_importances_)],
            'importance': meta_model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        print("\nTop 10 Feature Importances:")
        print(importance_df.head(10))
        
except Exception as e:
    print(f"Feature importance analysis skipped: {e}")

print("\nAdvanced pipeline completed successfully! 🚀")
'''

Loading data...
Data shape: (2662, 427)
Target stats - Min: 53.54, Max: 897.15, Mean: 278.26
Engineering features...
Numeric columns: 425
Categorical columns: 1


TypeError: unhashable type: 'list'