In [3]:
# customer_model_training.py

import os
import pandas as pd
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Load data
df = pd.read_csv("data/Passenger_Satisfaction.csv").dropna()

# Encode categorical columns
le = LabelEncoder()
for col in ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'satisfaction']:
    df[col] = le.fit_transform(df[col])

# Features and target
X = df.drop(['id', 'satisfaction'], axis=1)
y = df['satisfaction']

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Models
models = {
    "logistic_regression": LogisticRegression(max_iter=1000),
    "random_forest": RandomForestClassifier(),
    "gradient_boosting": GradientBoostingClassifier(),
    "xgboost": XGBClassifier(eval_metric='logloss'),  # 🔧 fixed: removed deprecated param
    "knn": KNeighborsClassifier()
}

# Save directory
model_dir = Path("customer_satisfaction_prediction/models")
model_dir.mkdir(parents=True, exist_ok=True)

# Save scaler (compressed)
scaler_path = model_dir / "scaler_compressed.pkl"
joblib.dump(scaler, scaler_path, compress=("xz", 3))
print(f"✅ Saved scaler - Size: {scaler_path.stat().st_size / (1024 * 1024):.2f} MB")

# Train and save compressed models
for name, model in models.items():
    print(f"\n🔧 Training and saving: {name} ...")
    model.fit(X_train, y_train)

    model_path = model_dir / f"{name}_compressed.pkl"
    joblib.dump(model, model_path, compress=("xz", 3))

    size_mb = model_path.stat().st_size / (1024 * 1024)
    print(f"✅ Saved {model_path.name} - Size: {size_mb:.2f} MB")

    if size_mb > 25:
        print(f"⚠️ Warning: {model_path.name} exceeds 25MB")

print("\n✅ All customer satisfaction models and scaler saved successfully!")


✅ Saved scaler - Size: 0.00 MB

🔧 Training and saving: logistic_regression ...
✅ Saved logistic_regression_compressed.pkl - Size: 0.00 MB

🔧 Training and saving: random_forest ...
✅ Saved random_forest_compressed.pkl - Size: 7.27 MB

🔧 Training and saving: gradient_boosting ...
✅ Saved gradient_boosting_compressed.pkl - Size: 0.05 MB

🔧 Training and saving: xgboost ...
✅ Saved xgboost_compressed.pkl - Size: 0.10 MB

🔧 Training and saving: knn ...
✅ Saved knn_compressed.pkl - Size: 1.80 MB

✅ All customer satisfaction models and scaler saved successfully!
