In [None]:
# Train product recommender on the merged dataset and save artifacts for the CLI
import os
from pathlib import Path
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss, classification_report, confusion_matrix

# Paths and config
# Name of the column the classifier learns to predict.
TARGET_COL = 'product_category'

# All locations are resolved relative to the parent of the current working
# directory, which is treated as the project root.
NOTEBOOK_DIR = Path.cwd()
ROOT = NOTEBOOK_DIR.parent
DATA_PATH = ROOT.joinpath('Dataset', 'merged_customer_data.csv')
MODELS_DIR = ROOT.joinpath('models')

# Create the artifact folder up front so the save step cannot fail on a missing directory.
MODELS_DIR.mkdir(exist_ok=True)

# Load dataset
# Read the merged customer CSV into memory and report its dimensions.
print('Loading dataset from', DATA_PATH)
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
n_rows, n_cols = df.shape
print('Rows, cols:', (n_rows, n_cols))

# Simple feature engineering
# Calendar features: dates that fail to parse become NaT (errors='coerce')
# and are encoded as month/day 0 so the columns stay integer-typed.
if 'purchase_date' in df.columns:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'], errors='coerce')
    df['purchase_month'] = df['purchase_date'].dt.month.fillna(0).astype(int)
    df['purchase_day'] = df['purchase_date'].dt.day.fillna(0).astype(int)
else:
    df['purchase_month'] = 0
    df['purchase_day'] = 0

# Per-customer aggregates, keyed on the first customer-id column that exists.
# NOTE(review): these aggregates are computed on the full dataset before the
# train/test split, so test rows contribute to training features — confirm
# that this is acceptable for the reported metrics.
cust_id_col = next((c for c in ['customer_id_new', 'customer_id', 'customer_id_legacy'] if c in df.columns), None)
has_purchase_amount = 'purchase_amount' in df.columns
if cust_id_col is not None and has_purchase_amount:
    agg = df.groupby(cust_id_col)['purchase_amount'].agg(mean_purchase_amount='mean', transaction_count='count').reset_index()
    df = df.merge(agg, on=cust_id_col, how='left')
else:
    # Without a customer id (or without amounts) every row is treated as its
    # own single-transaction customer. A scalar 0 broadcasts to every row; the
    # previous pd.Series(0) default aligned on index 0 only and left all the
    # other rows NaN.
    df['mean_purchase_amount'] = df['purchase_amount'] if has_purchase_amount else 0
    df['transaction_count'] = 1

# Ensure target exists and drop rows without it
if TARGET_COL not in df.columns:
    raise KeyError(f"Target column '{TARGET_COL}' not found. Columns: {list(df.columns)}")
# Keep only the rows whose target is present; copy so later writes do not
# touch a view of the unfiltered frame.
has_target = df[TARGET_COL].notna()
df = df.loc[has_target].copy()

# Prepare X, y
# y holds the raw category labels, X everything else.
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Map the category labels to integer codes; the fitted encoder is kept so
# predictions can be translated back to label names later.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
print('Distinct product classes:', len(label_encoder.classes_))

# Train/test split
# Hold out 20% of the rows, stratified on the encoded target so both parts
# keep the class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Preprocessor: auto-detect numeric/categorical
# Identifier columns are excluded from the feature lists: scaling or one-hot
# encoding a customer/transaction id lets the model memorise individual rows
# instead of learning from behaviour. Columns left out of both lists are
# dropped by ColumnTransformer (its default remainder is 'drop'), so callers
# may still pass them at prediction time.
NON_FEATURE_COLUMNS = {'transaction_id', 'customer_id', 'customer_id_new', 'customer_id_legacy'}
numeric_features = [
    col for col in X.select_dtypes(include=['number']).columns
    if col not in NON_FEATURE_COLUMNS
]
categorical_features = [
    col for col in X.select_dtypes(include=['object', 'category', 'bool']).columns
    if col not in NON_FEATURE_COLUMNS
]
print('Numeric features:', numeric_features)
print('Categorical features:', categorical_features)

# NOTE(review): there is no imputation step, so NaNs in the selected columns
# reach the classifier unchanged — confirm the merged dataset is complete.
numeric_transformer = Pipeline([('scaler', StandardScaler())])
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at prediction time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features), ('cat', categorical_transformer, categorical_features)])

# Full pipeline
# Preprocessing and the classifier are chained so the saved artifact can be
# fed raw feature frames directly.
forest = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
clf = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', forest)])
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

# Pin the label sets explicitly. With a small dataset a rare class can be
# missing from the test split; without `labels=` log_loss and
# classification_report would then fail on a label/column count mismatch and
# the confusion matrix would silently lose a row/column.
all_labels = np.arange(len(label_encoder.classes_))
# classification_report expects string names; the encoder's classes may not be strings.
class_names = [str(name) for name in label_encoder.classes_]

acc = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')
# predict_proba columns are ordered like clf.classes_, so that is the label
# set log_loss has to be told about.
ll = log_loss(y_test, y_proba, labels=clf.classes_)

print('\n=== PRODUCT RECOMMENDATION MODEL PERFORMANCE ===')
print(f'Accuracy : {acc:.4f}')
print(f'F1-macro : {f1_macro:.4f}')
print(f'Log Loss : {ll:.4f}')
print('\nClassification report:')
# zero_division=0 reports 0 for classes without predictions/support instead of warning.
print(classification_report(y_test, y_pred, labels=all_labels, target_names=class_names, zero_division=0))
print('\nConfusion matrix:')
print(confusion_matrix(y_test, y_pred, labels=all_labels))

# Save artifacts to models/ directory
# The fitted pipeline and the label encoder are stored side by side; the
# encoder is needed to turn predicted class codes back into product names.
pipeline_path = MODELS_DIR / 'product_recommendation_pipeline.joblib'
encoder_path = MODELS_DIR / 'product_label_encoder.joblib'
for artifact, destination in ((clf, pipeline_path), (label_encoder, encoder_path)):
    joblib.dump(artifact, str(destination))
print(f'\nSaved pipeline to {pipeline_path}')
print(f'Saved label encoder to {encoder_path}')

# Helper function for CLI (loads saved artifacts)
def recommend_products_from_features(features_dict, top_k=3):
    """Load the saved artifacts and return the top-k product recommendations.

    Args:
        features_dict: Mapping of feature name to value for a single user; it
            is turned into a one-row DataFrame and passed to the pipeline.
        top_k: Maximum number of recommendations to return. Must not be
            negative; 0 yields an empty list.

    Returns:
        A list of ``(product_label, probability)`` tuples ordered from the
        most to the least probable class, at most ``top_k`` long.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative bound would silently slice "all but the last k" results.
        raise ValueError(f'top_k must be non-negative, got {top_k}')

    # NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
    # were written by this training script.
    pipe = joblib.load(str(pipeline_path))
    label_enc = joblib.load(str(encoder_path))

    X_user = pd.DataFrame([features_dict])
    proba = pipe.predict_proba(X_user)[0]

    # predict_proba columns follow pipe.classes_ (the encoded labels seen in
    # training), which is not guaranteed to be 0..n-1 — decode those codes
    # rather than assuming positional indices.
    labels = label_enc.inverse_transform(pipe.classes_)

    idx_sorted = np.argsort(proba)[::-1][:top_k]
    return [(labels[i], float(proba[i])) for i in idx_sorted]

# Closing messages: show how the CLI is expected to call the helper.
for closing_line in (
    '\n--- Example usage in CLI after auth:',
    'recs = recommend_products_from_features({...user features...}, top_k=3)',
    'Training complete!',
):
    print(closing_line)

Loading dataset from c:\Users\evotech\Documents\MACHINE LEARNING PROJECTS\Group_5_Multimodal-Data-Preprocessing-Assignment\Dataset\merged_customer_data.csv
Rows, cols: (213, 12)
Distinct product classes: 5
Numeric features: ['engagement_score', 'purchase_interest_score', 'customer_id', 'customer_id_legacy', 'purchase_amount', 'customer_rating', 'purchase_month', 'purchase_day', 'mean_purchase_amount', 'transaction_count']
Categorical features: ['customer_id_new', 'social_media_platform', 'review_sentiment']
Numeric features: ['engagement_score', 'purchase_interest_score', 'customer_id', 'customer_id_legacy', 'purchase_amount', 'customer_rating', 'purchase_month', 'purchase_day', 'mean_purchase_amount', 'transaction_count']
Categorical features: ['customer_id_new', 'social_media_platform', 'review_sentiment']


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'