In [7]:
import os
import pickle  # For saving the model

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score


# Load the data
df = pd.read_csv('/Users/mathushan/Documents/fdm/ecommerce-return-prediction/services/data/processed/ecommerce_returns_synthetic_data_preprocessed.csv')


# Inspect the dataset
print("Dataset shape:", df.shape)
print("\nTarget variable distribution:")
print(df['Return_Flag_fixed'].value_counts(normalize=True))

# Separate features and target variable
X = df.drop('Return_Flag_fixed', axis=1)
y = df['Return_Flag_fixed']

# Remove leaky feature
columns_to_remove = ['Days_to_Return_filled2']
X = X.drop(columns=[col for col in columns_to_remove if col in X.columns])

# Show selected features
print("\nSelected features:", X.columns.tolist())

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Random Forest with strict parameters
rf = RandomForestClassifier(
    n_estimators=10,        # Few trees
    max_depth=2,            # Shallow trees
    min_samples_split=100,  # High min split
    min_samples_leaf=50,    # High min leaf
    max_features='sqrt',
    random_state=42
)

# Train the model
rf.fit(X_train, y_train)

# Predictions
y_pred = rf.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nRandom Forest Test Accuracy: {accuracy:.4f}")

# Cross-validation
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')
print(f"Random Forest CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Classification report
print(f"\nClassification Report (Random Forest):")
print(classification_report(y_test, y_pred))

# Save the trained model to a .pkl file
#with open('random_forest_model.pkl', 'wb') as file:
    #pickle.dump(rf, file)

#print("\nRandom Forest model saved as 'random_forest_model.pkl'")


Dataset shape: (10000, 16)

Target variable distribution:
Return_Flag_fixed
1    0.5052
0    0.4948
Name: proportion, dtype: float64

Selected features: ['Product_Category', 'Product_Price', 'Order_Quantity', 'Return_Reason', 'User_Age', 'User_Gender', 'Payment_Method', 'Shipping_Method', 'Discount_Applied', 'Total_Order_Value', 'Order_Year', 'Order_Month', 'Order_Weekday', 'User_Location_Num']

Random Forest Test Accuracy: 0.7520
Random Forest CV Accuracy: 0.8111 ± 0.0828

Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.67      0.99      0.80       990
           1       0.98      0.52      0.68      1010

    accuracy                           0.75      2000
   macro avg       0.82      0.75      0.74      2000
weighted avg       0.82      0.75      0.74      2000

