In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

# ------------------------------
# Step 1: Load the Cleaned Dataset
# ------------------------------
merged_data = pd.read_csv("merged_data.csv")

# Ensure Transaction_Date is parsed correctly
merged_data["Transaction_Date"] = pd.to_datetime(merged_data["Transaction_Date"], errors="coerce")

# DEBUG: Check available columns
print("âœ… Available columns:", merged_data.columns)

# ------------------------------
# Step 2: Create Recency Feature
# ------------------------------
latest_date = merged_data["Transaction_Date"].max()

# DEBUG: Check latest transaction date
print(f"ðŸ•’ Latest transaction date: {latest_date}")

if pd.isna(latest_date):
    raise ValueError("ðŸš¨ 'latest_date' is NaT (date parsing issue). Check Transaction_Date column!")

# Calculate Recency
merged_data["Recency"] = (latest_date - merged_data["Transaction_Date"]).dt.days

# ------------------------------
# Step 3: Aggregate Customer-Level Data
# ------------------------------
customer_data = merged_data.groupby("Company_ID").agg(
    Total_Spend=("Total_Cost", "sum"),
    Total_Transactions=("Transaction_ID", "count"),
    Avg_Spend=("Total_Cost", "mean"),
    Recency=("Recency", "min")
).reset_index()

# Handle customers with no transactions
customer_data["Recency"] = customer_data["Recency"].fillna(999)  # High value if no transactions

# Merge company profit data
customer_data = customer_data.merge(merged_data[["Company_ID", "Company_Profit"]].drop_duplicates(), on="Company_ID", how="left")

# ------------------------------
# Step 4: Handle Missing Values
# ------------------------------
# Define features and target variable
features = ["Company_Profit", "Recency", "Total_Spend", "Total_Transactions"]
target = "Purchased"

# Create binary target variable (1 = purchased, 0 = no purchase)
customer_data["Purchased"] = (customer_data["Total_Transactions"] > 0).astype(int)

# Define X (Features) and y (Target)
X = customer_data[features]
y = customer_data[target]

# Handle missing values using Imputer (fills NaN with column median)
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(X)  # Convert X to NumPy array after imputation

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# ------------------------------
# Step 5: Handle Class Imbalance with SMOTE
# ------------------------------
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# ------------------------------
# Step 6: Split Data into Training & Testing Sets
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# ------------------------------
# Step 7: Train Logistic Regression Model with Regularization
# ------------------------------
log_model = LogisticRegression(C=0.01)
log_model.fit(X_train, y_train)

# ------------------------------
# Step 8: Make Predictions & Evaluate Model
# ------------------------------
y_pred = log_model.predict(X_test)

# Model Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nðŸš€ Model Accuracy: {accuracy:.2f}")

# Classification Report
print("\nðŸ“Š Classification Report:")
print(classification_report(y_test, y_pred))

# ------------------------------
# Step 9: Cross-Validation
# ------------------------------
cv_scores = cross_val_score(log_model, X_resampled, y_resampled, cv=10)
print("\nðŸ“Š Cross-Validation Accuracy Scores:", cv_scores)
print(f"ðŸ“Œ Mean Cross-Validation Accuracy: {cv_scores.mean():.2f}")


âœ… Available columns: Index(['Transaction_ID', 'Company_ID', 'Product_ID', 'Quantity',
       'Transaction_Date', 'Total_Cost', 'Product_Name', 'Product_Price',
       'Revenue_per_Transaction', 'Company_Name', 'Company_Profit', 'Address'],
      dtype='object')
ðŸ•’ Latest transaction date: 2024-10-28 00:00:00


ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values