In [None]:
import warnings

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tf_keras import layers, models, utils
warnings.filterwarnings('ignore')

In [None]:

# Read the purchase-details and user-demographics CSV files.
df_purchase = pd.read_csv("User_product_purchase_details_p2.csv")
df_user = pd.read_csv("user_demographics.csv")

# Left join keeps every purchase row. validate="many_to_one" makes pandas raise
# a MergeError if User_ID is duplicated in df_user; without it a duplicated
# user would silently multiply purchase rows and skew every count downstream.
df = pd.merge(
    df_purchase,
    df_user,
    on="User_ID",
    how="left",
    validate="many_to_one",
)


In [4]:
# Report the (rows, columns) tuple of the merged frame.
print(f"Dataset shape after merge: {df.shape}")

Dataset shape after merge: (550068, 12)


In [5]:
# Plain-text preview of the first five rows of the merged frame.
print(df.head(5))

   User_ID Product_ID City_Category Stay_In_Current_City_Years  \
0  1000001  P00069042             A                          2   
1  1000001  P00248942             A                          2   
2  1000001  P00087842             A                          2   
3  1000001  P00085442             A                          2   
4  1000002  P00285442             C                         4+   

   Marital_Status  Product_Category_1  Product_Category_2  Product_Category_3  \
0               0                   3                 NaN                 NaN   
1               0                   1                 6.0                14.0   
2               0                  12                 NaN                 NaN   
3               0                  12                14.0                 NaN   
4               0                   8                 NaN                 NaN   

   Purchase Gender   Age  Occupation  
0      8370      F  0-17          10  
1     15200      F  0-17          10  

In [6]:
# Binary target: 1 when Purchase is at least 10000, otherwise 0.
df["High_Value_Purchase"] = df["Purchase"].ge(10000).astype(int)

# Show how many rows fall into each class.
print(
    "\nTarget distribution:",
    df["High_Value_Purchase"].value_counts(),
    sep="\n",
)


Target distribution:
High_Value_Purchase
0    360529
1    189539
Name: count, dtype: int64


In [7]:
# Remove the two identifier columns; they are not used as model features.
df = df.drop(columns=["Product_ID", "User_ID"])

# Handle missing values
# Print the per-column NaN count first, then replace every NaN with 0.
print("\nMissing values before filling:", df.isna().sum(), sep="\n")

df = df.fillna(0)



Missing values before filling:
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
Gender                             0
Age                                0
Occupation                         0
High_Value_Purchase                0
dtype: int64


In [8]:
# Encode categorical variables using one-hot encoding
categorical_cols = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status']

# Keep only the requested columns that are present in the frame, in list order.
existing_categorical_cols = list(
    filter(lambda name: name in df.columns, categorical_cols)
)

# drop_first=True omits the first level of every encoded column.
# NOTE(review): Occupation and Product_Category_1..3 are not in the list above,
# so they stay as plain numeric columns — confirm that treating these codes as
# ordered numbers is intended.
df = pd.get_dummies(
    df,
    columns=existing_categorical_cols,
    drop_first=True,
)


In [9]:
# Report the frame's shape and the full column list after one-hot encoding.
print(f"\nDataset shape after encoding: {df.shape}")
print("\nColumn names after encoding:", df.columns.tolist(), sep="\n")


Dataset shape after encoding: (550068, 20)

Column names after encoding:
['Product_Category_1', 'Product_Category_2', 'Product_Category_3', 'Purchase', 'Occupation', 'High_Value_Purchase', 'Gender_M', 'Age_18-25', 'Age_26-35', 'Age_36-45', 'Age_46-50', 'Age_51-55', 'Age_55+', 'City_Category_B', 'City_Category_C', 'Stay_In_Current_City_Years_1', 'Stay_In_Current_City_Years_2', 'Stay_In_Current_City_Years_3', 'Stay_In_Current_City_Years_4+', 'Marital_Status_1']


In [10]:
# Prepare features and target
X = df.drop(["High_Value_Purchase", "Purchase"], axis=1)
y = df["High_Value_Purchase"]


In [12]:
# Train-test split (80-20)
# train_test_split is imported once in the notebook's top import cell, so it is
# not re-imported here.
# stratify=y splits in a stratified fashion using the target as class labels;
# random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [13]:
# Row counts of the two partitions produced by the split.
print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


Training set size: 440054
Test set size: 110014


In [15]:
# Scale numerical features
# StandardScaler is imported once in the notebook's top import cell, so it is
# not re-imported here.
# The scaler is fitted on the training partition only and the fitted transform
# is then applied to the test partition.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
print("\n" + "="*50)
print("LOGISTIC REGRESSION MODEL")
print("="*50)

# LogisticRegression and the metric helpers are imported once in the notebook's
# top import cell; the duplicate mid-cell imports were removed.

# Fit the classifier on the scaled training data. random_state is fixed for
# repeatability; max_iter is raised to 2000.
log = LogisticRegression(max_iter=2000, random_state=42)
log.fit(X_train_scaled, y_train)

# Class predictions for the held-out test partition.
pred_lr = log.predict(X_test_scaled)

print("\nLR Accuracy:", accuracy_score(y_test, pred_lr))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, pred_lr))
print("\nClassification Report:")
print(classification_report(y_test, pred_lr))

# Feature importance
# Rank features by the absolute value of their fitted coefficient. The model
# was trained on the StandardScaler output, so the magnitudes refer to the
# scaled features.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': np.abs(log.coef_[0])
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


LOGISTIC REGRESSION MODEL

LR Accuracy: 0.7661752140636646

Confusion Matrix:
[[64702  7404]
 [18320 19588]]

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.90      0.83     72106
           1       0.73      0.52      0.60     37908

    accuracy                           0.77    110014
   macro avg       0.75      0.71      0.72    110014
weighted avg       0.76      0.77      0.75    110014


Top 10 Most Important Features:
               feature  importance
0   Product_Category_1    0.845970
2   Product_Category_3    0.430037
4             Gender_M    0.110769
12     City_Category_C    0.109819
7            Age_36-45    0.076448
6            Age_26-35    0.072334
9            Age_51-55    0.064008
8            Age_46-50    0.043494
11     City_Category_B    0.036939
5            Age_18-25    0.036241


In [21]:
print("\n" + "="*50)
print("MLP NEURAL NETWORK MODEL")
print("="*50)

# models, layers and utils come from the notebook's top import cell.
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling repeat between runs; without this the
# reported accuracy changes on every execution.
utils.set_random_seed(42)

# Build MLP model
# Two hidden ReLU layers (64 and 32 units) and a single sigmoid output unit.
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile model
model.compile(
    optimizer="adam", 
    loss="binary_crossentropy", 
    metrics=["accuracy"]
)

# Display model architecture
print("\nModel Architecture:")
model.summary()

# Train model
# validation_split=0.2 holds out part of the training data for per-epoch
# validation; the test partition is not used during fitting.
print("\nTraining MLP...")
history = model.fit(
    X_train_scaled, 
    y_train, 
    epochs=20, 
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Evaluate on test set
loss, acc = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"\nMLP Test Accuracy: {acc:.4f}")
print(f"MLP Test Loss: {loss:.4f}")

# Get predictions for confusion matrix
# The single-unit output layer yields one probability per row as an (n, 1)
# array; threshold at 0.5 and flatten to a 1-D label vector like pred_lr.
pred_mlp = (model.predict(X_test_scaled) > 0.5).astype(int).ravel()

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, pred_mlp))
print("\nClassification Report:")
print(classification_report(y_test, pred_mlp))



MLP NEURAL NETWORK MODEL




Model Architecture:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1216      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 3329 (13.00 KB)
Trainable params: 3329 (13.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________

Training MLP...
Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/2

In [22]:
# Section banner.
print("\n" + "=" * 50, "MODEL COMPARISON", "=" * 50, sep="\n")

# Test accuracy of each model: recomputed from pred_lr for logistic
# regression, taken from the value returned by model.evaluate for the MLP.
lr_accuracy = accuracy_score(y_test, pred_lr)
mlp_accuracy = acc

print(
    f"\nLogistic Regression Accuracy: {lr_accuracy:.4f}",
    f"MLP Neural Network Accuracy: {mlp_accuracy:.4f}",
    f"\nDifference: {abs(mlp_accuracy - lr_accuracy):.4f}",
    sep="\n",
)

# Choose the three-line verdict for whichever model scored higher
# (or the tie message when neither comparison holds).
if mlp_accuracy > lr_accuracy:
    verdict = (
        "\n✓ MLP performed better!",
        "Reason: Neural networks can capture non-linear relationships",
        "between features that logistic regression cannot model.",
    )
elif lr_accuracy > mlp_accuracy:
    verdict = (
        "\n✓ Logistic Regression performed better!",
        "Reason: The relationship might be primarily linear, or the",
        "neural network may be overfitting the training data.",
    )
else:
    verdict = (
        "\n✓ Both models performed equally!",
        "Reason: The problem might have simple linear patterns that",
        "both models can capture effectively.",
    )
print(*verdict, sep="\n")

print("\n" + "=" * 50)



MODEL COMPARISON

Logistic Regression Accuracy: 0.7662
MLP Neural Network Accuracy: 0.8989

Difference: 0.1327

✓ MLP performed better!
Reason: Neural networks can capture non-linear relationships
between features that logistic regression cannot model.

