In [10]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [11]:
# Load dataset.
# The CSV location can be overridden with the CUSTOMER_PURCHASE_CSV environment
# variable so the notebook is not tied to one machine's directory layout; when
# the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "CUSTOMER_PURCHASE_CSV",
    r"C:\Users\Administrator\Downloads\customer_purchase_data.csv",
)
df = pd.read_csv(DATA_PATH)

# Display the first few rows
print(df.head())


   Age  Gender   AnnualIncome  NumberOfPurchases  ProductCategory  \
0   40       1   66120.267939                  8                0   
1   20       1   23579.773583                  4                2   
2   27       1  127821.306432                 11                2   
3   24       1  137798.623120                 19                3   
4   31       1   99300.964220                 19                1   

   TimeSpentOnWebsite  LoyaltyProgram  DiscountsAvailed  PurchaseStatus  
0           30.568601               0                 5               1  
1           38.240097               0                 5               0  
2           31.633212               1                 0               1  
3           46.167059               0                 4               1  
4           19.823592               0                 0               1  


In [12]:
# Report how many missing values each column contains
missing_per_column = df.isna().sum()
print(missing_per_column)

# Discard incomplete rows, then cast the integer-coded categorical columns
# (Gender, ProductCategory, LoyaltyProgram) to int in a single pass
df = df.dropna().astype({'Gender': int, 'ProductCategory': int, 'LoyaltyProgram': int})


Age                   0
Gender                0
AnnualIncome          0
NumberOfPurchases     0
ProductCategory       0
TimeSpentOnWebsite    0
LoyaltyProgram        0
DiscountsAvailed      0
PurchaseStatus        0
dtype: int64


In [13]:
# Predictor columns used by the model; Gender, ProductCategory and
# LoyaltyProgram are not part of this selection
feature_columns = ['Age', 'AnnualIncome', 'NumberOfPurchases', 'TimeSpentOnWebsite', 'DiscountsAvailed']
X = df[feature_columns]

# Target: the PurchaseStatus column
y = df['PurchaseStatus']

# Show the dtype of every predictor, then the dtype of the target
print(X.dtypes)
print(y.dtype)


Age                     int64
AnnualIncome          float64
NumberOfPurchases       int64
TimeSpentOnWebsite    float64
DiscountsAvailed        int64
dtype: object
int64


In [19]:
# Hold out 20% of the rows as a test set; the fixed seed makes the
# partition identical on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each partition
print(f"Training Data Shape: {X_train.shape}, {y_train.shape}")
print(f"Testing Data Shape: {X_test.shape}, {y_test.shape}")


Training Data Shape: (1200, 5), (1200,)
Testing Data Shape: (300, 5), (300,)


In [15]:
# Initialize Decision Tree Classifier with default hyperparameters.
# random_state is fixed because the tree builder randomizes the order in which
# features are considered at each split; without a seed the fitted tree, and
# therefore the reported metrics, can differ between runs. 42 matches the seed
# already used for the train/test split.
model = DecisionTreeClassifier(random_state=42)

# Train the model on the training partition
model.fit(X_train, y_train)


In [16]:
# Make predictions: labels produced by the default-parameter tree (`model`)
# for every row of the held-out test features
y_pred = model.predict(X_test)


In [17]:
# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision, recall, F1 and support
report = classification_report(y_test, y_pred)
print(report)


Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.77      0.85      0.81       172
           1       0.77      0.66      0.71       128

    accuracy                           0.77       300
   macro avg       0.77      0.76      0.76       300
weighted avg       0.77      0.77      0.77       300



In [18]:
# Constrained Decision Tree: limit the depth and require minimum sample counts
# for splits and leaves. The values below are hard-coded, not searched.
tuned_params = {
    "max_depth": 5,
    "min_samples_split": 10,
    "min_samples_leaf": 5,
    "random_state": 42,
}
model_tuned = DecisionTreeClassifier(**tuned_params).fit(X_train, y_train)

# Labels predicted by the constrained tree on the test partition
y_pred_tuned = model_tuned.predict(X_test)

# Accuracy and per-class report for the constrained tree
print(f"Tuned Model Accuracy: {accuracy_score(y_test, y_pred_tuned):.2f}")
print(classification_report(y_test, y_pred_tuned))


Tuned Model Accuracy: 0.83
              precision    recall  f1-score   support

           0       0.79      0.95      0.86       172
           1       0.90      0.66      0.77       128

    accuracy                           0.83       300
   macro avg       0.85      0.81      0.81       300
weighted avg       0.84      0.83      0.82       300

