In [9]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Load your dataset
# Assuming your dataset is in a CSV file
df = pd.read_csv('customer_purchase_data.csv')

# For demonstration, let's create a sample dataset
# Replace this with your actual dataset loading code
np.random.seed(42)
data = {
    'Age': np.random.randint(18, 70, 100),
    'Gender': np.random.choice(['Male', 'Female'], 100),
    'Annual Income': np.random.randint(20000, 100000, 100),
    'Number of Purchases': np.random.randint(1, 20, 100),
    'Time Spent on Website': np.random.randint(5, 200, 100),
    'Discounts Availed': np.random.randint(0, 50, 100),
    'Product Category': np.random.choice(['Electronics', 'Clothing', 'Groceries'], 100),
    'Loyalty Program': np.random.choice(['Yes', 'No'], 100),
    'Purchased': np.random.randint(0, 2, 100)  # Binary target variable
}
df = pd.DataFrame(data)

# Step 1: Data Preprocessing
# Handling Missing Values
df.isnull().sum()  # Check for missing values

# Encoding Categorical Variables
categorical_features = ['Gender', 'Product Category', 'Loyalty Program']
numerical_features = ['Age', 'Annual Income', 'Number of Purchases', 'Time Spent on Website', 'Discounts Availed']

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Step 2: Exploratory Data Analysis (EDA)
# Visualize the distribution of data
sns.pairplot(df, hue='Purchased')
plt.show()

# Check feature correlations
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True)
plt.show()

# Step 3: Model Selection
# Define the models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC(probability=True, random_state=42)
}

# Split the dataset into training and testing sets
X = df.drop('Purchased', axis=1)
y = df['Purchased']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Model Training and Evaluation
results = {}
for model_name, model in models.items():
    # Create a pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results[model_name] = {
        'accuracy': accuracy,
        'classification_report': classification_rep,
        'confusion_matrix': confusion_mat,
        'roc_auc': roc_auc
    }

# Display the results
for model_name, metrics in results.items():
    print(f"\n{model_name} Model")
    print("Accuracy:", metrics['accuracy'])
    print("\nClassification Report:\n", metrics['classification_report'])
    print("\nConfusion Matrix:\n", metrics['confusion_matrix'])
    print("ROC-AUC Score:", metrics['roc_auc'])

# Step 5: Model Tuning
# Fine-tune the best-performing model (e.g., Random Forest) using Grid Search
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    estimator=Pipeline(steps=[('preprocessor', preprocessor), ('model', RandomForestClassifier(random_state=42))]),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("\nBest Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
y_pred_best_proba = best_model.predict_proba(X_test)[:, 1]

accuracy_best = accuracy_score(y_test, y_pred_best)
classification_rep_best = classification_report(y_test, y_pred_best)
confusion_mat_best = confusion_matrix(y_test, y_pred_best)
roc_auc_best = roc_auc_score(y_test, y_pred_best_proba)

print("\nBest Model Evaluation on Test Set")
print("Accuracy:", accuracy_best)
print("\nClassification Report:\n", classification_rep_best)
print("\nConfusion Matrix:\n", confusion_mat_best)
print("ROC-AUC Score:", roc_auc_best)


FileNotFoundError: [Errno 2] No such file or directory: 'customer_purchase_data.csv'