In [None]:
pip install numpy

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Set seed for reproducibility
np.random.seed(42)

# Generate synthetic data
X, y = make_classification(n_samples=1000, n_features=5, n_classes=3, n_informative=4, n_redundant=1, random_state=42)

# Convert to a DataFrame
df = pd.DataFrame(X, columns=["Customer age", "Purchase frequency", "Average spending", "Product category preference", "Browsing time"])

# Rescale and shift data to realistic values, then convert to integers
df["Customer age"] = ((df["Customer age"] - df["Customer age"].min()) / (df["Customer age"].max() - df["Customer age"].min()) * (65 - 18) + 18).round().astype(int)
df["Purchase frequency"] = ((df["Purchase frequency"] - df["Purchase frequency"].min()) / (df["Purchase frequency"].max() - df["Purchase frequency"].min()) * (30 - 1) + 1).round().astype(int)
df["Average spending"] = ((df["Average spending"] - df["Average spending"].min()) / (df["Average spending"].max() - df["Average spending"].min()) * (10000 - 100) + 100).round().astype(int)
df["Browsing time"] = ((df["Browsing time"] - df["Browsing time"].min()) / (df["Browsing time"].max() - df["Browsing time"].min()) * (180 - 5) + 5).round().astype(int)

# Convert Product category preference into categorical values
categories = ["Electronics", "Clothing", "Home & Kitchen", "Books", "Sports"]
df["Product category preference"] = np.random.choice(categories, size=len(df))

# Convert target variable to integer
df['Customer Segment'] = y.astype(int)  # 0: Low-value, 1: Medium-value, 2: High-value

# Save to CSV
df.to_csv("datasets/customer_segmentation.csv", index=False)

print("Dataset saved as customer_segmentation.csv"  )


Dataset saved as customer_segmentation.csv


In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("datasets/customer_segmentation.csv")

# Display first few rows
print(df.head())

# Check for missing values
print(df.isnull().sum())
print(df.dtypes)

   Customer age  Purchase frequency  Average spending  \
0            49                  19              3731   
1            36                  25              5137   
2            48                  14              3807   
3            47                  18              4049   
4            48                  15              3754   

  Product category preference  Browsing time  Customer Segment  
0                       Books            122                 2  
1                      Sports            101                 0  
2              Home & Kitchen            112                 1  
3                      Sports            121                 2  
4                      Sports             80                 1  
Customer age                   0
Purchase frequency             0
Average spending               0
Product category preference    0
Browsing time                  0
Customer Segment               0
dtype: int64
Customer age                    int64
Purchase frequency

In [2]:
# Check for duplicate values
df.drop_duplicates(inplace=True)

In [3]:
from sklearn.preprocessing import StandardScaler

# Select only integer columns for feature scaling
int_columns = ["Customer age", "Purchase frequency", "Average spending", "Browsing time"]
X = df[int_columns]  # Features
X_scaled = StandardScaler().fit_transform(X)

y = df['Customer Segment']  # Target variable

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.svm import SVC

# Initialize SVM classifier
svm_model = SVC(kernel='linear')  # Try different kernels like 'rbf', 'poly'

# Train the model
svm_model.fit(X_train, y_train)
y_pred_linear = svm_model.predict(X_test)

In [9]:
svm_rbf = SVC(kernel='rbf')
svm_rbf.fit(X_train, y_train)

y_pred_rbf = svm_rbf.predict(X_test)

In [10]:
svm_sigmoid = SVC(kernel='sigmoid')
svm_sigmoid.fit(X_train, y_train)

y_pred_sigmoid = svm_sigmoid.predict(X_test)


In [11]:
svm_poly = SVC(kernel='poly', degree=3)  # Default degree = 3, can be changed
svm_poly.fit(X_train, y_train)

y_pred_poly = svm_poly.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(y_test, y_pred, kernel_type):
    print(f"\n🔹 SVM ({kernel_type} Kernel) Evaluation:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Call evaluation function for each model
evaluate_model(y_test, y_pred_linear, "Linear")
evaluate_model(y_test, y_pred_rbf, "RBF")
evaluate_model(y_test, y_pred_sigmoid, "Sigmoid")
evaluate_model(y_test, y_pred_poly, "Polynomial")


🔹 SVM (Linear Kernel) Evaluation:
Accuracy: 0.715
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.69      0.78        72
           1       0.69      0.74      0.71        68
           2       0.61      0.72      0.66        60

    accuracy                           0.71       200
   macro avg       0.73      0.72      0.72       200
weighted avg       0.73      0.71      0.72       200


🔹 SVM (RBF Kernel) Evaluation:
Accuracy: 0.78
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.79      0.79        72
           1       0.81      0.81      0.81        68
           2       0.75      0.73      0.74        60

    accuracy                           0.78       200
   macro avg       0.78      0.78      0.78       200
weighted avg       0.78      0.78      0.78       200


🔹 SVM (Sigmoid Kernel) Evaluation:
Accuracy: 0.365
Classification Report:
               pr

In [13]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(y_test, y_pred, kernel_type):
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)  # Convert to dict
    
    return {
        "kernel": kernel_type,
        "accuracy": accuracy,
        "report": report
    }

# Call evaluation function for each model
results = [
    evaluate_model(y_test, y_pred_linear, "Linear"),
    evaluate_model(y_test, y_pred_rbf, "RBF"),
    evaluate_model(y_test, y_pred_sigmoid, "Sigmoid"),
    evaluate_model(y_test, y_pred_poly, "Polynomial")
]

# 🔹 Hinglish Analysis Function
def analyze_results(results):
    best_model = max(results, key=lambda x: x["accuracy"])
    worst_model = min(results, key=lambda x: x["accuracy"])

    print("\n📊 **SVM Results Analysis in Hinglish** 📊\n")
    
    for res in results:
        kernel = res["kernel"]
        acc = res["accuracy"]
        print(f"🔹 **{kernel} Kernel:** Accuracy = {acc:.3f}")
    
    print("\n✅ **Best Model:**", best_model["kernel"], f"({best_model['accuracy']:.3f} accuracy) 🚀")
    print("❌ **Worst Model:**", worst_model["kernel"], f"({worst_model['accuracy']:.3f} accuracy) 😢\n")
    
    print(f"📌 **Final Verdict:** Agar best accuracy chahiye, to `{best_model['kernel']} Kernel` use karo!")

# Call Hinglish analysis
analyze_results(results)


📊 **SVM Results Analysis in Hinglish** 📊

🔹 **Linear Kernel:** Accuracy = 0.715
🔹 **RBF Kernel:** Accuracy = 0.780
🔹 **Sigmoid Kernel:** Accuracy = 0.365
🔹 **Polynomial Kernel:** Accuracy = 0.595

✅ **Best Model:** RBF (0.780 accuracy) 🚀
❌ **Worst Model:** Sigmoid (0.365 accuracy) 😢

📌 **Final Verdict:** Agar best accuracy chahiye, to `RBF Kernel` use karo!
