In [31]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from datetime import datetime
from fpdf import FPDF

In [32]:
#Loading DataSets
def load_data(data_dir='.'):
    """Load the customers, products and transactions tables from CSV.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``Customers.csv``, ``Products.csv`` and
        ``Transactions.csv``. Defaults to the current working directory,
        which is where the files were read from before this parameter existed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(customers, products, transactions)``. ``customers['SignupDate']``
        and ``transactions['TransactionDate']`` are converted to datetime.

    Raises
    ------
    FileNotFoundError
        If any of the three CSV files is missing from ``data_dir``.
    """
    customers = pd.read_csv(os.path.join(data_dir, 'Customers.csv'))
    products = pd.read_csv(os.path.join(data_dir, 'Products.csv'))
    transactions = pd.read_csv(os.path.join(data_dir, 'Transactions.csv'))

    # Convert date columns to datetime so they support date arithmetic downstream.
    customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
    transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

    return customers, products, transactions

In [33]:
# Execute the lookalike notebook inside this kernel so that the names it
# defines become available to the cells below.
# NOTE(review): presumably this is where create_customer_features(), which the
# clustering cell calls, comes from -- it is not defined in this notebook; confirm.
%run Ankita_Jaiswal_Lookalike.ipynb

Generated recommendations for the first 20 customers:
   customer_id lookalike_1  score_1 lookalike_2  score_2 lookalike_3  score_3
0        C0001       C0069   0.9143       C0125   0.7773       C0183   0.6674
1        C0002       C0031   0.8653       C0077   0.8346       C0121   0.8124
2        C0003       C0144   0.8360       C0091   0.6860       C0148   0.6427
3        C0004       C0075   0.9471       C0065   0.8552       C0041   0.8381
4        C0005       C0130   0.8907       C0014   0.8416       C0150   0.8245
5        C0006       C0196   0.8054       C0079   0.7779       C0200   0.7739
6        C0007       C0085   0.8440       C0026   0.8061       C0166   0.7961
7        C0008       C0109   0.7937       C0175   0.7420       C0162   0.7408
8        C0009       C0097   0.9752       C0058   0.9738       C0083   0.9501
9        C0010       C0142   0.8874       C0030   0.8670       C0062   0.8630
10       C0011       C0153   0.7959       C0013   0.7436       C0099   0.7423
11       C

In [38]:
# To avoid warnings: restrict each numeric back-end to a single thread.
# NOTE(review): thread-pool libraries usually read these variables when they are
# first loaded, and numpy/sklearn are imported in an earlier cell -- confirm the
# settings still take effect when applied at this point.
os.environ.update({
    'OMP_NUM_THREADS': '1',
    'OPENBLAS_NUM_THREADS': '1',
    'MKL_NUM_THREADS': '1',
    'VECLIB_MAXIMUM_THREADS': '1',
    'NUMEXPR_NUM_THREADS': '1',
})

def prepare_clustering_features():
    """Build the standardized feature matrix used for clustering.

    Calls ``create_customer_features()`` (not defined in this notebook's own
    cells) and standardizes its result with ``StandardScaler``.

    Returns
    -------
    tuple
        ``(scaled_features, index)``: the scaler's output and the index of the
        unscaled feature table, in the same row order.
    """
    # Reuse customer features from lookalike model
    raw_features = create_customer_features()

    # Fit and apply the scaler in one step; the fitted scaler is not kept.
    standardized = StandardScaler().fit_transform(raw_features)

    return standardized, raw_features.index

def find_optimal_clusters(features, max_clusters=10):
    """Choose the KMeans cluster count with the lowest Davies-Bouldin score.

    Every k from 2 up to ``max_clusters`` is fitted with
    ``KMeans(random_state=42, n_init=10)`` and scored with
    ``davies_bouldin_score`` (lower is better).

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    max_clusters : int, optional
        Largest k to try. It is capped at ``n_samples - 1`` because the
        Davies-Bouldin score is only defined for fewer labels than samples.

    Returns
    -------
    tuple
        ``(optimal_clusters, db_scores)``. ``optimal_clusters`` is a plain
        ``int``; ``db_scores[i]`` is the score for ``k = i + 2``.

    Raises
    ------
    ValueError
        If no valid k exists (fewer than 3 samples, or ``max_clusters < 2``).
    """
    n_samples = features.shape[0] if hasattr(features, 'shape') else len(features)

    # Scoring requires 2 <= n_labels <= n_samples - 1, so never try more.
    largest_k = min(max_clusters, n_samples - 1)
    if largest_k < 2:
        raise ValueError(
            "Cannot search for clusters: need at least 3 samples and "
            f"max_clusters >= 2 (got n_samples={n_samples}, max_clusters={max_clusters})"
        )

    db_scores = []
    for n_clusters in range(2, largest_k + 1):
        kmeans = KMeans(
            n_clusters=n_clusters,
            random_state=42,
            n_init=10
        )
        labels = kmeans.fit_predict(features)
        db_scores.append(davies_bouldin_score(features, labels))

    # Scores start at k=2, hence the offset; int() avoids leaking numpy.int64.
    optimal_clusters = int(np.argmin(db_scores)) + 2
    return optimal_clusters, db_scores

def perform_clustering(features, customer_ids, n_clusters):
    """Fit KMeans and pair each customer id with its cluster label.

    Parameters
    ----------
    features : array-like
        Feature matrix passed to ``KMeans.fit_predict``.
    customer_ids : array-like
        Identifiers, one per row of ``features`` and in the same order.
    n_clusters : int
        Number of clusters to fit.

    Returns
    -------
    tuple
        ``(cluster_df, kmeans)``: a DataFrame with columns ``CustomerID`` and
        ``Cluster``, and the fitted estimator.
    """
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    assignments = model.fit_predict(features)

    # Column order matters to readers of the table: id first, then label.
    columns = {'CustomerID': customer_ids, 'Cluster': assignments}
    return pd.DataFrame(columns), model

def visualize_clusters(features, labels, output_path='cluster_visualization.png'):
    """Save a 2-D PCA scatter plot of the samples, coloured by cluster label.

    Parameters
    ----------
    features : array-like of shape (n_samples, n_features)
        Feature matrix that is projected onto its first two principal components.
    labels : array-like of shape (n_samples,)
        Cluster label per sample, used as the point colour.
    output_path : str, optional
        Where the image is written. Defaults to ``'cluster_visualization.png'``.

    Returns
    -------
    None
        The figure is written to ``output_path`` and then closed; nothing is
        displayed inline.
    """
    # Seeded so the projection is repeatable even if PCA picks a randomized solver.
    pca = PCA(n_components=2, random_state=42)
    features_2d = pca.fit_transform(features)

    # Explicit figure/axes instead of pyplot's implicit "current figure" state.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        scatter = ax.scatter(features_2d[:, 0], features_2d[:, 1], c=labels, cmap='viridis')
        ax.set_title('Customer Segments Visualization')
        ax.set_xlabel('First Principal Component')
        ax.set_ylabel('Second Principal Component')
        fig.colorbar(scatter, ax=ax, label='Cluster')
        fig.savefig(output_path)
    finally:
        # Always release the figure, including when plotting or saving fails.
        plt.close(fig)

def analyze_clusters(cluster_df, customer_features):
    """Summarise each cluster by the mean of four customer metrics.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Must contain ``CustomerID`` and ``Cluster`` columns.
    customer_features : pandas.DataFrame
        Indexed by customer id and containing ``TransactionID_count``,
        ``TotalValue_sum``, ``TotalValue_mean`` and ``Quantity_sum``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster, holding the per-cluster mean of each metric
        rounded to two decimals.
    """
    profile_columns = [
        'TransactionID_count',
        'TotalValue_sum',
        'TotalValue_mean',
        'Quantity_sum',
    ]

    # Attach the cluster label to every customer via the shared id index.
    labels_by_customer = cluster_df.set_index('CustomerID')
    labelled = customer_features.join(labels_by_customer)

    per_cluster_means = labelled.groupby('Cluster')[profile_columns].mean()
    return per_cluster_means.round(2)

def generate_clustering_report(cluster_profiles, db_scores, optimal_clusters,
                               output_path="Ankita_Jaiswal_Clustering.pdf"):
    """Render the segmentation summary as text and write it to a PDF.

    Parameters
    ----------
    cluster_profiles : pandas.DataFrame
        Per-cluster summary table; embedded through ``to_string()``.
    db_scores : sequence of float
        Davies-Bouldin scores where ``db_scores[i]`` belongs to ``k = i + 2``.
    optimal_clusters : int
        Chosen number of clusters; selects the score that is reported.
    output_path : str, optional
        Destination PDF file. Defaults to ``"Ankita_Jaiswal_Clustering.pdf"``.

    Returns
    -------
    str
        The report text that was written to the PDF.

    Raises
    ------
    IndexError
        If ``optimal_clusters - 2`` is not a valid position in ``db_scores``.
    """
    # NOTE(review): the "Key Findings" below are fixed text, not derived from
    # cluster_profiles -- confirm they hold for the data being reported.
    report = f"""
Customer Segmentation Analysis Report

Number of Clusters: {optimal_clusters}
Davies-Bouldin Index: {db_scores[optimal_clusters-2]:.4f}

Cluster Profiles:
{cluster_profiles.to_string()}

Key Findings:
1. Optimal number of clusters determined using Davies-Bouldin Index
2. Clear separation between high-value and low-value customer segments
3. Distinct purchase patterns and engagement levels across segments
4. Variations in customer lifecycle (signup age) between segments

Technical Details:
- Algorithm: K-means clustering with {optimal_clusters} clusters
- Initialization: 10 different random initializations (n_init=10)
- Feature scaling: StandardScaler
- Evaluation metric: Davies-Bouldin Index
"""

    # FPDF is already imported in the notebook's import cell, so no local import.
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 5, report, align="L")
    # A file name alone already means "write to file"; the explicit dest
    # argument is deprecated in fpdf2.
    pdf.output(output_path)

    return report

def main():
    """Run the segmentation pipeline end to end and print a short summary.

    Steps: build scaled features, choose the cluster count by Davies-Bouldin
    score, fit the final clustering, save the PCA scatter plot, profile the
    clusters on the unscaled features, and write the PDF report.
    """
    # Scaled matrix for KMeans plus the matching customer identifiers.
    X_scaled, ids = prepare_clustering_features()

    # Pick k, then fit the final model with that k.
    best_k, scores = find_optimal_clusters(X_scaled)
    assignments, _model = perform_clustering(X_scaled, ids, best_k)

    visualize_clusters(X_scaled, assignments['Cluster'])

    # Profiles are computed on the unscaled features, so they are rebuilt here.
    raw_features = create_customer_features()
    profiles = analyze_clusters(assignments, raw_features)

    generate_clustering_report(profiles, scores, best_k)

    # scores[0] corresponds to k=2, hence the offset.
    best_score = scores[best_k - 2]
    print(f"Clustering completed with {best_k} clusters")
    print(f"Davies-Bouldin Index: {best_score:.4f}")

# Entry point: run the pipeline only when this code executes as the main
# module (which is the case when the notebook cell is run), not on import.
if __name__ == "__main__":
    main()



Clustering completed with 10 clusters
Davies-Bouldin Index: 1.4380
