In [2]:
#importing libraries
import warnings
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from IPython.display import HTML



In [3]:
#loading our dataset
def download_data(url, output):
    """Download ``url`` to the local path ``output`` with gdown.

    Args:
        url: Download URL, passed unchanged to ``gdown.download``.
        output: Destination filename, passed unchanged to ``gdown.download``.

    Returns:
        Whatever ``gdown.download`` returns for this call.

    Raises:
        ImportError: If the ``gdown`` package cannot be imported.
    """
    # gdown is not part of the notebook's import cell, so the original call
    # failed with NameError. Importing here also means the package is only
    # needed when a download is actually performed.
    try:
        import gdown
    except ImportError as exc:
        raise ImportError(
            "download_data requires the 'gdown' package (pip install gdown)."
        ) from exc

    # quiet=False is kept from the original call.
    return gdown.download(url, output, quiet=False)

# Shareable Google Drive URL of the dataset, and the local filename it is saved under.
file_url = "https://drive.google.com/file/d/1Oa62mOYcNhQEycOxyBFyrPkSfScA20zn/view?usp=sharing"
file_output = "merged_data.csv"

# The file id is the text between '/d/' and '/view' in the share URL; it is
# re-embedded in the 'uc?id=' URL that is handed to download_data.
file_id = file_url.partition('/d/')[2].partition('/view')[0]
download_url = f"https://drive.google.com/uc?id={file_id}"

# Reuse an existing local copy; only download when the file is missing.
if os.path.isfile(file_output):
    print(f"{file_output} already exists. Skipping download.")
else:
    print(f"Downloading {file_output}...")
    download_data(download_url, file_output)

print("All datasets processed.")

merged_data.csv already exists. Skipping download.
All datasets processed.


In [4]:
# Suppress unnecessary warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read the transaction-level dataset from the working directory.
merged_data = pd.read_csv("merged_data.csv")

# Recency: whole days between the latest TransactionDate in the file and the
# TransactionDate of each row.
transaction_dates = pd.to_datetime(merged_data['TransactionDate'])
latest_date = transaction_dates.max()
merged_data['TransactionDate'] = transaction_dates
merged_data['Recency'] = (latest_date - transaction_dates).dt.days

# Frequency: number of non-null TransactionID values per CustomerID, attached
# to every row of that customer through a left merge.
frequency_data = (
    merged_data.groupby('CustomerID')['TransactionID']
    .count()
    .reset_index()
    .rename(columns={'TransactionID': 'Frequency'})
)
merged_data = merged_data.merge(frequency_data, on='CustomerID', how='left')

# MonetaryValue: a copy of the TotalValue column under the RFM name.
merged_data['MonetaryValue'] = merged_data['TotalValue']

# Columns that make up each customer's profile vector.
# NOTE(review): MonetaryValue is assigned from TotalValue earlier in this cell,
# so these two columns carry the same values - confirm the double weight is intended.
features = [
    'Quantity',
    'TotalValue',
    'AvgPricePerItem',
    'CustomerLifetimeValue',
    'Recency',
    'Frequency',
    'MonetaryValue',
]

# One row per CustomerID: the mean of every feature over that customer's rows,
# with any remaining missing value replaced by zero.
# NOTE(review): Recency is therefore the mean days-ago over all of a customer's
# transactions, not the days since their latest one - confirm this is intended.
customer_profiles = (
    merged_data.groupby('CustomerID')[features]
    .mean()
    .reset_index()
    .fillna(0)
)

# Standardize the feature columns, then reduce them to three PCA components.
scaler = StandardScaler()
feature_matrix = scaler.fit_transform(customer_profiles[features])
pca = PCA(n_components=3)
pca_matrix = pca.fit_transform(feature_matrix)

# Pairwise cosine similarity between customers in the PCA-reduced space.
similarity_matrix = cosine_similarity(pca_matrix)

# CustomerID -> list of the three most similar CustomerIDs (IDs only).
# NOTE(review): the original comment promised "IDs and their scores", but scores
# were never stored. The stored shape is unchanged here - confirm whether the
# exported map should also carry the scores.
lookalike_map = {}

# Mean cosine similarity of each customer's top-3 matches; aggregated later in
# this cell as an evaluation metric.
average_top3_similarity = []

# Fetch the ID column once instead of re-indexing the Series on every iteration.
customer_ids = customer_profiles['CustomerID'].to_numpy()

for idx, customer_id in enumerate(customer_profiles['CustomerID']):
    row = similarity_matrix[idx]

    # Positions ordered by descending similarity. The stable sort keeps tied
    # scores in positional order, which is what the previous
    # sorted(..., reverse=True) produced. The customer's own position is
    # removed before the first three are kept.
    ranked = np.argsort(-row, kind='stable')
    ranked = ranked[ranked != idx][:3]

    # (position, score) pairs of the three best matches.
    top_3 = [(int(i), row[i]) for i in ranked]

    average_top3_similarity.append(np.mean([score for _, score in top_3]))
    lookalike_map[customer_id] = [customer_ids[i] for i, _ in top_3]

# IDs whose lookalikes are exported: C0001 ... C0020. The set is built once
# here instead of being rebuilt for every customer inside the loop.
target_customer_ids = {f'C{i:04d}' for i in range(1, 21)}

# One record per target customer, in customer_profiles order.
lookalike_list = [
    {'CustomerID': customer_id, 'Lookalikes': lookalike_map[customer_id]}
    for customer_id in customer_profiles['CustomerID']
    if customer_id in target_customer_ids
]

# Two-column frame: the customer and the list of its lookalike IDs.
lookalike_df = pd.DataFrame({
    'CustomerID': [row['CustomerID'] for row in lookalike_list],
    'Lookalike_Map': [row['Lookalikes'] for row in lookalike_list]
})

# Cluster the PCA-reduced customer matrix so a silhouette score can be computed.
# n_init is passed explicitly: scikit-learn changed its default from 10 to
# 'auto' in 1.4 and announced it with a FutureWarning, which this notebook
# suppresses. Pinning it keeps the number of restarts the same across versions.
n_clusters = 5  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(pca_matrix)

# Summary statistics over the per-customer mean top-3 similarity, plus the
# silhouette score of the KMeans labels on the same PCA matrix.
# NOTE(review): the silhouette score describes the KMeans clustering, not the
# top-3 lookalike picks themselves.
avg_similarity = np.mean(average_top3_similarity)
similarity_variance = np.var(average_top3_similarity)
silhouette_avg = silhouette_score(pca_matrix, cluster_labels)

# Print evaluation metrics
print("Evaluation Metrics:")
print(f"- Average Top-3 Similarity: {avg_similarity:.4f}")
print(f"- Variance in Similarity Scores: {similarity_variance:.4f}")
print(f"- Silhouette Score: {silhouette_avg:.4f}")

# Save the results to a CSV file.
# NOTE(review): '/mnt/data' is a hard-coded absolute path, kept unchanged for
# compatibility. The recorded output of this cell shows a backslash-joined
# path, so it is being resolved on a non-POSIX machine - consider a relative
# or configurable output directory.
output_dir = '/mnt/data'
output_path = os.path.join(output_dir, 'Lookalike.csv')

# Create the directory (and any missing parents). exist_ok=True replaces the
# separate exists() check, so an already-present directory is not an error.
os.makedirs(output_dir, exist_ok=True)

lookalike_df.to_csv(output_path, index=False)
print(f"Lookalike.csv created at: {output_path}")

Evaluation Metrics:
- Average Top-3 Similarity: 0.9814
- Variance in Similarity Scores: 0.0003
- Silhouette Score: 0.2698
Lookalike.csv created at: /mnt/data\Lookalike.csv


In [5]:
#downloading csv file to local computer

# Save lookalike_df (the lookalike table, not merged_data) to a CSV file
# at a relative path, without the index column.
csv_filename = "lookalike.csv"
lookalike_df.to_csv(csv_filename, index=False)

# Generate a download link for the CSV file
def create_download_link(filename):
    """Create a clickable download link for a given file.

    Args:
        filename: Path of the file to link to; used both as the link target
            and as the visible link text.

    Returns:
        HTML: An ``<a ... download>`` element wrapping the filename.
    """
    from html import escape

    # Escape &, <, > and quotes so a filename containing them cannot break out
    # of the href attribute or inject markup. Names made only of plain
    # characters produce exactly the same markup as before.
    safe_name = escape(str(filename), quote=True)
    return HTML(f'<a href="{safe_name}" download>{safe_name} (Click to Download)</a>')

# Display the download link: the call is the cell's last expression, so
# Jupyter renders the returned HTML object as the cell output.
create_download_link(csv_filename)