My model uses customer and product data to generate a lookalike recommendation system based on similarity scores. It preprocesses the data by aggregating transaction and customer information, one-hot encoding categorical variables, and normalizing features. A KNN model is trained on the processed data to find the top 3 most similar customers for each of the first 20 customers. The results are formatted as Map<cust_id, List<cust_id, score>> and saved in Lookalike.csv, where each row corresponds to one customer and their top 3 lookalikes with similarity scores.

In [53]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Load the datasets
download_url = f"https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"
customers  = pd.read_csv(download_url)

download_url = f"https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
products  = pd.read_csv(download_url)

download_url = f"https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"
transactions = pd.read_csv(download_url)

customers['SignupDate'] = pd.to_datetime(customers['SignupDate'], errors='coerce')
customers['SignupYear'] = customers['SignupDate'].dt.year
customers['SignupMonth'] = customers['SignupDate'].dt.month

# Merge the datasets
data = transactions.merge(customers, on='CustomerID')
data = data.merge(products, on='ProductID')

# One-hot encode 'Region'
data = pd.get_dummies(data, columns=['Region'], drop_first=True)

customer_summary = data.groupby('CustomerID').agg(
    total_spending=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
    preferred_product=('ProductID', lambda x: x.mode()[0])
).reset_index()

customer_data = customer_summary.merge(customers, on='CustomerID')

features = ['total_spending', 'transaction_count', 'avg_transaction_value', 'SignupYear', 'SignupMonth'] + \
           [col for col in customer_data.columns if 'Region_' in col]

X = customer_data[features]

X = X.fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit KNN model
knn = NearestNeighbors(n_neighbors=3, metric='euclidean')
knn.fit(X_scaled)

def recommend_similar_customers(user_customer_id):

    if user_customer_id not in customer_data['CustomerID'].values:
        raise ValueError(f"Customer with ID {user_customer_id} not found.")

    user_data = customer_data[customer_data['CustomerID'] == user_customer_id]
    user_features = user_data[features]

    user_features = user_features.fillna(0)

    if user_features.shape[0] == 0:
        raise ValueError("No features found for the selected user.")

    user_features_scaled = scaler.transform(user_features)

    distances, indices = knn.kneighbors(user_features_scaled, n_neighbors=3)

    recommended_customers = customer_data.iloc[indices[0]]

    recommended_customers['similarity_score'] = 1 / (distances[0] + 1e-10)

    result = list(zip(recommended_customers['CustomerID'], recommended_customers['similarity_score']))

    return result

lookalike_dict = {}

# For each of the first 20 customers (CustomerID: C0001 to C0020)
for i in range(1, 21):
    user_customer_id = f'C{i:04d}'  # Format customer IDs as C0001, C0002, ...
    try:
        similar_customers = recommend_similar_customers(user_customer_id)
        # Store the Map in the format: cust_id -> List<cust_id, score>
        lookalike_dict[user_customer_id] = similar_customers
    except ValueError as e:
        print(e)

lookalike_list = []

for cust_id, similar_customers in lookalike_dict.items():
    similar_customers_str = ', '.join([f'{similar_cust[0]}, {similar_cust[1]:.2f}' for similar_cust in similar_customers])
    lookalike_list.append([f"Map<{cust_id}, List<{similar_customers_str}>>"])

lookalike_df = pd.DataFrame(lookalike_list, columns=['Lookalike'])
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv has been generated.")


Lookalike.csv has been generated.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_customers['similarity_score'] = 1 / (distances[0] + 1e-10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_customers['similarity_score'] = 1 / (distances[0] + 1e-10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_customers['similarity_score'] = 1 / (distances[0] +