In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')


In [4]:
customers_df.head()


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [5]:
transactions_df.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
products_df.head()


Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [7]:
merged_df_1 = pd.merge(transactions_df, customers_df, on='CustomerID')

final_merged_df = pd.merge(merged_df_1, products_df, on='ProductID')

final_merged_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [9]:
final_merged_df.to_csv('all_merged.csv', index=False)

In [10]:
final = final_merged_df

In [None]:
final['CustomerID']

In [12]:
customer_features = final.groupby('CustomerID').agg({
    'TotalValue': 'sum',             # Total spending per customer
    'Quantity': 'mean',              # Average quantity purchased per transaction
    'Region': 'first',               # Region of the customer
    'Category': lambda x: ','.join(x.unique())  # Unique categories purchased
}).reset_index()

In [14]:
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,Region,Category
0,C0001,3354.52,2.400000,South America,"Home Decor,Electronics,Books"
1,C0002,1862.74,2.500000,Asia,"Home Decor,Clothing"
2,C0003,2725.38,3.500000,South America,"Clothing,Home Decor,Electronics"
3,C0004,5354.88,2.875000,South America,"Electronics,Home Decor,Books"
4,C0005,2034.24,2.333333,Asia,"Electronics,Home Decor"
...,...,...,...,...,...
194,C0196,4982.88,3.000000,Europe,"Home Decor,Books,Clothing"
195,C0197,1928.65,3.000000,Europe,"Home Decor,Electronics"
196,C0198,931.83,1.500000,Europe,"Clothing,Electronics"
197,C0199,1979.28,2.250000,Europe,"Electronics,Home Decor"


In [17]:
region_encoded = pd.get_dummies(customer_features['Region'], prefix = 'Region')
categories_encoded = customer_features['Category'].str.get_dummies(sep=',')

In [19]:
customer_features_encoded = pd.concat([
    customer_features[['CustomerID', 'TotalValue', 'Quantity']],
    region_encoded,
    categories_encoded
], axis=1)

In [20]:
customer_features_encoded

Unnamed: 0,CustomerID,TotalValue,Quantity,Region_Asia,Region_Europe,Region_North America,Region_South America,Books,Clothing,Electronics,Home Decor
0,C0001,3354.52,2.400000,False,False,False,True,1,0,1,1
1,C0002,1862.74,2.500000,True,False,False,False,0,1,0,1
2,C0003,2725.38,3.500000,False,False,False,True,0,1,1,1
3,C0004,5354.88,2.875000,False,False,False,True,1,0,1,1
4,C0005,2034.24,2.333333,True,False,False,False,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
194,C0196,4982.88,3.000000,False,True,False,False,1,1,0,1
195,C0197,1928.65,3.000000,False,True,False,False,0,0,1,1
196,C0198,931.83,1.500000,False,True,False,False,0,1,1,0
197,C0199,1979.28,2.250000,False,True,False,False,0,0,1,1


In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
def find_similar_customers(customer_id, feature_matrix, top_n=3):
    customer_row = feature_matrix[feature_matrix['CustomerID'] == customer_id]
    if customer_row.empty:
        return f"CustomerID {customer_id} not found in the dataset."
    
    customer_features = customer_row.drop(columns=['CustomerID']).values
    feature_matrix_values = feature_matrix.drop(columns=['CustomerID']).values
    
    similarity_scores = cosine_similarity(customer_features, feature_matrix_values).flatten()
    
    
    similar_indices = similarity_scores.argsort()[-(top_n + 1):-1][::-1]
    similar_customers = feature_matrix.iloc[similar_indices]
    similar_customers['SimilarityScore'] = similarity_scores[similar_indices]
    
    return similar_customers[['CustomerID', 'SimilarityScore']]

In [27]:
def test_lookalike_model():
    print("Enter a CustomerID to find similar customers:")
    customer_id = input("CustomerID: ")
    top_n = int(input("How many similar customers to retrieve? (default 3): ") or 3)
    
    try:
        result = find_similar_customers(customer_id, customer_features_encoded, top_n)
        print(f"\nTop {top_n} similar customers for CustomerID {customer_id}:")
        print(result)
    except Exception as e:
        print(f"An error occurred: {e}")

In [32]:
import warnings
warnings.filterwarnings("ignore")

In [31]:
test_lookalike_model()


Enter a CustomerID to find similar customers:

Top 3 similar customers for CustomerID C0015:
[('C0131', 0.9999996776509467), ('C0036', 0.9999996460314183), ('C0157', 0.9999994746573896)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['SimilarityScore'] = similarity_scores[similar_indices]


In [33]:

def find_similar_customers(customer_id, feature_matrix, top_n=3):
    customer_row = feature_matrix[feature_matrix['CustomerID'] == customer_id]
    if customer_row.empty:
        return []
    
    customer_features = customer_row.drop(columns=['CustomerID']).values
    feature_matrix_values = feature_matrix.drop(columns=['CustomerID']).values

    similarity_scores = cosine_similarity(customer_features, feature_matrix_values).flatten()
    
    similar_indices = similarity_scores.argsort()[-(top_n + 1):-1][::-1]
    similar_customers = feature_matrix.iloc[similar_indices]
    similar_customers['SimilarityScore'] = similarity_scores[similar_indices]
    
    return list(zip(similar_customers['CustomerID'], similar_customers['SimilarityScore']))

lookalike_map = {}


top_20_customers = customer_features_encoded['CustomerID'][:20]


for customer_id in top_20_customers:
    lookalike_map[customer_id] = find_similar_customers(customer_id, customer_features_encoded, top_n=3)


lookalike_data = [
    {'CustomerID': cust_id, 'Lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_map.items()
]

lookalike_df = pd.DataFrame(lookalike_data)

lookalike_df.to_csv('Lookalike.csv', index=False)


In [30]:
lookalike_df

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0152, 0.9999999922021652), (C0174, 0.999999..."
1,C0002,"[(C0159, 0.9999999940425903), (C0134, 0.999999..."
2,C0003,"[(C0129, 0.9999999926775152), (C0091, 0.999999..."
3,C0004,"[(C0012, 0.9999999997984285), (C0148, 0.999999..."
4,C0005,"[(C0140, 0.999999982160887), (C0007, 0.9999999..."
5,C0006,"[(C0169, 0.9999999748318126), (C0039, 0.999999..."
6,C0007,"[(C0005, 0.9999999773626209), (C0106, 0.999999..."
7,C0008,"[(C0194, 0.9999999973319407), (C0122, 0.999999..."
8,C0009,"[(C0198, 0.9999998751582532), (C0166, 0.999999..."
9,C0010,"[(C0132, 0.9999999504643629), (C0063, 0.999999..."
