# Lookalike Model

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Folder holding the three input CSVs. Kept in one place so the notebook can be
# pointed at another location by editing a single line; the default is the path
# this notebook originally hard-coded.
DATA_DIR = '/content/drive/MyDrive'

customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [4]:
# Preview five random customers; the fixed seed shows the same rows on every re-run.
customers.sample(5, random_state=42)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
184,C0185,Kathleen Logan,North America,2023-10-17
11,C0012,Kevin May,South America,2024-08-07
89,C0090,Charles Hamilton,Asia,2023-10-17
167,C0168,Karen Clements MD,South America,2022-12-13
33,C0034,Dalton Perez,North America,2023-09-27


In [5]:
# Preview five random products; the fixed seed shows the same rows on every re-run.
products.sample(5, random_state=42)

Unnamed: 0,ProductID,ProductName,Category,Price
12,P013,BookWorld Smartwatch,Electronics,114.2
0,P001,ActiveWear Biography,Books,169.3
42,P043,SoundWave Novel,Books,404.4
5,P006,ActiveWear Rug,Home Decor,121.32
60,P061,HomeSense Desk Lamp,Home Decor,156.96


In [6]:
# Preview five random transactions; the fixed seed shows the same rows on every re-run.
transactions.sample(5, random_state=42)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
961,T00654,C0054,P098,2024-07-07 12:28:18,4,1199.72,299.93
363,T00511,C0081,P030,2024-05-26 05:56:36,3,833.58,277.86
931,T00847,C0004,P024,2024-10-24 16:28:30,1,338.66,338.66
71,T00744,C0004,P053,2024-03-11 09:25:22,4,1099.76,274.94
417,T00555,C0088,P039,2024-12-15 04:43:29,1,430.59,430.59


In [8]:
# Merging transactions data with product data.
# Each transaction should match at most one product row. validate= makes pandas
# raise MergeError if ProductID is duplicated in `products`; without it such
# duplicates would silently duplicate transactions and inflate every
# per-customer total computed from this frame.
# Both inputs carry a `Price` column, so the result contains Price_x (from
# transactions) and Price_y (from products).
merged_data = pd.merge(
    transactions,
    products,
    on='ProductID',
    how='left',
    validate='many_to_one',
)
# Fixed seed keeps the preview rows stable across re-runs.
merged_data.sample(5, random_state=42)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
663,T00336,C0147,P073,2024-11-16 15:45:25,3,80.97,26.99,ComfortLiving Laptop,Electronics,26.99
898,T00851,C0059,P028,2024-02-13 06:45:11,3,706.74,235.58,HomeSense Desk Lamp,Home Decor,235.58
185,T00421,C0047,P007,2024-01-04 07:47:38,1,420.15,420.15,SoundWave Cookbook,Books,420.15
630,T00596,C0175,P066,2024-09-30 11:36:49,1,337.22,337.22,SoundWave Textbook,Books,337.22
847,T00399,C0013,P056,2024-03-28 19:16:21,3,48.24,16.08,SoundWave Smartwatch,Electronics,16.08


In [10]:
# Aggregating transaction data to calculate features
def _most_frequent(values):
    """Return the most common non-null value in ``values``, or NaN if there is none.

    ``Series.mode()`` ignores nulls and returns its results sorted, so a tie is
    resolved in favour of the value that sorts first. The result is empty when
    every value is null (e.g. a customer whose products all failed to match in
    the left join), in which case indexing ``[0]`` would raise; NaN is returned
    instead.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One row per customer: spend, activity and basket-size features plus the
# category they buy from most often.
customer_transactions = merged_data.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    avg_purchase_value=('TotalValue', 'mean'),
    avg_quantity=('Quantity', 'mean'),
    most_frequent_category=('Category', _most_frequent)
).reset_index()
# Fixed seed keeps the preview rows stable across re-runs.
customer_transactions.sample(5, random_state=42)

Unnamed: 0,CustomerID,total_spend,transaction_count,avg_purchase_value,avg_quantity,most_frequent_category
171,C0172,2201.7,6,366.95,1.666667,Home Decor
184,C0186,1761.64,3,587.213333,2.333333,Electronics
101,C0102,6132.36,8,766.545,2.625,Electronics
107,C0108,4848.54,7,692.648571,2.285714,Clothing
115,C0116,2758.68,6,459.78,1.833333,Books


In [11]:
# Merging with customer profile data.
# A left join keeps every customer, including those without any transactions
# (their aggregated columns come out null). validate= makes pandas raise
# MergeError if CustomerID is duplicated on either side; duplicates would
# otherwise produce repeated labels in the similarity matrix built later.
merged_customer_data = pd.merge(
    customers,
    customer_transactions,
    on='CustomerID',
    how='left',
    validate='one_to_one',
)
# Fixed seed keeps the preview rows stable across re-runs.
merged_customer_data.sample(5, random_state=42)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,total_spend,transaction_count,avg_purchase_value,avg_quantity,most_frequent_category
102,C0103,Jennifer Munoz,Europe,2022-05-15,2462.55,5.0,492.51,1.4,Clothing
45,C0046,Beth Cardenas,North America,2024-10-23,5627.83,7.0,803.975714,2.714286,Books
190,C0191,Samantha Gibson DVM,South America,2024-04-07,2997.97,5.0,599.594,2.2,Books
176,C0177,Julia Kelly,Asia,2024-06-01,2509.81,4.0,627.4525,2.25,Books
38,C0039,Angela Harris,South America,2024-10-13,4239.6,6.0,706.6,3.0,Electronics


In [15]:
# Check dtypes and non-null counts after the left join: a customer with no
# transactions has nothing to join to, so their aggregated columns are null.
merged_customer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CustomerID              200 non-null    object 
 1   CustomerName            200 non-null    object 
 2   Region                  200 non-null    object 
 3   SignupDate              200 non-null    object 
 4   total_spend             199 non-null    float64
 5   transaction_count       199 non-null    float64
 6   avg_purchase_value      199 non-null    float64
 7   avg_quantity            199 non-null    float64
 8   most_frequent_category  199 non-null    object 
dtypes: float64(4), object(5)
memory usage: 14.2+ KB


In [16]:
# Count the missing values in each column.
merged_customer_data.isna().sum()

Unnamed: 0,0
CustomerID,0
CustomerName,0
Region,0
SignupDate,0
total_spend,1
transaction_count,1
avg_purchase_value,1
avg_quantity,1
most_frequent_category,1


In [17]:
# Customers without transactions have nulls in every aggregated column.
# Zero is a sensible value for the numeric features, but the category column
# holds strings, so it receives a string placeholder rather than the integer 0
# (a blanket fillna(0) would leave that column with mixed types).
# The result is assigned back instead of using inplace=True.
merged_customer_data = merged_customer_data.fillna({
    'total_spend': 0,
    'transaction_count': 0,
    'avg_purchase_value': 0,
    'avg_quantity': 0,
    'most_frequent_category': 'Unknown',
})
merged_customer_data.isnull().sum()

Unnamed: 0,0
CustomerID,0
CustomerName,0
Region,0
SignupDate,0
total_spend,0
transaction_count,0
avg_purchase_value,0
avg_quantity,0
most_frequent_category,0


In [21]:
# Feature Scaling
# Standardise the four numeric per-customer features so that no single column
# dominates the similarity computation because of its scale.
feature_columns = [
    'total_spend',
    'transaction_count',
    'avg_purchase_value',
    'avg_quantity',
]
feature_frame = merged_customer_data[feature_columns]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(feature_frame)
scaled_data

array([[-0.05188436,  0.        , -0.05478053, -0.20138205],
       [-0.86271433, -0.45129368, -0.9039848 , -0.0309241 ],
       [-0.393842  , -0.45129368, -0.01157526,  1.67365539],
       [ 1.03537505,  1.35388105, -0.06116966,  0.60829321],
       [-0.76949861, -0.90258736, -0.02508596, -0.31502068],
       [ 0.42264614, -0.45129368,  1.54245503,  0.82136565],
       [-0.47295849, -0.90258736,  0.72745786,  0.25317248],
       [ 0.44658328,  2.25646841, -1.06339862, -0.88321384],
       [-1.38789656, -0.90258736, -1.59442354, -2.58779334],
       [-0.94162972, -0.45129368, -1.05418528,  0.82136565],
       [ 0.15220099,  0.        ,  0.25596967,  0.13953385],
       [ 0.96818364,  0.90258736,  0.2614437 ,  0.33434294],
       [ 1.38632271,  0.90258736,  0.71621421,  0.09083158],
       [-1.70197101, -1.80517473, -1.51238002, -0.88321384],
       [-1.24604561, -1.35388105, -0.43615877, -0.88321384],
       [ 0.23069786,  0.        ,  0.37549279, -0.88321384],
       [ 0.70867979,  1.

In [20]:
# Cosine Similarity for customer transaction features
# Called with a single argument, cosine_similarity compares every row of
# scaled_data with every other row, giving a square matrix whose entry [i, j]
# is the similarity between the i-th and j-th rows. Row order is the row order
# of merged_customer_data, since scaled_data was built from it.
cosine_sim = cosine_similarity(scaled_data)
cosine_sim

array([[ 1.        ,  0.35168776, -0.82653884, ...,  0.7962151 ,
         0.63576233, -0.87826533],
       [ 0.35168776,  1.        ,  0.21255862, ...,  0.72972863,
         0.94207837, -0.70735565],
       [-0.82653884,  0.21255862,  1.        , ..., -0.35746082,
        -0.10349378,  0.52938606],
       ...,
       [ 0.7962151 ,  0.72972863, -0.35746082, ...,  1.        ,
         0.90506635, -0.82804335],
       [ 0.63576233,  0.94207837, -0.10349378, ...,  0.90506635,
         1.        , -0.86823868],
       [-0.87826533, -0.70735565,  0.52938606, ..., -0.82804335,
        -0.86823868,  1.        ]])

In [22]:
# Label both axes of the similarity matrix with CustomerID so that scores can
# be looked up by customer rather than by position.
customer_ids = merged_customer_data['CustomerID']
similarity_df = pd.DataFrame(
    cosine_sim,
    index=customer_ids,
    columns=customer_ids,
)
similarity_df.sample(n=5)

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0003,-0.826539,0.212559,1.0,3e-06,0.084073,0.423054,0.444316,-0.557605,-0.545881,0.650588,...,-0.657256,-0.385029,0.763125,-0.794755,0.415726,0.260609,0.799718,-0.357461,-0.103494,0.529386
C0038,0.163357,0.906791,0.305825,-0.225958,0.315422,-0.717313,-0.219188,0.166934,0.454464,0.903046,...,0.459594,0.566825,-0.030378,0.106088,-0.04758,-0.829607,0.506514,0.423612,0.768517,-0.610735
C0099,-0.327238,-0.881244,-0.244467,0.886918,-0.930464,0.296378,-0.52001,0.480261,-0.642462,-0.711127,...,-0.472142,-0.768357,-0.476614,0.34366,0.691502,0.408103,-0.766296,-0.806601,-0.870167,0.534267
C0098,0.701416,0.21823,-0.685356,0.233357,-0.239348,-0.868106,-0.82206,0.901612,0.554703,-0.072373,...,0.755935,0.467966,-0.905626,0.959926,-0.050016,-0.778798,-0.590815,0.282988,0.364494,-0.753601
C0111,-0.318271,0.706756,0.691021,-0.015169,0.123879,-0.359018,-0.083536,-0.022324,0.00261,0.946606,...,-0.010865,0.151047,0.24021,-0.216147,0.297401,-0.510818,0.663228,0.03217,0.438905,-0.168107


In [23]:
# Function to get the top N lookalikes based on similarity scores
def get_lookalikes(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Reads the module-level ``similarity_df``.

    Args:
        customer_id: Column label in ``similarity_df`` to look up.
        top_n: Number of lookalikes to return (default 3).

    Returns:
        A Series of similarity scores in descending order, indexed by the
        labels of ``similarity_df``, with ``customer_id`` itself removed.

    Raises:
        KeyError: If ``customer_id`` is not a column of ``similarity_df``.
    """
    ranked = similarity_df[customer_id].sort_values(ascending=False)
    # A customer is always most similar to itself, so remove that entry
    # before taking the top of the ranking.
    ranked_others = ranked.drop(labels=customer_id)
    return ranked_others.iloc[:top_n]

In [25]:
# Build one record per customer for the first 20 customers: the customer's ID,
# the IDs of its lookalikes, and the matching similarity scores rounded to four
# decimals. Both lists are stored as comma-separated strings.
first_customers = merged_customer_data['CustomerID'].head(20)

lookalike_data = []
for customer_id in first_customers:
    lookalikes = get_lookalikes(customer_id)
    ids_text = ', '.join(lookalikes.index)
    scores_text = ', '.join(str(round(score, 4)) for score in lookalikes.values)
    lookalike_data.append(
        dict(CustomerID=customer_id, Lookalikes=ids_text, Scores=scores_text)
    )
lookalike_data

[{'CustomerID': 'C0001',
  'Lookalikes': 'C0164, C0103, C0137',
  'Scores': '0.9988, 0.9922, 0.9835'},
 {'CustomerID': 'C0002',
  'Lookalikes': 'C0029, C0031, C0077',
  'Scores': '0.9996, 0.9982, 0.9917'},
 {'CustomerID': 'C0003',
  'Lookalikes': 'C0176, C0144, C0073',
  'Scores': '0.9864, 0.9626, 0.9361'},
 {'CustomerID': 'C0004',
  'Lookalikes': 'C0075, C0165, C0113',
  'Scores': '0.9959, 0.9738, 0.9698'},
 {'CustomerID': 'C0005',
  'Lookalikes': 'C0131, C0130, C0150',
  'Scores': '0.9947, 0.989, 0.9848'},
 {'CustomerID': 'C0006',
  'Lookalikes': 'C0079, C0117, C0040',
  'Scores': '1.0, 0.9971, 0.9894'},
 {'CustomerID': 'C0007',
  'Lookalikes': 'C0080, C0140, C0092',
  'Scores': '0.9839, 0.9745, 0.9631'},
 {'CustomerID': 'C0008',
  'Lookalikes': 'C0084, C0090, C0179',
  'Scores': '0.9916, 0.9837, 0.9734'},
 {'CustomerID': 'C0009',
  'Lookalikes': 'C0180, C0192, C0083',
  'Scores': '0.9888, 0.9822, 0.9807'},
 {'CustomerID': 'C0010',
  'Lookalikes': 'C0142, C0151, C0111',
  'Scores': '

In [26]:
# Turn the list of per-customer records into a table with one row per customer
# and the columns CustomerID, Lookalikes and Scores (the keys of each record).
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df

Unnamed: 0,CustomerID,Lookalikes,Scores
0,C0001,"C0164, C0103, C0137","0.9988, 0.9922, 0.9835"
1,C0002,"C0029, C0031, C0077","0.9996, 0.9982, 0.9917"
2,C0003,"C0176, C0144, C0073","0.9864, 0.9626, 0.9361"
3,C0004,"C0075, C0165, C0113","0.9959, 0.9738, 0.9698"
4,C0005,"C0131, C0130, C0150","0.9947, 0.989, 0.9848"
5,C0006,"C0079, C0117, C0040","1.0, 0.9971, 0.9894"
6,C0007,"C0080, C0140, C0092","0.9839, 0.9745, 0.9631"
7,C0008,"C0084, C0090, C0179","0.9916, 0.9837, 0.9734"
8,C0009,"C0180, C0192, C0083","0.9888, 0.9822, 0.9807"
9,C0010,"C0142, C0151, C0111","0.9692, 0.9551, 0.9466"


In [27]:
# Write the lookalike table to Lookalike.csv in the current working directory;
# index=False leaves the row index out of the file.
lookalike_df.to_csv('Lookalike.csv', index=False)

In [28]:
# Function to take user input and return lookalikes for that customer
def get_user_lookalikes(user_customer_id, top_n=3):
    """Look up lookalikes for a customer ID, tolerating unknown IDs.

    Args:
        user_customer_id: Customer ID to look up.
        top_n: Number of lookalikes to return (default 3).

    Returns:
        The result of ``get_lookalikes(user_customer_id, top_n)`` when the ID
        appears in ``merged_customer_data['CustomerID']``; otherwise the string
        ``"CustomerID not found!"``. Callers therefore need to handle both a
        Series and a string.
    """
    known_ids = merged_customer_data['CustomerID'].values
    if user_customer_id in known_ids:
        return get_lookalikes(user_customer_id, top_n)
    return "CustomerID not found!"

In [29]:
user_customer_id = 'C0001'  # Replace with the desired customer ID
top_n = 3  # How many lookalikes to request; the heading below uses the same value
lookalikes = get_user_lookalikes(user_customer_id, top_n)
# The heading is derived from top_n so it cannot drift from the number requested.
print(f"Top {top_n} lookalikes for {user_customer_id}:")
print(lookalikes)

Top 3 lookalikes for C0001:
CustomerID
C0164    0.998778
C0103    0.992178
C0137    0.983510
Name: C0001, dtype: float64
