In [1]:
import pandas as pd

In [2]:
# Load the three raw input tables from CSV files in the working directory.
# The paths are read in the same order as the names on the left-hand side.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Attach product attributes to every transaction row.
# Columns present in both tables (other than the join key) are disambiguated
# with the "_transaction" / "_product" suffixes.
transactions_products = pd.merge(
    transactions,
    products,
    how="inner",  # pandas' default join type, spelled out for the reader
    on="ProductID",
    suffixes=("_transaction", "_product"),
)

In [4]:
# Keep a single "Price" column: discard the product-table copy and strip the
# suffix from the transaction-table copy.
# errors="ignore" on drop (and rename's default of skipping missing labels)
# make this cell safe to re-run: on a second execution "Price_product" is
# already gone and a plain drop() would raise KeyError.
# A chained rename replaces inplace=True so the cell is one assignment
# producing a new frame.
transactions_products = (
    transactions_products
    .drop(columns=["Price_product"], errors="ignore")
    .rename(columns={"Price_transaction": "Price"})
)

In [5]:
# Attach customer attributes to every transaction/product row via CustomerID.
transactions_customers = pd.merge(
    transactions_products,
    customers,
    how="inner",  # pandas' default join type, spelled out for the reader
    on="CustomerID",
)

In [6]:
# Display the fully merged table (transactions + product + customer columns)
# as this cell's output.
transactions_customers

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,ProductName,Category,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,Timothy Perez,Europe,2022-03-15
...,...,...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,SoundWave Smartwatch,Electronics,Jacob Holt,South America,2022-01-22
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,SoundWave Smartwatch,Electronics,Mrs. Kimberly Wright,North America,2024-04-07
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,SoundWave Smartwatch,Electronics,Tyler Haynes,North America,2024-09-21
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,SoundWave Smartwatch,Electronics,Joshua Hamilton,Asia,2024-11-11


In [7]:
def _most_frequent(values):
    """Return the first entry of ``values.mode()``.

    When several values tie, pandas returns the modes in sorted order, so the
    first one in that order is chosen.
    """
    return values.mode()[0]


# One output column per input column; the result keeps the input column names.
_customer_aggregations = {
    "TotalValue": "sum",         # total spend across the customer's rows
    "TransactionID": "count",    # number of transaction rows
    "Category": _most_frequent,  # most frequently purchased category
    "Price": "mean",             # mean unit price over rows (not quantity-weighted)
}

# Collapse the transaction-level table to one feature row per customer, with
# CustomerID restored as a regular column.
_transactions_by_customer = transactions_customers.groupby("CustomerID")
customer_features = _transactions_by_customer.agg(_customer_aggregations).reset_index()

In [8]:
# Display the per-customer feature table (one row per CustomerID).
customer_features

Unnamed: 0,CustomerID,TotalValue,TransactionID,Category,Price
0,C0001,3354.52,5,Electronics,278.334000
1,C0002,1862.74,4,Clothing,208.920000
2,C0003,2725.38,4,Home Decor,195.707500
3,C0004,5354.88,8,Books,240.636250
4,C0005,2034.24,3,Electronics,291.603333
...,...,...,...,...,...
194,C0196,4982.88,4,Home Decor,416.992500
195,C0197,1928.65,3,Electronics,227.056667
196,C0198,931.83,2,Clothing,239.705000
197,C0199,1979.28,4,Electronics,250.610000


In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Compute, for the first customers in `customer_features`, their most similar
# other customers by cosine similarity over scaled per-customer features.
TOP_N_LOOKALIKES = 3      # number of lookalikes kept per customer
N_TARGET_CUSTOMERS = 20   # recommendations are built for the first 20 rows only
# NOTE(review): "first 20 rows" follows the groupby order of CustomerID; this is
# C0001..C0020 only if each of those customers has at least one transaction - confirm.

# One-hot encode the favourite category. The guard keeps this cell re-runnable:
# after the first run the "Category" column has been replaced by dummy columns
# and calling get_dummies(columns=["Category"]) again would raise KeyError.
if "Category" in customer_features.columns:
    customer_features = pd.get_dummies(customer_features, columns=["Category"])

# Rescale every feature column (CustomerID excluded) with MinMaxScaler so that
# large-valued columns such as TotalValue do not dominate the cosine similarity.
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(customer_features.drop("CustomerID", axis=1))

# Square matrix: entry [a, b] is the cosine similarity of customers at rows a and b.
similarity_matrix = cosine_similarity(normalized_features)

top_lookalikes = {}
for i, cust_id in enumerate(customer_features["CustomerID"][:N_TARGET_CUSTOMERS]):
    similarities = similarity_matrix[i]
    # Rank all rows from most to least similar and drop the customer itself by
    # index. Slicing off the last argsort position instead would be wrong when
    # another customer ties with (or, through rounding, exceeds) the self-score:
    # the customer could then be listed as its own lookalike.
    similar_indices = [j for j in similarities.argsort()[::-1] if j != i][:TOP_N_LOOKALIKES]
    similar_customers = [(customer_features["CustomerID"].iloc[j], similarities[j]) for j in similar_indices]
    top_lookalikes[cust_id] = similar_customers

In [11]:
# Display the mapping CustomerID -> list of (similar CustomerID, similarity score).
top_lookalikes

{'C0001': [('C0069', np.float64(0.9993289015040403)),
  ('C0154', np.float64(0.9970344086605433)),
  ('C0181', np.float64(0.9967116880893808))],
 'C0002': [('C0029', np.float64(0.9999779985306463)),
  ('C0088', np.float64(0.9960203671461065)),
  ('C0062', np.float64(0.9958309055252705))],
 'C0003': [('C0178', np.float64(0.9979288188044522)),
  ('C0038', np.float64(0.9965444123271779)),
  ('C0189', np.float64(0.9960368918849791))],
 'C0004': [('C0017', np.float64(0.9990949035631695)),
  ('C0101', np.float64(0.9990242826597907)),
  ('C0173', np.float64(0.997510538437538))],
 'C0005': [('C0120', np.float64(0.9993878688167758)),
  ('C0186', np.float64(0.9987446439978199)),
  ('C0192', np.float64(0.9964497450131964))],
 'C0006': [('C0117', np.float64(0.9987221523730044)),
  ('C0135', np.float64(0.9962830912829247)),
  ('C0125', np.float64(0.9959248311706238))],
 'C0007': [('C0115', np.float64(0.9980376817569748)),
  ('C0050', np.float64(0.9976598802069614)),
  ('C0186', np.float64(0.9972057

In [12]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one record per
# (customer, lookalike) pair, preserving the mapping's order.
lookalike_data = [
    {"CustomerID": cust_id, "SimilarCustomerID": similar_cust, "Score": score}
    for cust_id, lookalikes in top_lookalikes.items()
    for similar_cust, score in lookalikes
]

# Build the table once from the collected records, then export it without the
# row index.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Lookalike.csv", index=False)

In [13]:
# Display the exported lookalike table (CustomerID, SimilarCustomerID, Score).
lookalike_df

Unnamed: 0,CustomerID,SimilarCustomerID,Score
0,C0001,C0069,0.999329
1,C0001,C0154,0.997034
2,C0001,C0181,0.996712
3,C0002,C0029,0.999978
4,C0002,C0088,0.99602
5,C0002,C0062,0.995831
6,C0003,C0178,0.997929
7,C0003,C0038,0.996544
8,C0003,C0189,0.996037
9,C0004,C0017,0.999095
