In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [16]:
# Read the customer master table and the transaction log, in that order.
# Both paths are relative to the notebook's working directory; adjust them if the files live elsewhere.
customers_df, transactions_df = (
    pd.read_csv(csv_path) for csv_path in ("Customers.csv", "Transactions.csv")
)


In [17]:
# Show the first rows of each input table, customers first and transactions second.
print(customers_df.head(), transactions_df.head(), sep="\n")


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  TransactionID CustomerID ProductID      TransactionDate  Quantity   
0        T00001      C0199      P067  2024-08-25 12:38:23         1  \
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  


In [18]:
# Trim leading/trailing whitespace from the transaction column labels so that
# name-based lookups in later cells (e.g. 'CustomerID', 'TotalValue') match exactly.
transactions_df.columns = (
    transactions_df.columns
    .str.strip()
)

In [19]:
# Customer x product matrix: one row per CustomerID, one column per ProductID,
# each cell holding the summed TotalValue of that customer's purchases of that product.
# Customer/product pairs with no purchase are filled with 0.
user_item_matrix = pd.pivot_table(
    transactions_df,
    values='TotalValue',
    index='CustomerID',
    columns='ProductID',
    aggfunc='sum',
    fill_value=0,
)

In [20]:
# The customer x product spend matrix is already built by the preceding cell; this
# cell used to repeat that pivot verbatim, recomputing an identical result.
# Instead of rebuilding it, confirm the matrix is usable before it is scaled.
assert not user_item_matrix.empty, "user_item_matrix is empty - check Transactions.csv"


In [21]:
# Standardise every product column of the spend matrix.
# The fitted scaler is kept in `scaler`; the scaled values go to `user_item_matrix_scaled`.
scaler = StandardScaler().fit(user_item_matrix)
user_item_matrix_scaled = scaler.transform(user_item_matrix)


In [22]:
# Pairwise cosine similarity between all rows of the scaled matrix
# (rows correspond to customers, in the order of user_item_matrix.index).
similarity_matrix = cosine_similarity(X=user_item_matrix_scaled)


In [23]:
# Wrap the raw similarity array in a DataFrame labelled by customer ID on both axes,
# so scores can be looked up by ID rather than by position.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)


In [24]:
def get_top_lookalikes(customer_id, similarity_df, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Label of the customer to find lookalikes for; must be a
            column label of ``similarity_df``.
        similarity_df: DataFrame of pairwise similarity scores whose columns
            and rows are labelled by customer ID.
        top_n: Maximum number of lookalikes to return (default 3). Values
            below 1 yield an empty list.

    Returns:
        A list of ``(customer_id, lookalike_id, score)`` tuples ordered from
        the highest score to the lowest. The queried customer itself is never
        part of the result.

    Raises:
        KeyError: If ``customer_id`` is not a column of ``similarity_df``.
    """
    # Exclude the customer by label rather than by position: the self-score is
    # not guaranteed to sort first (another customer can tie at 1.0, or floating
    # point rounding can leave the self-score marginally below a tied score).
    candidates = similarity_df[customer_id].drop(labels=customer_id, errors="ignore")

    # A stable sort keeps tied scores in their original index order, so the
    # output is deterministic.
    ranked = candidates.sort_values(ascending=False, kind="mergesort")
    top_matches = ranked.iloc[:max(top_n, 0)]

    return [
        (customer_id, lookalike_id, score)
        for lookalike_id, score in zip(top_matches.index, top_matches.values)
    ]


In [25]:
# Collect (customer, lookalike, score) rows for the first 20 customers listed in
# customers_df.
lookalike_results = []
for customer_id in customers_df['CustomerID'].head(20):
    # similarity_df only has entries for customers that appear in the transaction
    # pivot; a customer with no transactions would otherwise raise a KeyError here.
    if customer_id not in similarity_df.columns:
        print(f"Skipping {customer_id}: no transactions, so no similarity scores")
        continue
    lookalike_results.extend(get_top_lookalikes(customer_id, similarity_df))


In [26]:
# Tabulate the collected tuples: one row per (customer, lookalike) pair with its score.
lookalike_df = pd.DataFrame(
    data=lookalike_results,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)


In [27]:
# Persist the lookalike table to disk; the row index is omitted from the file.
lookalike_df.to_csv(
    'Lookalike.csv',
    index=False,
)


In [28]:
# Preview the first five lookalike recommendations.
print(lookalike_df.head(n=5))


  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0194         0.404928
1      C0001               C0104         0.374002
2      C0001               C0020         0.366609
3      C0002               C0030         0.404617
4      C0002               C0091         0.383778
