# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Convert dates to datetime format

In [3]:
# Parse the date columns into datetime64 values.
customers['SignupDate'] = customers['SignupDate'].pipe(pd.to_datetime)
transactions['TransactionDate'] = transactions['TransactionDate'].pipe(pd.to_datetime)

# Merge datasets

In [4]:
# One row per transaction, enriched with the buyer's and the product's attributes.
# Inner joins (the pandas default) keep only transactions whose CustomerID and
# ProductID are present in the respective lookup tables.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner')
)

# Inspect merged data columns

In [6]:
# List the merged frame's columns. Both transactions and products carry a
# 'Price' column, so the merge disambiguates them with the default suffixes:
# Price_x comes from the left side (transactions), Price_y from products.
print(merged_data.columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [7]:
# Rebuild merged_data from the raw tables. This repeats the merge performed in
# the earlier cell, which resets the frame to its freshly-merged state.
merged_data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)

In [8]:
# Show each input table's columns to see which names the two tables share.
for frame in (products, transactions):
    print(frame.columns)

Index(['ProductID', 'ProductName', 'Category', 'Price'], dtype='object')
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')


In [10]:
# Per-transaction unit price (TotalValue / Quantity).
# A Quantity of 0 is masked to NaN before dividing so the result is NaN rather
# than +/-inf: an infinite value would propagate into the per-customer mean and
# make StandardScaler raise on non-finite input, whereas NaN is skipped by the
# groupby mean.
nonzero_quantity = merged_data['Quantity'].where(merged_data['Quantity'] != 0)
merged_data['AvgPrice'] = merged_data['TotalValue'] / nonzero_quantity

In [11]:
# Aggregate transactions into one feature row per customer:
#   TotalSpent     - sum of TotalValue
#   TotalQuantity  - sum of Quantity
#   AvgPrice       - mean of the per-transaction AvgPrice computed above
#   UniqueProducts - number of distinct ProductIDs purchased
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpent=('TotalValue', 'sum'),
        TotalQuantity=('Quantity', 'sum'),
        AvgPrice=('AvgPrice', 'mean'),
        UniqueProducts=('ProductID', 'nunique'),
    )
    .reset_index()
)

# Merge with customer information 

In [14]:
# Attach the aggregated features to every customer. The left join keeps
# customers that have no transactions; their feature values come back as NaN.
customer_profiles = customers.merge(customer_features, on='CustomerID', how='left')

# Treat "no transactions" as zero activity, but only in the aggregated feature
# columns. A blanket fillna(0) would also write 0 into any missing
# CustomerName / Region / SignupDate value and corrupt those columns.
feature_columns = [col for col in customer_features.columns if col != 'CustomerID']
customer_profiles[feature_columns] = customer_profiles[feature_columns].fillna(0)

# Normalize features for similarity calculation

In [15]:
# Standardize the numeric features so each contributes on a comparable scale
# to the similarity computation.
features_to_scale = ['TotalSpent', 'TotalQuantity', 'AvgPrice', 'UniqueProducts']
scaler = StandardScaler().fit(customer_profiles[features_to_scale])
customer_profiles[features_to_scale] = scaler.transform(customer_profiles[features_to_scale])

# Compute similarity matrix

In [16]:
# Pairwise cosine similarity between customers' scaled feature vectors.
# Row/column k of the matrix corresponds to the k-th row of customer_profiles.
customer_ids = customer_profiles.loc[:, 'CustomerID']
customer_features_scaled = customer_profiles.loc[:, features_to_scale]
similarity_matrix = cosine_similarity(customer_features_scaled)

# Find the top 3 lookalikes for the first 20 customers

In [17]:
# For each of the first N_TARGET_CUSTOMERS customers, collect the
# TOP_N_LOOKALIKES most similar other customers as (CustomerID, score) pairs.
N_TARGET_CUSTOMERS = 20
TOP_N_LOOKALIKES = 3

# Work on a plain array so lookups are positional and line up with the rows of
# similarity_matrix; indexing the Series with [j] is label-based and only
# matches row positions while the index is a default RangeIndex.
customer_id_values = customer_ids.to_numpy()

lookalike_results = {}
for i, customer_id in enumerate(customer_id_values[:N_TARGET_CUSTOMERS]):
    # Exclude self-similarity. Scores are converted to built-in floats so that
    # str()/repr() of the result does not depend on the NumPy scalar repr
    # (NumPy >= 2 prints np.float64(...) for NumPy scalars).
    similarity_scores = [
        (customer_id_values[j], float(score))
        for j, score in enumerate(similarity_matrix[i])
        if j != i
    ]
    # sorted() is stable, so ties keep their original row order.
    similarity_scores = sorted(
        similarity_scores, key=lambda pair: pair[1], reverse=True
    )[:TOP_N_LOOKALIKES]
    lookalike_results[customer_id] = similarity_scores

# Save results to Lookalike.csv

In [18]:
# One row per target customer; the list of (CustomerID, score) pairs is stored
# as its string representation in the 'Lookalikes' column.
lookalike_df = pd.DataFrame(
    [(target_id, str(matches)) for target_id, matches in lookalike_results.items()],
    columns=['CustomerID', 'Lookalikes'],
)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Example Output

In [19]:
# Preview the first five rows of the saved lookalike table.
print(lookalike_df.head(n=5))

  CustomerID                                         Lookalikes
0      C0001  [('C0137', 0.9701731901666669), ('C0103', 0.93...
1      C0002  [('C0029', 0.9998288399426515), ('C0077', 0.99...
2      C0003  [('C0010', 0.9512746713612762), ('C0111', 0.92...
3      C0004  [('C0075', 0.997040053875391), ('C0175', 0.985...
4      C0005  [('C0130', 0.9976868953366967), ('C0128', 0.99...
