In [1]:
# Colab-only setup: mount Google Drive at /content/drive so that the CSV files
# read later in this notebook (under /content/drive/MyDrive) are accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

In [3]:
# Read the three input tables from the mounted Drive folder.
# The generator is unpacked in order: customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Customers.csv',
        '/content/drive/MyDrive/Products.csv',
        '/content/drive/MyDrive/Transactions.csv',
    )
)

In [4]:
# Enrich every transaction row with its customer and product attributes.
# Both joins are left joins, so each transaction is kept even when the
# customer or product lookup finds no match.
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

In [5]:
# Feature Engineering

# 1. Per-customer transaction summary: one row per CustomerID with spend,
#    activity and product-variety statistics.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        num_transactions=pd.NamedAgg(column='TransactionID', aggfunc='nunique'),
        num_products_bought=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
        avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    )
    .reset_index()
)

In [6]:
# 2. Add customer profile features (Region, SignupDate)
# Take an explicit copy of the column subset: assigning columns on a slice of
# `customers` is what raises pandas' SettingWithCopyWarning.
customer_profile = customers[['CustomerID', 'Region', 'SignupDate']].copy()

# Convert SignupDate to number of days since signup
customer_profile['SignupDate'] = pd.to_datetime(customer_profile['SignupDate'])
# NOTE: the reference point is the run date, so the raw day counts change from
# one day to the next.
customer_profile['days_since_signup'] = (pd.to_datetime('today') - customer_profile['SignupDate']).dt.days

# Merge profile features with transaction features.
# customer_features is the left side of a left join, so only customers that
# have a row in customer_features appear in customer_data.
customer_data = pd.merge(customer_features, customer_profile, on='CustomerID', how='left')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customer_profile['SignupDate'] = pd.to_datetime(customer_profile['SignupDate'])


In [7]:
# 3. Standardize the numeric feature columns
# Only the columns listed here are rescaled; the scaled values overwrite the
# originals in customer_data.
numerical_features = [
    'total_spent',
    'num_transactions',
    'num_products_bought',
    'avg_transaction_value',
    'days_since_signup',
]
scaler = StandardScaler()
customer_data[numerical_features] = scaler.fit_transform(
    customer_data[numerical_features]
)

In [8]:
# 4. Prepare the data for similarity calculation
# Index rows by CustomerID so rows can be looked up by customer.
# The guard makes this cell safe to re-run: once CustomerID is the index it is
# no longer a column, and calling set_index again would raise KeyError.
if customer_data.index.name != 'CustomerID':
    customer_data = customer_data.set_index('CustomerID')

In [9]:
# 5. Pairwise cosine similarity between customers
# Row/column order of the result follows the row order of customer_data;
# only the numeric feature columns take part in the comparison.
similarity_matrix = cosine_similarity(customer_data.loc[:, numerical_features])

In [10]:
# 6. Find the most similar customers for each of the first 20 customers
NUM_TARGET_CUSTOMERS = 20  # targets are C0001 .. C0020
TOP_N = 3                  # lookalikes reported per target customer

lookalikes = defaultdict(list)
customer_ids = customer_data.index  # CustomerID labels, in similarity_matrix row order

for n in range(1, NUM_TARGET_CUSTOMERS + 1):
    customer_id = f'C{n:04}'

    # Resolve the matrix row by CustomerID rather than assuming row n-1 belongs
    # to this customer: a customer with no row in customer_data would shift
    # every later row and silently pair the ID with someone else's scores.
    if customer_id not in customer_ids:
        print(f'Skipping {customer_id}: not present in customer_data')
        continue
    similarities = similarity_matrix[customer_ids.get_loc(customer_id)]

    # Pair every other customer with its score (excluding self), best first
    candidates = [
        (other_id, score)
        for other_id, score in zip(customer_ids, similarities)
        if other_id != customer_id
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the TOP_N most similar customers
    lookalikes[customer_id] = candidates[:TOP_N]

# Flatten the dictionary into one row per (customer, lookalike) pair
lookalike_rows = [
    (target_id, match_id, score)
    for target_id, matches in lookalikes.items()
    for match_id, score in matches
]
lookalike_df = pd.DataFrame(
    lookalike_rows,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)

# Save the results to Lookalike.csv
lookalike_df.to_csv('Ayush_Raj_Lookalike.csv', index=False)

# Show the first few rows of the lookalike recommendations
print(lookalike_df.head())


  CustomerID LookalikeCustomerID  SimilarityScore
0      C0001               C0152         0.999290
1      C0001               C0160         0.964656
2      C0001               C0134         0.931192
3      C0002               C0029         0.995701
4      C0002               C0192         0.980302
