In [12]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances

In [2]:
# Read the three input tables (customers, products, transactions) from CSV.
# The files are read in the order listed and unpacked into matching names.
customers, products, transactions = map(
    pd.read_csv,
    (
        '/content/Customers.csv',
        '/content/Products.csv',
        '/content/Transactions.csv',
    ),
)

In [4]:
# Build one unified table: every transaction row enriched with its customer
# and product attributes. Both joins are inner joins (the pandas default),
# so transactions without a matching customer or product are dropped.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner')
    .merge(products, on='ProductID', how='inner')
)

In [5]:
# Feature engineering: customer x product matrix of total quantity purchased.
# pivot_table defaults to aggfunc='mean', which would average Quantity over a
# customer's repeat purchases of the same product; aggregate with 'sum' so each
# cell holds the total quantity bought. Customer/product pairs with no purchase
# are filled with 0.
customer_product_matrix = merged_data.pivot_table(
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0
)

In [6]:
# Standardize each product column of the customer x product matrix.
# The fitted scaler is kept in `scaler` so it stays available after this cell.
scaler = StandardScaler()
customer_product_matrix_scaled = scaler.fit(customer_product_matrix).transform(
    customer_product_matrix
)

In [7]:
# Pairwise cosine similarity between the standardized customer vectors.
similarity_matrix = cosine_similarity(customer_product_matrix_scaled)

# Wrap the raw array in a DataFrame labelled by CustomerID on both axes so
# that similarities can be looked up by customer id.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_product_matrix.index,
    columns=customer_product_matrix.index,
)

In [8]:
# Function to get top 3 similar customers for a given customer
def get_top_similar(customers_df, customer_id, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Similarity scores; the column labelled ``customer_id`` is read and its
        index labels are taken as the candidate customer ids.
    customer_id : hashable
        Column label of the customer to find lookalikes for.
    n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    pandas.Series
        Up to ``n`` similarity scores in descending order, indexed by the
        similar customers' ids. ``customer_id`` itself is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``customers_df``.
    """
    scores = customers_df[customer_id]
    # Exclude the customer explicitly instead of assuming it sorts first:
    # another customer can tie with (or, through floating-point rounding,
    # exceed) the self-similarity, in which case skipping position 0 would
    # drop a real neighbour and return the customer as its own lookalike.
    others = scores[scores.index != customer_id]
    # A stable sort keeps tied scores in their original index order, so the
    # result is deterministic.
    return others.sort_values(ascending=False, kind='stable').iloc[:n]


In [9]:
# Create a lookalike dataset for CustomerID: C0001 - C0020
# NOTE(review): this takes the first 20 ids present in the matrix index; that
# matches C0001 - C0020 only if each of those customers has a row in the
# matrix -- confirm against the data.
lookalike_results = {}
for customer_id in customer_product_matrix.index[:20]:
    top_similars = get_top_similar(similarity_df, customer_id)
    # Cast each score to a built-in float before rounding: round() on a NumPy
    # scalar returns a NumPy scalar, and under NumPy >= 2 its repr is
    # "np.float64(...)", which would leak into any str()/repr() of this list.
    lookalike_results[customer_id] = [
        (similar_id, round(float(score), 4))
        for similar_id, score in zip(top_similars.index, top_similars.values)
    ]


In [10]:
# Flatten the lookalike mapping into one row per customer. The list of
# (lookalike id, score) pairs is stored as its string representation in the
# 'Lookalikes' column.
lookalike_rows = []
for cust_id, lookalikes in lookalike_results.items():
    lookalike_rows.append({'CustomerID': cust_id, 'Lookalikes': str(lookalikes)})

# Write the table to CSV without the positional index column.
lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv('Aditya_Sherkhane_Lookalike.csv', index=False)

print("Lookalike model results saved to 'Aditya_Sherkhane_Lookalike.csv'")

Lookalike model results saved to 'Aditya_Sherkhane_Lookalike.csv'


In [13]:
# Consistency check: recompute cosine similarity as 1 - cosine distance and
# report the mean squared difference from the matrix computed earlier.
# This only compares two computations of the same quantity, so a value near
# zero shows they agree; it does not measure the quality of the lookalikes.
similarity_matrix_manual = 1 - pairwise_distances(
    customer_product_matrix_scaled,
    metric='cosine',
)
squared_difference = (similarity_matrix - similarity_matrix_manual) ** 2
mse = squared_difference.mean()
print(f"Mean Squared Error between similarity matrices: {mse}")

Mean Squared Error between similarity matrices: 3.98633983366674e-33
