In [26]:
!pip install pandas scikit-learn
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')
customers = pd.read_csv('/content/Customers.csv')


customer_transactions = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
}).reset_index()
customer_data = customers.merge(customer_transactions, on='CustomerID', how='left')
customer_data['TotalValue'].fillna(0, inplace=True)
customer_data['Quantity'].fillna(0, inplace=True)
customer_data = pd.get_dummies(customer_data, columns=['Region'], drop_first=True)
customer_product_matrix = transactions.pivot_table(
    index='CustomerID', columns='ProductID', values='Quantity', fill_value=0
)
common_customer_ids = customer_data['CustomerID'].isin(customer_product_matrix.index)
customer_data = customer_data[common_customer_ids]
customer_product_matrix = customer_product_matrix.loc[customer_data['CustomerID']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_data[['TotalValue', 'Quantity']])
customer_features = np.hstack([scaled_features, customer_data.drop(columns=['CustomerID', 'CustomerName', 'SignupDate']).values])
customer_features = np.hstack([customer_features, customer_product_matrix.values])
similarity_matrix = cosine_similarity(customer_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_data['CustomerID'], columns=customer_data['CustomerID'])
lookalike_map = {}

for customer_id in customer_data['CustomerID'][:20]:
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]
    lookalike_map[customer_id] = list(zip(similar_customers.index, similar_customers.values))

lookalike_df = pd.DataFrame([{'cust_id': k, 'lookalikes': v} for k, v in lookalike_map.items()])
lookalike_df.to_csv('/content/Lookalike1.csv', index=False)
lookalike_df.head(20)




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_data['TotalValue'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_data['Quantity'].fillna(0, inplace=True)


Unnamed: 0,cust_id,lookalikes
0,C0001,"[(C0065, 0.9999986147207881), (C0194, 0.999998..."
1,C0002,"[(C0030, 0.9999954623713694), (C0109, 0.999995..."
2,C0003,"[(C0181, 0.9999971198296798), (C0008, 0.999996..."
3,C0004,"[(C0175, 0.9999982314165351), (C0065, 0.999998..."
4,C0005,"[(C0096, 0.9999981246476936), (C0055, 0.999997..."
5,C0006,"[(C0040, 0.9999991728180859), (C0196, 0.999998..."
6,C0007,"[(C0079, 0.9999982540112695), (C0118, 0.999997..."
7,C0008,"[(C0165, 0.9999983622167101), (C0084, 0.999997..."
8,C0009,"[(C0140, 0.9999960205548885), (C0198, 0.999995..."
9,C0010,"[(C0091, 0.9999919411461613), (C0034, 0.999991..."




No charts were generated by quickchart
