In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd


In [10]:
# Read the raw customer and transaction tables from the working directory.
customers_df, transactions_df = (
    pd.read_csv(csv_name) for csv_name in ('Customers.csv', 'Transactions.csv')
)


In [11]:
# Collapse the transaction log to one row per CustomerID with three features:
# summed TotalValue, number of transactions, and number of distinct products.
customer_transactions = (
    transactions_df
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        unique_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    )
)


In [12]:
# Attach the aggregated purchase features to the customer table.
# Inner join (the pandas default): customers with no transactions are dropped.
customer_profiles = customers_df.merge(
    customer_transactions,
    how='inner',
    on='CustomerID',
)

In [13]:
# Standardise the three purchase features so that no single one dominates
# the similarity computation purely because of its scale.
feature_columns = ['total_spent', 'transaction_count', 'unique_products']
scaler = StandardScaler()
scaled_profiles = scaler.fit_transform(customer_profiles[feature_columns])


In [14]:
# Pairwise cosine similarity between every pair of scaled customer profiles;
# row i / column j corresponds to rows i and j of customer_profiles.
similarity_matrix = cosine_similarity(X=scaled_profiles)

In [15]:
def get_top_3_similar_customers(customer_id, similarity=None, profiles=None):
    """Return the three customers most similar to the one at a given row position.

    Parameters
    ----------
    customer_id : int
        Positional row index into the similarity matrix / profile table.
        Despite the name, this is NOT the 'CustomerID' string.
    similarity : 2-D array-like, optional
        Square similarity matrix indexed by row position. Defaults to the
        notebook-level ``similarity_matrix``.
    profiles : pandas.DataFrame, optional
        Table with a 'CustomerID' column whose row order matches
        ``similarity``. Defaults to the notebook-level ``customer_profiles``.

    Returns
    -------
    list of (CustomerID, score) tuples
        At most three entries, ordered by descending similarity. The queried
        customer is never included.

    Raises
    ------
    IndexError
        If ``customer_id`` is outside the range of rows in ``similarity``.
    """
    if similarity is None:
        similarity = similarity_matrix
    if profiles is None:
        profiles = customer_profiles

    sim_scores = similarity[customer_id]
    # Normalise negative positions (and reject out-of-range ones) so the
    # self-exclusion below compares like with like.
    self_position = range(len(sim_scores))[customer_id]

    # Exclude the queried customer by position instead of assuming it sorts
    # first: an exact tie, float rounding, or an all-zero profile can put
    # another customer ahead of it, in which case slicing [1:4] would return
    # the customer as its own look-alike and drop a genuine match.
    ranked = sorted(
        (
            (position, score)
            for position, score in enumerate(sim_scores)
            if position != self_position
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [
        (profiles.iloc[position]['CustomerID'], score)
        for position, score in ranked[:3]
    ]


In [16]:
# Accumulator for the flattened look-alike rows written out below.
lookalike_data = list()

In [17]:
# Build one output row for each of the first 20 customer profiles (fewer if
# the profile table is shorter). The list is rebuilt from scratch here so that
# re-running this cell does not append duplicate rows.
lookalike_data = []
for customer_id in range(min(20, len(customer_profiles))):
    # NOTE: customer_id is a positional row index, not the 'CustomerID' string.
    similar_customers = get_top_3_similar_customers(customer_id)

    # Flatten [(CustomerID, Score), ...] into
    # [Original, CustomerID_1, Score_1, CustomerID_2, Score_2, CustomerID_3, Score_3].
    row = [customer_profiles.iloc[customer_id]['CustomerID']]
    for similar_id, score in similar_customers:
        row.extend([similar_id, score])
    # Pad with None when fewer than three look-alikes exist so every row
    # keeps the same seven-column layout.
    row.extend([None, None] * (3 - len(similar_customers)))
    lookalike_data.append(row)

In [18]:
# Column layout: the source customer, then (id, score) pairs for each of its
# three look-alikes, in rank order.
lookalike_columns = [
    "Original_CustomerID",
    "CustomerID_1", "Score_1",
    "CustomerID_2", "Score_2",
    "CustomerID_3", "Score_3",
]
lookalike_df = pd.DataFrame(data=lookalike_data, columns=lookalike_columns)


In [19]:
# Persist the look-alike table; the row index is omitted from the file.
lookalike_df.to_csv(path_or_buf='Abhi_Lookalike.csv', index=False)

In [20]:
# Plain-text preview of the first five rows of the look-alike table.
preview = lookalike_df.head(5)
print(preview)

  Original_CustomerID CustomerID_1   Score_1 CustomerID_2   Score_2  \
0               C0001        C0137  0.996332        C0152  0.986905   
1               C0002        C0029  0.999758        C0199  0.999347   
2               C0003        C0178  0.999949        C0112  0.999570   
3               C0004        C0021  0.999903        C0075  0.999775   
4               C0005        C0073  0.999966        C0144  0.999954   

  CustomerID_3   Score_3  
0        C0056  0.930427  
1        C0010  0.999182  
2        C0131  0.999570  
3        C0108  0.999315  
4        C0095  0.999947  
