In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [8]:
# Load datasets
customers = pd.read_csv('C:\Data-Science-Assignment-eCommerce-Transactions-Dataset\Dataset\Customers .csv')
products = pd.read_csv('C:\Data-Science-Assignment-eCommerce-Transactions-Dataset\Dataset\Products.csv')
transactions = pd.read_csv('C:\Data-Science-Assignment-eCommerce-Transactions-Dataset\Dataset\Transactions.csv')

# Merge transactions with products to include product details
transactions_products = pd.merge(transactions, products, on='ProductID', how='left')

# Merge with customers to include customer details
data = pd.merge(transactions_products, customers, on='CustomerID', how='left')

In [9]:
data[['Price_x', 'Price_y']].head()

Unnamed: 0,Price_x,Price_y
0,300.68,300.68
1,300.68,300.68
2,300.68,300.68
3,300.68,300.68
4,300.68,300.68


## There is no difference between Price_x and Price_y.So we can remove any 1

In [10]:
# Keep Price_y as the correct Price column
data['Price'] = data['Price_y']  # Rename Price_y to Price

# Drop Price_x and Price_y to avoid confusion
data.drop(columns=['Price_x', 'Price_y'], inplace=True)

# Verify the updated DataFrame
print(data.columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'ProductName', 'Category', 'CustomerName',
       'Region', 'SignupDate', 'Price'],
      dtype='object')


In [11]:
# Aggregate transaction data to create customer profiles
customer_profiles = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spending
    'Quantity': 'sum',    # Total quantity purchased
    'Price': 'mean',      # Average price of items purchased
    'Category': lambda x: ' '.join(x.astype(str))  # Categories of products purchased
}).reset_index()

# Inspect the customer profiles
print(customer_profiles.head())

  CustomerID  TotalValue  Quantity       Price  \
0      C0001     3354.52        12  278.334000   
1      C0002     1862.74        10  208.920000   
2      C0003     2725.38        14  195.707500   
3      C0004     5354.88        23  240.636250   
4      C0005     2034.24         7  291.603333   

                                            Category  
0  Books Home Decor Electronics Electronics Elect...  
1            Home Decor Home Decor Clothing Clothing  
2         Home Decor Home Decor Clothing Electronics  
3  Books Home Decor Home Decor Home Decor Books B...  
4                 Home Decor Electronics Electronics  


In [12]:
# One-hot encode the Category column
categories_onehot = customer_profiles['Category'].str.get_dummies(sep=' ')
customer_profiles = pd.concat([customer_profiles, categories_onehot], axis=1)

# Drop the original 'Category' column
customer_profiles.drop(columns=['Category'], inplace=True)

# Inspect the final customer profiles
print(customer_profiles.head())

  CustomerID  TotalValue  Quantity       Price  Books  Clothing  Decor  \
0      C0001     3354.52        12  278.334000      1         0      1   
1      C0002     1862.74        10  208.920000      0         1      1   
2      C0003     2725.38        14  195.707500      0         1      1   
3      C0004     5354.88        23  240.636250      1         0      1   
4      C0005     2034.24         7  291.603333      0         0      1   

   Electronics  Home  
0            1     1  
1            0     1  
2            1     1  
3            1     1  
4            1     1  


In [13]:
# Normalize the numerical features
scaler = StandardScaler()
numerical_features = ['TotalValue', 'Quantity', 'Price']
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])

# Inspect normalized profiles
print(customer_profiles.head())

  CustomerID  TotalValue  Quantity     Price  Books  Clothing  Decor  \
0      C0001   -0.061701 -0.122033  0.094670      1         0      1   
1      C0002   -0.877744 -0.448000 -0.904016      0         1      1   
2      C0003   -0.405857  0.203934 -1.094109      0         1      1   
3      C0004    1.032547  1.670787 -0.447702      1         0      1   
4      C0005   -0.783929 -0.936951  0.285581      0         0      1   

   Electronics  Home  
0            1     1  
1            0     1  
2            1     1  
3            1     1  
4            1     1  


In [14]:
# Extract feature matrix for similarity calculation
features = customer_profiles.drop(columns=['CustomerID']).values

# Compute cosine similarity
similarity_matrix = cosine_similarity(features)

# Convert the similarity matrix into a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles['CustomerID'], columns=customer_profiles['CustomerID'])

# Inspect similarity matrix
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.460710  0.620905  0.647716  0.743160  0.688551   
C0002       0.460710  1.000000  0.836309  0.120724  0.608911  0.391806   
C0003       0.620905  0.836309  1.000000  0.517006  0.566163  0.359511   
C0004       0.647716  0.120724  0.517006  1.000000  0.081906  0.449691   
C0005       0.743160  0.608911  0.566163  0.081906  1.000000  0.428317   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.712708  0.714148  0.254506  0.188211  ...  0.698224  0.523961   
C0002       0.355025  0.453444  0.459548  0.649903  ...  0.063342  0.406903   
C0003       0.344167  0.768225  0.303603  0.574909  ...  0.252704  0.403190   
C0004       0.109391  0.895204 -0.448097  0.083254  ...  0.283126 -0.029686   
C0005  

In [15]:
# Define the list of first 20 customers
first_20_customers = customers['CustomerID'][:20]

# Create a dictionary to store lookalike recommendations
lookalike_recommendations = {}

for customer in first_20_customers:
    # Get similarity scores for the current customer
    similarities = similarity_df[customer]
    
    # Find the top 3 similar customers (excluding the current customer)
    top_similar = similarities.sort_values(ascending=False).iloc[1:4]
    
    # Store results in the dictionary
    lookalike_recommendations[customer] = list(zip(top_similar.index, top_similar.values))

# Inspect recommendations
print(lookalike_recommendations)

{'C0001': [('C0152', np.float64(0.9840747534870863)), ('C0047', np.float64(0.9743347715237054)), ('C0064', np.float64(0.9733438868355152))], 'C0002': [('C0062', np.float64(0.9879562559926719)), ('C0144', np.float64(0.9712912534049278)), ('C0159', np.float64(0.9581070853251885))], 'C0003': [('C0106', np.float64(0.9621388024035196)), ('C0166', np.float64(0.9202976457032854)), ('C0091', np.float64(0.9166636215040813))], 'C0004': [('C0012', np.float64(0.9667278429035313)), ('C0018', np.float64(0.9605484448837079)), ('C0075', np.float64(0.9450002768853324))], 'C0005': [('C0140', np.float64(0.9917200040362902)), ('C0069', np.float64(0.9522869816711161)), ('C0199', np.float64(0.9489593427936945))], 'C0006': [('C0079', np.float64(0.9890410958928005)), ('C0139', np.float64(0.9714404587450668)), ('C0187', np.float64(0.9542973356508766))], 'C0007': [('C0005', np.float64(0.9164211120642998)), ('C0085', np.float64(0.9163649694370852)), ('C0146', np.float64(0.905512396700941))], 'C0008': [('C0017', 

In [16]:
### Saving the CSV file
lookalike_list = []

for customer, recommendations in lookalike_recommendations.items():
    for similar_customer, score in recommendations:
        lookalike_list.append({
            'CustomerID': customer,
            'SimilarCustomerID': similar_customer,
            'SimilarityScore': score
        })

lookalike_df = pd.DataFrame(lookalike_list)

# Save to CSV
lookalike_df.to_csv('amit_rai_Lookalike.csv', index=False)