In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the three raw tables (customers, transactions, products) from CSV files
# in the working directory, in that order.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

In [3]:
# Show the dtype of every column in the customers table (rendered as the cell output).
customers.dtypes

CustomerID      object
CustomerName    object
Region          object
SignupDate      object
dtype: object

In [4]:
# Show the dtype of every column in the products table (rendered as the cell output).
products.dtypes

ProductID       object
ProductName     object
Category        object
Price          float64
dtype: object

In [5]:
# Show the dtype of every column in the transactions table (rendered as the cell output).
transactions.dtypes

TransactionID       object
CustomerID          object
ProductID           object
TransactionDate     object
Quantity             int64
TotalValue         float64
Price              float64
dtype: object

In [6]:
# Raise pandas' row-display limit to 1000 so longer frames/series are printed without truncation.
pd.set_option('display.max_rows', 1000)

In [7]:
# Basic Information
# Print a label, then the customers summary (row count, columns, non-null counts, dtypes).
print('Customer Info:')
customers.info()

Customer Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB


In [8]:
# Print a label, then the products summary (row count, columns, non-null counts, dtypes).
print('Product Info:')
products.info()

Product Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.3+ KB


In [9]:
# Print a label, then the transactions summary (row count, columns, non-null counts, dtypes).
print('Transactions Info:')
transactions.info()

Transactions Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price            1000 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 54.8+ KB


In [10]:
# Checking for missing values
# Each table's per-column null counts are printed under a label. The label and
# the Series are formatted into one string so that print() does not insert its
# separator space after the newline, which would shift the first row of the
# Series out of alignment with the rows below it.
for table_name, frame in {
    "Customers": customers,
    "Products": products,
    "Transactions": transactions,
}.items():
    print(f"\nMissing values in {table_name}:\n{frame.isnull().sum()}")


 Missing values in Customers: 
 CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64

 Missing values in Products: 
 ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64

 Missing valus in Transacions 
 TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [12]:
# Descriptive statistics
# describe() output is printed under a label for each table. The label and the
# frame are formatted into one string so that print() does not insert its
# separator space after the newline, which would shift the header row one
# character to the right of the data rows.
for table_name, frame in {
    "Customers": customers,
    "Products": products,
    "Transactions": transactions,
}.items():
    print(f"\n{table_name} Statistics:\n{frame.describe()}")


 Customers Statistics: 
        CustomerID      CustomerName         Region  SignupDate
count         200               200            200         200
unique        200               200              4         179
top         C0001  Lawrence Carroll  South America  2024-11-11
freq            1                 1             59           3

 Products Statistics: 
             Price
count  100.000000
mean   267.551700
std    143.219383
min     16.080000
25%    147.767500
50%    292.875000
75%    397.090000
max    497.760000

 Transactions Statistics:
           Quantity   TotalValue       Price
count  1000.000000  1000.000000  1000.00000
mean      2.537000   689.995560   272.55407
std       1.117981   493.144478   140.73639
min       1.000000    16.080000    16.08000
25%       2.000000   295.295000   147.95000
50%       3.000000   588.880000   299.93000
75%       4.000000  1011.660000   404.40000
max       4.000000  1991.040000   497.76000


In [13]:
# Data preprocessing
# Attach product attributes (ProductName, Category, Price) to every transaction.
# Both tables carry a "Price" column, so the merge yields "Price_x" (from
# transactions) and "Price_y" (from products).
#
# The guard makes the cell safe to re-run: merging a second time would rename
# "Category" to "Category_x"/"Category_y" and break the later aggregation.
# validate="many_to_one" raises if a ProductID appears more than once in
# `products`, which would otherwise silently duplicate transaction rows.
if "Category" not in transactions.columns:
    transactions = transactions.merge(
        products, on="ProductID", how="left", validate="many_to_one"
    )


In [25]:
# Derive TransactionAmount = Quantity * Price_x, unless the column already exists.
# A KeyError is raised when either input column is absent.
if 'TransactionAmount' not in transactions.columns:
    inputs_present = {'Quantity', 'Price_x'}.issubset(transactions.columns)
    if not inputs_present:
        raise KeyError("Columns required to compute 'TransactionAmount' are missing (e.g., 'Quantity' and 'Price_x').")
    transactions['TransactionAmount'] = transactions['Quantity'] * transactions['Price_x']

In [26]:
# Collapse transactions to one row per customer:
#   TotalPurchases - count of transaction rows (non-null ProductID values)
#   TotalSpent     - sum of TransactionAmount
#   Category       - most frequent category (first mode), or None if there is none
def _first_mode(values):
    """Return the first modal value of a Series, or None when it has no mode."""
    modes = values.mode()
    return None if modes.empty else modes[0]


customer_transactions = (
    transactions
    .groupby("CustomerID")
    .agg(
        TotalPurchases=("ProductID", "count"),
        TotalSpent=("TransactionAmount", "sum"),
        Category=("Category", _first_mode),
    )
    .reset_index()
)

In [27]:
# Left-join the per-customer transaction features onto the customer profiles,
# keeping every customer even if no transaction features exist for them.
customer_data = pd.merge(
    customers,
    customer_transactions,
    how="left",
    on="CustomerID",
)

In [28]:
# Replace gaps left by the left join: zero purchases/spend and an "Unknown" category.
missing_defaults = {"TotalPurchases": 0, "TotalSpent": 0, "Category": "Unknown"}
customer_data = customer_data.fillna(missing_defaults)

In [29]:
# Encode categorical features (e.g., Category)
# One-hot encodes the "Category" column; drop_first=True omits the indicator
# column for the first category level.
# NOTE(review): the similarity step builds its feature list only from
# Age/Income/TotalPurchases/TotalSpent, so these indicator columns do not
# appear to feed the similarity computation — confirm whether that is intended.
customer_data = pd.get_dummies(customer_data, columns=["Category"], drop_first=True)

In [32]:
# Step 2: Similarity computation
# Choose the numeric feature columns. Age and Income are used only when both
# are present; otherwise fall back to whichever purchase features exist.
has_demographics = all(col in customer_data.columns for col in ('Age', 'Income'))
if has_demographics:
    numeric_features = ["Age", "Income", "TotalPurchases", "TotalSpent"]
else:
    print("Columns 'Age' and/or 'Income' are missing. Defaulting to available features.")
    numeric_features = [
        feature
        for feature in ("TotalPurchases", "TotalSpent")
        if feature in customer_data.columns
    ]


Columns 'Age' and/or 'Income' are missing. Defaulting to available features.


In [33]:
# Fit a StandardScaler on the selected numeric columns and write the
# standardized values back into customer_data in place of the raw ones.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(customer_data[numeric_features])
customer_data[numeric_features] = standardized_values


In [34]:
# Pairwise cosine similarity between customers over the standardized numeric
# features; entry [i][j] is the similarity of row i to row j.
feature_matrix = customer_data[numeric_features]
similarity_matrix = cosine_similarity(feature_matrix)


In [35]:
# Step 3: Generate lookalikes for the first 20 customers
# For each of the first 20 rows, rank all OTHER customers by similarity and
# keep the top 3 as (CustomerID, score) pairs.
#
# The customer's own row is excluded by position rather than by assuming it
# sorts first: other customers can tie at a similarity of 1.0, and
# floating-point error can leave the self-similarity marginally below 1.0, in
# which case slicing off "the first entry" would drop a real match and keep
# the customer as its own lookalike.
lookalike_map = {}
all_customer_ids = customer_data["CustomerID"].to_numpy()
for i, customer_id in enumerate(all_customer_ids[:20]):
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    # list.sort is stable, so ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalike_map[customer_id] = [
        (all_customer_ids[j], score) for j, score in candidates[:3]
    ]


In [37]:
# Split data into train-test sets
# train_test_split is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
# 20% of the customers are held out; random_state fixes the shuffle so the
# split is reproducible.
train_customers, test_customers = train_test_split(customer_data, test_size=0.2, random_state=42)


In [38]:
# Generate lookalikes for test customers
def generate_lookalikes(customer_id, similarity_matrix, customer_data, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Value to look up in ``customer_data["CustomerID"]``.
        similarity_matrix: Square matrix whose row/column ``k`` corresponds to
            the ``k``-th row (by position) of ``customer_data``.
        customer_data: DataFrame with a ``"CustomerID"`` column. Its index may
            be arbitrary; rows are addressed by position.
        top_n: Number of lookalikes to return.

    Returns:
        A list of ``(CustomerID, score)`` tuples sorted by descending score,
        with scores rounded to 4 decimals. The queried customer itself is
        never included. Ties keep their original row order.

    Raises:
        IndexError: If ``customer_id`` is not present in ``customer_data``.
    """
    # Locate the row by POSITION: the matrix and .iloc-style lookups are
    # positional, so an index label would be wrong for any non-default index.
    matches = (customer_data["CustomerID"] == customer_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_data")
    row = int(matches[0])

    all_ids = customer_data["CustomerID"].to_numpy()
    # Exclude the customer explicitly instead of dropping the first ranked
    # entry: another customer can tie with (or, through floating-point error,
    # outrank) the self-similarity, so self is not guaranteed to sort first.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[row]) if j != row
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [(all_ids[j], round(score, 4)) for j, score in candidates[:top_n]]


In [39]:
# Map every held-out (test) customer to its list of lookalikes.
lookalike_map = {
    test_customer_id: generate_lookalikes(test_customer_id, similarity_matrix, customer_data)
    for test_customer_id in test_customers["CustomerID"]
}


In [40]:
# Step 4: Quality metrics
# Mean similarity scores for recommendations
# Average over the scores actually present rather than assuming exactly three
# lookalikes per customer, so the metric stays correct if a customer has fewer
# recommendations or top_n changes. An empty map yields NaN instead of raising
# ZeroDivisionError.
all_scores = [
    score for lookalikes in lookalike_map.values() for _, score in lookalikes
]
mean_similarity = sum(all_scores) / len(all_scores) if all_scores else float("nan")

In [41]:
# Coverage: share of test customers that have an entry in lookalike_map.
n_test_customers = len(test_customers)
coverage = len(lookalike_map) / n_test_customers

# Diversity: how many distinct customers appear anywhere among the recommendations.
recommended_ids = {
    rec[0] for lookalikes in lookalike_map.values() for rec in lookalikes
}
unique_recommendations = len(recommended_ids)

In [42]:
# Report the three quality metrics, one per line.
metric_lines = (
    f"Mean Similarity Score: {mean_similarity:.4f}",
    f"Recommendation Coverage: {coverage:.2%}",
    f"Diversity of Recommendations: {unique_recommendations}",
)
for metric_line in metric_lines:
    print(metric_line)

Mean Similarity Score: 0.9966
Recommendation Coverage: 100.00%
Diversity of Recommendations: 76


In [44]:
# Step 5: Save results for further manual inspection
# One row per customer; the lookalike list is stored as its string form.
# The path is held in a single variable so the confirmation message always
# names the file that was actually written.
# NOTE(review): "Looklike.csv" looks like a typo of "Lookalike.csv" — the
# written file name is kept unchanged here; confirm before renaming.
output_path = "Looklike.csv"
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map.keys()),
    "Lookalikes": [
        # Convert scores to plain floats first: NumPy 2 renders NumPy scalars
        # as "np.float64(...)" inside containers, which would leak into the CSV.
        str([(lookalike_id, float(score)) for lookalike_id, score in lookalikes])
        for lookalikes in lookalike_map.values()
    ],
})
lookalike_df.to_csv(output_path, index=False)

print(f"Evaluation completed. Results saved in {output_path}.")

Evaluation completed. Results saved in Lookalike_Evaluation.csv.
