<h3>1-2. Import Libraries and Load Data</h3>

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

# Read the three source tables from CSV files in the working directory.
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')

# Attach customer attributes, then product attributes, to every transaction.
# Inner joins keep only transactions whose CustomerID and ProductID both find
# a match; a column present on both sides of a join (e.g. Price) is suffixed
# with _x / _y by pandas.
merged_df = (
    transactions_df
    .merge(customers_df, on="CustomerID", how="inner")
    .merge(products_df, on="ProductID", how="inner")
)

# Preview the merged table
merged_df.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


<h3>3. Aggregate Transaction Data for Each Customer</h3>

In [2]:
def _most_frequent(values):
    """Return the first entry of ``values.mode()``, i.e. the most frequent value."""
    return values.mode()[0]


# Collapse the transaction-level table to one row per customer:
# total and average spend, number of transactions, and favourite category.
customer_agg = (
    merged_df
    .groupby('CustomerID')
    .agg(
        total_spend=('TotalValue', 'sum'),
        purchase_count=('TransactionID', 'count'),
        avg_spent_per_transaction=('TotalValue', 'mean'),
        most_common_category=('Category', _most_frequent),
    )
    .reset_index()
)

# Bring in each customer's Region next to the aggregated behaviour.
# The default inner join keeps only customers that appear in both tables.
customer_features = customers_df[['CustomerID', 'Region']].merge(customer_agg, on='CustomerID')

# Preview the per-customer feature table
customer_features.head()


Unnamed: 0,CustomerID,Region,total_spend,purchase_count,avg_spent_per_transaction,most_common_category
0,C0001,South America,3354.52,5,670.904,Electronics
1,C0002,Asia,1862.74,4,465.685,Clothing
2,C0003,South America,2725.38,4,681.345,Home Decor
3,C0004,South America,5354.88,8,669.36,Books
4,C0005,Asia,2034.24,3,678.08,Electronics


<h3>4. One-hot Encoding (Region)</h3>

In [3]:
# One-hot encode Region, keeping an indicator column for EVERY region.
# drop_first=True is deliberately not used: it removes the first region's
# indicator, so customers from that region would get an all-zero region
# vector and two of them would receive no "same region" contribution to the
# cosine similarity, unlike customers from any other region.
# dtype=float keeps the indicators numeric (0.0 / 1.0) alongside the scaled
# features instead of booleans.
customer_features = pd.get_dummies(customer_features, columns=['Region'], dtype=float)

<h3>5. Separate Numerical and Categorical Features</h3>

In [4]:
# Behavioural features that live on very different scales and therefore need
# to be standardised before computing similarities.
numerical_columns = ['total_spend', 'purchase_count', 'avg_spent_per_transaction']

# Indicator columns created by the Region one-hot encoding step; they are
# recognised by the 'Region_' text in their names.
categorical_columns = list(filter(lambda name: 'Region_' in name, customer_features.columns))

# Pull out just the numeric block; the indicator columns are left unscaled.
numerical_data = customer_features.loc[:, numerical_columns]

# Fit the scaler on the numeric block and transform that same block.
scaler = StandardScaler()
scaled_numerical_data = scaler.fit(numerical_data).transform(numerical_data)

<h3>6. Combine Scaled Numerical Features with Categorical Features</h3>

In [5]:
# Wrap the scaled array back into a DataFrame. The index is copied from
# customer_features explicitly: pd.concat(axis=1) aligns rows by index label,
# so a fresh 0..n-1 index here would silently misalign rows (and introduce
# NaNs) whenever customer_features does not carry a default RangeIndex.
scaled_customer_features = pd.DataFrame(
    scaled_numerical_data,
    columns=numerical_columns,
    index=customer_features.index,
)
categorical_data = customer_features[categorical_columns]

# Combine scaled numerical features with the categorical indicator columns,
# side by side, one row per customer.
final_customer_features = pd.concat([scaled_customer_features, categorical_data], axis=1)

<h3>7. Calculate the Cosine Similarity Matrix</h3>

In [6]:
# Pairwise cosine similarity between every pair of customer feature rows;
# with only X given, the rows are compared against themselves.
similarity_matrix = cosine_similarity(X=final_customer_features)

<h3>8. Convert the Similarity Matrix to a DataFrame for Easy Access</h3>

In [7]:
# Label both axes with the customer IDs so a score can be looked up by ID
# rather than by matrix position.
_id_labels = customer_features['CustomerID']
similarity_df = pd.DataFrame(data=similarity_matrix, index=_id_labels, columns=_id_labels)

<h3>9. Get the Top 3 Similar Customers for Each of the First 20 Customers (C0001 to C0020)</h3>

In [8]:
# Maps each target customer ID to a list of (lookalike_id, score) tuples.
lookalikes = {}

# Only the first 20 rows of the feature table are used as targets.
for customer_id in customer_features['CustomerID'].iloc[:20]:
    # Scores of this customer against everyone; the customer's own entry is
    # removed so it cannot be picked as its own lookalike.
    similarity_scores = similarity_df.loc[:, customer_id].drop(customer_id)

    # Three highest-scoring other customers, best first.
    top_3_similar = similarity_scores.nlargest(3)

    lookalikes[customer_id] = list(top_3_similar.items())

<h3>10. Convert the Lookalikes Data into a DataFrame for Output</h3>

In [9]:
# Flatten the {customer: [(lookalike, score), ...]} mapping into one record
# per (customer, lookalike) pair, then build the output table in one go.
lookalike_records = [
    {'CustomerID': customer_id, 'LookalikeCustomerID': similar_id, 'SimilarityScore': score}
    for customer_id, similar_customers in lookalikes.items()
    for similar_id, score in similar_customers
]

lookalike_df = pd.DataFrame(lookalike_records)

<h3>11. Save the results</h3>

In [11]:
# Write the lookalike table to 'Lookalike.csv' in the working directory,
# leaving the DataFrame's positional index out of the file.
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

# Output: Lookalike.csv now holds the lookalike recommendations for the first
# 20 customers (expected to be C0001 to C0020).
