In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from datetime import datetime

In [None]:
# Colab-only helper: opens the browser upload dialog for the input CSVs.
from google.colab import files
# `file` keeps whatever files.upload() returns; no later visible cell reads it.
# NOTE(review): the log printed under this cell shows the uploads were saved with a
# " (1)" suffix, i.e. same-named files already existed in the working directory.
file=files.upload()

Saving Transactions.csv to Transactions (1).csv
Saving Products.csv to Products (1).csv
Saving Customers.csv to Customers (1).csv


In [None]:
# Load the customer, transaction and product tables from the working directory.
# NOTE(review): the upload log shows the new files were saved with a " (1)" suffix, so
# these un-suffixed paths presumably point at copies from an earlier upload -- confirm
# that they are the intended versions.
cust_df, trans_df, prod_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

In [None]:
# Check the loaded data for correct dtypes.

# SignupDate is read in as object (string) dtype --> parse it into datetime64.
cust_df['SignupDate'] = pd.to_datetime(cust_df['SignupDate'])

# Customer age: whole days elapsed between signup and the moment this cell runs.
# The absolute numbers therefore depend on the run date.
today = datetime.today()
signup_age = today - cust_df['SignupDate']
cust_df['Days'] = signup_age.dt.days

In [None]:
# Preview the first rows; the parsed SignupDate and the derived Days column should appear.
cust_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,Days
0,C0001,Lawrence Carroll,South America,2022-07-10,934
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1081
2,C0003,Michael Rivera,South America,2024-03-07,328
3,C0004,Kathleen Rodriguez,South America,2022-10-09,843
4,C0005,Laura Weber,Asia,2022-08-15,898


In [None]:
# Confirm the column dtypes after the conversion (SignupDate should be datetime64).
cust_df.dtypes

Unnamed: 0,0
CustomerID,object
CustomerName,object
Region,object
SignupDate,datetime64[ns]
Days,int64


In [None]:
def engineer_features(customer_id):
    """Summarise one customer's purchase history as a flat feature row.

    Reads the module-level ``trans_df`` (transactions) and ``prod_df``
    (products) frames.

    Parameters
    ----------
    customer_id
        Value matched against ``trans_df['CustomerID']``.

    Returns
    -------
    pandas.Series
        ``TotalSpend`` (sum of TotalValue), ``AvgOrderValue`` (mean of
        TotalValue; NaN when the customer has no transactions),
        ``NumTransactions`` (row count), and the share of the customer's
        transactions in each of the Books / Electronics / Clothing /
        Home Decor categories (0 when a category never occurs).
    """
    own_rows = trans_df.loc[trans_df['CustomerID'] == customer_id]
    order_values = own_rows['TotalValue']

    # Fraction of this customer's transactions per product category.
    category_share = (
        own_rows.merge(prod_df, on='ProductID')['Category']
        .value_counts(normalize=True)
    )

    features = {
        'TotalSpend': order_values.sum(),
        'AvgOrderValue': order_values.mean(),
        'NumTransactions': len(own_rows),
    }
    # Output column name -> category label as spelled in prod_df.
    category_columns = (
        ('Books', 'Books'),
        ('Electronics', 'Electronics'),
        ('Clothing', 'Clothing'),
        ('HomeDecor', 'Home Decor'),
    )
    for column, category in category_columns:
        features[column] = category_share.get(category, 0)

    return pd.Series(features)


# Build one engineered-feature row per customer; the result shares cust_df's row index.
customer_features = cust_df.apply(
    lambda customer: engineer_features(customer['CustomerID']),
    axis=1,
)

In [None]:
# Put the customer attributes side by side with the engineered features
# (column-wise concat, aligned on the shared row index).
features_df = pd.concat(
    [cust_df[['CustomerID', 'Region', 'Days']], customer_features],
    axis='columns',
)

In [None]:
# One-hot encode Region, then discard every row that still has a missing value
# (AvgOrderValue is NaN for a customer with no transactions).
# dropna() keeps the surviving rows' original index labels -- gaps are not renumbered.
features_df = pd.get_dummies(features_df, columns=['Region']).dropna()


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
# Standardise every feature column; CustomerID is an identifier, so it is left out.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features_df.drop(columns=['CustomerID']))


In [None]:
# Calculate pairwise cosine similarity between all rows of normalized_features.
# Entry [i, j] compares the rows at POSITIONS i and j of normalized_features.
similarity_matrix = cosine_similarity(normalized_features)


In [None]:
# Find the top-N lookalikes (3 by default) for a customer
def get_top_lookalikes(customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Reads the module-level ``features_df`` and ``similarity_matrix``; row
    position ``i`` of ``features_df`` must correspond to row/column ``i`` of
    ``similarity_matrix``.

    Parameters
    ----------
    customer_id
        Value looked up in ``features_df['CustomerID']``.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values <= 0 give an empty list.

    Returns
    -------
    list of (customer_id, float)
        Up to ``top_n`` pairs, highest similarity first. The queried customer
        is never part of the result. Ties are broken by row order.

    Raises
    ------
    IndexError
        If ``customer_id`` has no row in ``features_df`` (for example because
        its row contained missing values and was dropped).
    """
    # Resolve the customer's row POSITION. The frame's index labels cannot be
    # used here: once rows have been dropped, labels no longer match positions,
    # whereas similarity_matrix is purely positional.
    is_customer = (features_df['CustomerID'] == customer_id).to_numpy()
    matches = np.flatnonzero(is_customer)
    if matches.size == 0:
        raise IndexError(
            f"Customer {customer_id!r} has no row in features_df; "
            "cannot compute lookalikes."
        )
    customer_pos = int(matches[0])

    similarities = np.asarray(similarity_matrix[customer_pos], dtype=float)

    # Exclude the customer explicitly rather than assuming it sorts first:
    # another customer with identical features also scores 1.0.
    candidates = np.flatnonzero(np.arange(similarities.shape[0]) != customer_pos)

    # Stable sort on the negated scores -> descending order, deterministic ties.
    ranking = np.argsort(-similarities[candidates], kind='stable')
    top_positions = candidates[ranking[:max(int(top_n), 0)]]

    customer_ids = features_df['CustomerID'].to_numpy()
    # Plain Python floats keep the stringified/CSV form free of numpy scalar reprs.
    return [(customer_ids[pos], float(similarities[pos])) for pos in top_positions]

In [None]:
# Generate lookalikes for the first 20 customers, keyed by customer ID.
lookalike_results = {
    customer_id: get_top_lookalikes(customer_id)
    for customer_id in cust_df['CustomerID'][:20]
}


In [None]:
# Create Lookalike.csv: one row per customer, with that customer's list of
# (lookalike_id, score) pairs held in a single 'Lookalikes' column.
lookalike_df = pd.DataFrame(
    list(lookalike_results.items()),
    columns=['CustomerID', 'Lookalikes'],
)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv has been created with the top 3 lookalikes for the first 20 customers.")

Lookalike.csv has been created with the top 3 lookalikes for the first 20 customers.


In [None]:
# Read the exported file back to check what was written.
# The Lookalikes column comes back as text (the printed form of each list).
df=pd.read_csv('Lookalike.csv')
df.head(20)

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0112', 0.8905838613871102), ('C0120', 0.84..."
1,C0002,"[('C0134', 0.9689453807735999), ('C0106', 0.92..."
2,C0003,"[('C0031', 0.9520266703295864), ('C0129', 0.91..."
3,C0004,"[('C0113', 0.9422889273085164), ('C0104', 0.88..."
4,C0005,"[('C0007', 0.9741821894267974), ('C0140', 0.86..."
5,C0006,"[('C0187', 0.9053629297168087), ('C0171', 0.75..."
6,C0007,"[('C0005', 0.9741821894267974), ('C0140', 0.89..."
7,C0008,"[('C0098', 0.8683048144228704), ('C0194', 0.85..."
8,C0009,"[('C0198', 0.8623173814558336), ('C0010', 0.83..."
9,C0010,"[('C0061', 0.9287916492055878), ('C0062', 0.86..."


In [None]:
# To download the CSV from Colab, run the two statements contained in the string below.
# They sit inside a string literal so they do not execute; the cell merely echoes the text.
'''from google.colab import files
files.download('Lookalike.csv')'''

"from google.colab import files\nfiles.download('Lookalike.csv')"