In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from google.colab import files

# Colab upload step; the recorded output below shows Customers.csv and
# Products.csv being saved. `uploaded` itself is not referenced again in the
# cells shown here -- the data is read back from disk instead.
uploaded = files.upload()

# Load both tables from /content (presumably where the upload step stored
# them -- confirm if running outside Colab).
customers_df = pd.read_csv('/content/Customers.csv')
products_df = pd.read_csv('/content/Products.csv')

Saving Customers.csv to Customers.csv
Saving Products.csv to Products.csv


In [3]:
# Quick look at the customer table's column names before building features.
print(customers_df.columns)

Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')


In [4]:
# Preprocessing setup: no real transaction data is loaded in the cells shown
# here, so the following cells fabricate a placeholder history by assigning
# random products to each customer.
# Seed Python's `random` module so the sampled histories are repeatable when
# the cells are run in order from a fresh kernel.
import random
random.seed(42)

In [5]:
# Build a placeholder purchase history: every customer gets up to five product
# IDs sampled without replacement, stored as one comma-separated string.
product_ids = products_df['ProductID'].tolist()


def _mock_history(_customer_id):
    """Return a comma-joined random sample of product IDs (the customer ID is ignored)."""
    sample_size = min(len(product_ids), 5)
    return ','.join(random.sample(product_ids, sample_size))


customers_df['TransactionHistory'] = customers_df['CustomerID'].apply(_mock_history)

In [6]:
# merging product information into transaction history
def enrich_transaction_history(transaction_history, products=None):
    """Translate a comma-separated product-ID string into a string of categories.

    Parameters
    ----------
    transaction_history : str
        Product IDs joined by commas, e.g. ``"P001,P017"``. A non-string value
        (``None``/NaN) is treated as an empty history.
    products : pandas.DataFrame, optional
        Table with ``ProductID`` and ``Category`` columns. Defaults to the
        notebook-level ``products_df`` so existing one-argument calls keep
        working.

    Returns
    -------
    str
        Categories of the matching products separated by single spaces, in
        the row order of ``products``. Empty string when nothing matches.
    """
    if products is None:
        products = products_df
    # A missing history has no products to look up.
    if not isinstance(transaction_history, str):
        return ''
    wanted_ids = transaction_history.split(',')
    matched = products.loc[products['ProductID'].isin(wanted_ids), 'Category']
    # Skip missing categories and coerce to str: ' '.join raises TypeError on
    # NaN/None or any other non-string value.
    return ' '.join(matched.dropna().astype(str))

# Attach the category text for each customer's sampled products.
transaction_histories = customers_df['TransactionHistory']
customers_df['EnrichedHistory'] = transaction_histories.apply(enrich_transaction_history)

In [7]:
# Assemble the per-customer text that the vectorizer will consume.
def combine_features(row):
    """Join name, region, signup date and enriched history into one space-separated string.

    ``row`` is anything indexable by the four column names (a DataFrame row
    or a plain mapping).
    """
    fields = ('CustomerName', 'Region', 'SignupDate', 'EnrichedHistory')
    return '{} {} {} {}'.format(*(row[field] for field in fields))

# Build the combined text row by row.
customers_df['CombinedFeatures'] = customers_df.apply(combine_features, axis='columns')

In [8]:
# Turn each customer's combined text into a TF-IDF vector.
vectorizer = TfidfVectorizer()
combined_text = customers_df['CombinedFeatures']
feature_matrix = vectorizer.fit_transform(combined_text)

# Cosine similarity of the customer vectors against each other.
similarity_matrix = cosine_similarity(feature_matrix)

In [9]:
# Generate lookalike recommendations for the first 20 customers: for each one,
# keep the 3 other customers with the highest similarity score.
lookalikes = {}
customer_ids = customers_df['CustomerID'][:20]
# Positional lookup table: row i of similarity_matrix was built from row i of
# customers_df, so a plain list maps matrix positions back to customer IDs.
all_customer_ids = customers_df['CustomerID'].tolist()
for idx, cust_id in enumerate(customer_ids):
    # Exclude the customer by position rather than assuming it sorts first:
    # if an earlier row ties the self-similarity score, the stable sort would
    # place that row at position 0 and a [1:4] slice would keep the customer
    # as its own lookalike.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Highest similarity first; ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Store plain floats so the stringified export does not contain
    # numpy scalar reprs such as "np.float64(0.42)".
    lookalikes[cust_id] = [
        (all_customer_ids[other_idx], float(score))
        for other_idx, score in candidates[:3]
    ]

In [14]:
# One row per customer; the list of (CustomerID, score) tuples is stored as
# its string representation in the 'Lookalikes' column.
output_path = 'Asmita_Verma_Lookalike.csv'
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': str(lookalike_list)}
    for cust_id, lookalike_list in lookalikes.items()
])
lookalike_df.to_csv(output_path, index=False)

# Report the file that was actually written (the message previously named a
# different file than the one passed to to_csv).
print(f"Lookalike model completed. Results saved to '{output_path}'.")

Lookalike model completed. Results saved to 'Lookalike.csv'.
