In [2]:
# Download the three input CSVs from Google Drive into the working directory.
# Per the run log of this cell, the files are saved (in this order) as
# Customers.csv, Products.csv and Transactions.csv.
!gdown "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"
!gdown "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
!gdown "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Feature engineering: attach product and customer attributes to every transaction.
merged_df = pd.merge(transactions, products, on='ProductID', how='left')
merged_df = pd.merge(merged_df, customers, on='CustomerID', how='left')

# Build one text snippet per transaction from region, category and product name.
# The left merges leave NaN wherever a transaction has no matching product or
# customer row; `NaN + str` stays NaN and `' '.join` would then raise TypeError,
# so missing parts are replaced by empty strings before concatenating.
text_parts = merged_df[['Region', 'Category', 'ProductName']].fillna('').astype(str)
merged_df['combined_features'] = (
    text_parts['Region'] + ' ' + text_parts['Category'] + ' ' + text_parts['ProductName']
)

# One document per customer: all of that customer's transaction snippets joined.
# Customers without any transaction do not appear in this Series.
customer_features = merged_df.groupby('CustomerID')['combined_features'].apply(' '.join)


# TF-IDF vectorization: convert each customer's document to a numerical vector.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(customer_features)

# Pairwise cosine similarity between customers; row/column order follows
# the index of `customer_features`.
cosine_sim = cosine_similarity(tfidf_matrix)


def recommend_customers(customer_id, cosine_sim_matrix, customer_features_df, top_n=3):
    """Recommend the customers most similar to ``customer_id``.

    Args:
        customer_id: Label of the customer to look up in
            ``customer_features_df.index``.
        cosine_sim_matrix: Square similarity matrix whose rows/columns are
            positioned in the same order as ``customer_features_df.index``.
        customer_features_df: pandas object whose index holds the customer IDs.
        top_n: Maximum number of lookalikes to return.

    Returns:
        A list of ``(customer_id, score)`` tuples sorted by descending score,
        never containing ``customer_id`` itself. An empty list is returned
        (after printing a message) when the customer is unknown, or when
        ``top_n`` is not positive.
    """
    try:
        own_position = customer_features_df.index.get_loc(customer_id)
    except KeyError:
        print(f"Customer ID {customer_id} not found.")
        return []

    if top_n <= 0:
        return []

    # Drop the customer's own entry by position rather than assuming it sorts
    # first: another customer with an equal score (e.g. identical features)
    # could otherwise come first in the stable sort and push the customer into
    # their own recommendation list.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim_matrix[own_position])
        if position != own_position
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Translate matrix positions back to customer IDs.
    return [
        (customer_features_df.index[position], score)
        for position, score in candidates[:top_n]
    ]

# Collect lookalike recommendations for the first 20 customers listed in
# Customers.csv. `.head(20)` simply yields fewer rows when the file is shorter,
# whereas positional `.iloc[x]` over `range(20)` would raise IndexError.
data = []
for customer_id_to_recommend in customers["CustomerID"].head(20):
    recommendations = recommend_customers(customer_id_to_recommend, cosine_sim, customer_features)

    print(f"Recommendations for customer {customer_id_to_recommend}:")
    for customer_id, score in recommendations:
        data.append({"custid": customer_id_to_recommend, "lookalike_custid": customer_id, "score": score})
        print(f"Customer ID: {customer_id}, Similarity Score: {score}")
    print()

# Write one row per (customer, lookalike) pair. Explicit columns keep the CSV
# header present even if no recommendation was produced.
pd.DataFrame(data, columns=["custid", "lookalike_custid", "score"]).to_csv(
    "Likitha_Baddepalli_Lookalike.csv", index=False
)

Downloading...
From: https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE
To: /content/Customers.csv
100% 8.54k/8.54k [00:00<00:00, 26.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0
To: /content/Products.csv
100% 4.25k/4.25k [00:00<00:00, 14.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF
To: /content/Transactions.csv
100% 54.7k/54.7k [00:00<00:00, 79.7MB/s]
Recommendations for customer C0001:
Customer ID: C0039, Similarity Score: 0.8965344367652105
Customer ID: C0190, Similarity Score: 0.888469196466387
Customer ID: C0096, Similarity Score: 0.887270644941772

Recommendations for customer C0002:
Customer ID: C0173, Similarity Score: 0.9129770184473479
Customer ID: C0022, Similarity Score: 0.8841360179639877
Customer ID: C0162, Similarity Score: 0.8449190544679047

Recommendations for customer C0003:
Customer ID: C0181, Similarity Score: 0.9055768454266909
Customer ID: C0085, Similarit