In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Read the three raw tables from CSV files in the working directory
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Build one wide table: every transaction row is first extended with the
# columns of its product, then with the columns of the purchasing customer
merged_data = pd.merge(
    pd.merge(transactions, products, on='ProductID'),
    customers,
    on='CustomerID',
)

Feature Engineering

In [16]:

# Collapse the merged transactions to one row per customer:
#   TotalValue -> summed over the customer's transactions
#   Quantity   -> summed over the customer's transactions
#   Category   -> every purchased category joined into one space-separated string
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Category=('Category', lambda categories: ' '.join(categories)),
    )
    .reset_index()
)

In [17]:
# Attach the aggregated purchase features to each customer's profile row,
# matching on CustomerID
customer_profiles = pd.merge(customers, customer_features, on='CustomerID')



# Step 2: Vectorization
# Use TF-IDF to vectorize the product categories



In [18]:
# Fit a TF-IDF vectorizer on each customer's joined category string and
# transform those strings into a term-weight matrix (one row per customer).
# NOTE(review): with default settings the vectorizer presumably splits text into
# individual words, so a multi-word category name would become several separate
# terms — confirm category names are single words or supply a custom tokenizer.
tfidf = TfidfVectorizer()
category_matrix = tfidf.fit_transform(customer_profiles['Category'])

In [19]:
# Rescale the two numeric columns so they are comparable with the TF-IDF weights
scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(customer_profiles[['TotalValue', 'Quantity']])

# Put the scaled numeric columns and the dense TF-IDF columns side by side.
# Both frames are freshly built with the default 0..n-1 index, so joining on
# the index pairs row i with row i.
combined_features = pd.DataFrame(
    numerical_features,
    columns=['Scaled_TotalValue', 'Scaled_Quantity'],
).join(pd.DataFrame(category_matrix.toarray()))



Calculate Similarity Scores

In [20]:
# Pairwise cosine similarity between all rows of combined_features; entry
# [i][j] compares the feature row of customer i with that of customer j.
similarity_matrix = cosine_similarity(combined_features)
# Print the matrix for a quick visual sanity check
print(similarity_matrix)



[[1.         0.42148609 0.7058767  ... 0.58889663 0.8500374  0.6878206 ]
 [0.42148609 1.         0.917673   ... 0.42456906 0.69687393 0.82660654]
 [0.7058767  0.917673   1.         ... 0.44278997 0.91281466 0.828748  ]
 ...
 [0.58889663 0.42456906 0.44278997 ... 1.         0.40903885 0.6886093 ]
 [0.8500374  0.69687393 0.91281466 ... 0.40903885 1.         0.65115796]
 [0.6878206  0.82660654 0.828748   ... 0.6886093  0.65115796 1.        ]]


Generate Lookalikes


In [21]:
def get_top_3_lookalikes(customer_index, similarity_matrix, customer_ids, top_n=3):
    """Return the most similar other customers for one customer.

    Parameters
    ----------
    customer_index : int
        Row position of the query customer in ``similarity_matrix``.
    similarity_matrix : array-like with ``argsort`` on its rows
        Square matrix of pairwise similarity scores.
    customer_ids : sequence
        Customer identifiers, positionally aligned with the matrix rows.
    top_n : int, default 3
        Number of lookalikes to return.

    Returns
    -------
    list of (customer_id, score) tuples
        At most ``top_n`` entries, highest similarity first. Scores are
        rounded to 4 decimals and returned as built-in ``float``.
    """
    row = similarity_matrix[customer_index]
    ranked = row.argsort()[::-1]  # positions ordered by descending similarity

    # Drop the query customer by position instead of assuming it sorts first:
    # another customer with an identical (or rounding-equal) score can tie with
    # the self-similarity, in which case slicing off element 0 would keep the
    # query customer in its own list and lose a genuine lookalike.
    neighbours = [int(i) for i in ranked if i != customer_index][:top_n]

    # float(...) converts NumPy scalars to built-in floats so that str() of the
    # result prints plain numbers regardless of the installed NumPy version.
    return [(customer_ids[i], float(round(row[i], 4))) for i in neighbours]

# Generate lookalike recommendations for customers C0001 - C0020
customer_ids = customer_profiles['CustomerID'].tolist()

# Select the target customers by ID rather than by row position: the row order
# of customer_profiles is not guaranteed to start with C0001..C0020 (rows can
# be missing after the joins), and indexing the first 20 positions would fail
# outright if there were fewer than 20 rows.
target_ids = [f'C{n:04d}' for n in range(1, 21)]
position_by_id = {cust_id: pos for pos, cust_id in enumerate(customer_ids)}

lookalike_results = {}
for cust_id in target_ids:
    pos = position_by_id.get(cust_id)
    if pos is None:
        # No feature row exists for this customer, so no similarity can be computed
        print(f"Skipping {cust_id}: not present in customer_profiles")
        continue
    lookalike_results[cust_id] = get_top_3_lookalikes(pos, similarity_matrix, customer_ids)

# Save lookalike results to a CSV file in the required format.
# Scores are coerced to built-in floats before str() so the text written to the
# file contains plain numbers even if the scores are NumPy scalars.
lookalike_map = [
    {
        'CustomerID': cust_id,
        'Lookalikes': str([(other_id, float(score)) for other_id, score in lookalikes]),
    }
    for cust_id, lookalikes in lookalike_results.items()
]

# Explicit columns keep the CSV header intact even when no target was found
lookalike_df = pd.DataFrame(lookalike_map, columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Bhuvana_Samala_Lookalike.csv', index=False)

print("Lookalike recommendations saved successfully!")

Lookalike recommendations saved successfully!
