In [2]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Folder that holds the three input CSVs. Keeping it in one constant means the
# notebook can be pointed at another machine/location by editing a single line.
DATA_DIR = Path(r"C:\Users\ziniy\Downloads")

# Load the datasets
customers = pd.read_csv(DATA_DIR / "Customers.csv")
products = pd.read_csv(DATA_DIR / "Products.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

# Inspect datasets.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after each report.
customers.info()
products.info()
transactions.info()

# Check for missing values (per-column null counts)
print(customers.isnull().sum())
print(products.isnull().sum())
print(transactions.isnull().sum())

# Merge datasets for a comprehensive view: one row per transaction, enriched
# with the matching customer and product columns. The join type is spelled out
# (inner is the pandas default), so transactions without a matching
# CustomerID / ProductID are dropped rather than kept with NaNs.
data = (
    transactions
    .merge(customers, on="CustomerID", how="inner")
    .merge(products, on="ProductID", how="inner")
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ProductID    100 non-null    object 
 1   ProductName  100 non-null    object 
 2   Category     100 non-null    object 
 3   Price        100 non-null    float64
dtypes: float64(1), object(3)
memory usage: 3.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------

In [4]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# `data` is the merged transactions/customers/products frame built in the
# loading cell; this cell only reads its CustomerID, TotalValue and Quantity
# columns.

# Number of lookalikes reported per customer.
TOP_N = 3

# Feature engineering (TotalValue and Quantity per customer)
customer_features = data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
}).reset_index()

# Standardize the features so both contribute on a comparable scale
scaler = StandardScaler()
features_normalized = scaler.fit_transform(customer_features[['TotalValue', 'Quantity']])

# Pairwise cosine similarity; row i / column j compares customer i with customer j
similarity_matrix = cosine_similarity(features_normalized)

# Positional lookup of customer IDs, aligned with the rows of similarity_matrix
customer_ids = customer_features['CustomerID'].to_numpy()

# Map each customer to their TOP_N most similar *other* customers
lookalikes = {}

for i, customer_id in enumerate(customer_ids):
    scores = similarity_matrix[i]

    # Rank all customers by descending similarity (stable, so ties keep row
    # order), then drop the customer's own index explicitly. Slicing off the
    # first entry instead is only correct when the self-match sorts first,
    # which a tie or floating-point rounding can break, making a customer
    # their own lookalike.
    ranked = [j for j in np.argsort(-scores, kind='stable') if j != i]
    lookalikes[customer_id] = [(customer_ids[j], scores[j]) for j in ranked[:TOP_N]]

# Flatten to one row per customer: CustomerID, Lookalike1, Score1, ...
lookalike_list = []

for customer_id, similar_customers in lookalikes.items():
    row = {'CustomerID': customer_id}
    for rank in range(TOP_N):
        # With fewer than TOP_N + 1 customers there are not enough neighbours;
        # pad the missing slots instead of raising IndexError.
        if rank < len(similar_customers):
            match_id, score = similar_customers[rank]
        else:
            match_id, score = None, np.nan
        row[f'Lookalike{rank + 1}'] = match_id
        row[f'Score{rank + 1}'] = score
    lookalike_list.append(row)

# Create a DataFrame from the list of dictionaries
lookalike_df = pd.DataFrame(lookalike_list)

# Output the DataFrame to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

# Display a sample of the output for verification
print(lookalike_df.head())


  CustomerID Lookalike1    Score1 Lookalike2    Score2 Lookalike3    Score3
0      C0001      C0085  0.999999      C0042  0.999822      C0089  0.999785
1      C0002      C0157  0.999994      C0166  0.999875      C0029  0.999825
2      C0003      C0111  0.994008      C0160  0.990455      C0147  0.987638
3      C0004      C0162  1.000000      C0165  0.999959      C0090  0.998641
4      C0005      C0080  0.999982      C0167  0.999975      C0177  0.999928


In [5]:
import pandas as pd

# Save the lookalike table computed by the model cell above.
#
# This cell previously rebuilt `lookalike_df` from five hand-typed rows and
# stored them in a dict named `data`. That overwrote the full Lookalike.csv
# with a 5-customer subset and replaced the merged `data` DataFrame, so the
# model cell could no longer be re-run. The computed frame is written as-is
# instead.
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model output saved to 'Lookalike.csv'")


Lookalike model output saved to 'Lookalike.csv'


In [7]:
# Export a copy of the lookalike table to the Downloads folder.
# The path is held in a variable so the confirmation message always reports
# the real destination (the old message claimed "Desktop").
output_path = r"C:\Users\ziniy\Downloads\Lookalike.csv"
lookalike_df.to_csv(output_path, index=False)

print(f"Lookalike model output saved to {output_path}")


Lookalike model output saved to your Desktop.
