**Task 2: Lookalike Model**

In [4]:
import pandas as pd

In [5]:
# Load the three raw CSV exports into DataFrames.
# NOTE(review): these are absolute paths on one specific machine; the notebook
# will not run elsewhere until they are pointed at the local copies of the files.
customers_csv = "C:/Users/91949/Downloads/Customers - Customers.csv"
products_csv = "C:/Users/91949/Downloads/Products.csv"
transactions_csv = "C:/Users/91949/Downloads/Transactions - Transactions.csv"

customers = pd.read_csv(customers_csv)
products = pd.read_csv(products_csv)
transactions = pd.read_csv(transactions_csv)

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [7]:
# Parse the date columns into pandas datetimes, one (frame, column) pair at a
# time; each column is overwritten in place on its own frame.
for frame, date_col in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [8]:
# Build one wide frame per transaction: first attach the customer's attributes,
# then the product's attributes. Both joins are inner joins (pandas' default),
# spelled out here so the row-dropping behaviour for unmatched keys is visible.
transactions_with_customers = transactions.merge(customers, on='CustomerID', how='inner')
merged_data = transactions_with_customers.merge(products, on='ProductID', how='inner')

In [10]:
# Schema check on the merged frame: column names, dtypes and non-null counts.
# The output lists two price columns, Price_x and Price_y: '_x'/'_y' are pandas'
# default merge suffixes for a column name present on both sides of a join
# (here presumably `Price` in both the transactions and products files).
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   TransactionID    1000 non-null   object        
 1   CustomerID       1000 non-null   object        
 2   ProductID        1000 non-null   object        
 3   TransactionDate  1000 non-null   datetime64[ns]
 4   Quantity         1000 non-null   int64         
 5   TotalValue       1000 non-null   float64       
 6   Price_x          1000 non-null   float64       
 7   CustomerName     1000 non-null   object        
 8   Region           1000 non-null   object        
 9   SignupDate       1000 non-null   datetime64[ns]
 10  ProductName      1000 non-null   object        
 11  Category         1000 non-null   object        
 12  Price_y          1000 non-null   float64       
dtypes: datetime64[ns](2), float64(3), int64(1), object(7)
memory usage: 101.7+ KB


In [11]:
# Feature engineering: collapse the merged transactions to one row per
# CustomerID holding the summed TotalValue, the summed Quantity and the mean
# Price_x. as_index=False keeps CustomerID as the first regular column.
customer_features = merged_data.groupby('CustomerID', as_index=False).agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    Price_x=('Price_x', 'mean'),
)

In [12]:
# Standardise the numeric features so no single column dominates the
# similarity computation. CustomerID is an identifier, not a feature, so it is
# removed before fitting; row order is unchanged.
numeric_features = customer_features.drop(columns=['CustomerID'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_features)

In [15]:
# Similarity calculation: pairwise cosine similarity between every pair of rows
# of scaled_features. Rows are in the same order as customer_features, so
# similarity_matrix[i, j] compares the customers at positions i and j there.
similarity_matrix = cosine_similarity(scaled_features)

In [17]:
import numpy as np

In [18]:
#recommendations for first 20 customers
# For each target customer (a row position in customer_features) keep the top_k
# OTHER customers with the highest cosine similarity, as (CustomerID, score).
n_targets = min(20, len(customer_features))  # never index past the last customer
top_k = 3
customer_ids = customer_features['CustomerID'].to_numpy()

lookalikes = {}
for i in range(n_targets):
    # Rank all customers by descending similarity (stable sort => ties are
    # resolved deterministically by row position). The customer itself is
    # removed by index instead of assuming it sorts first: an exact tie, an
    # all-zero scaled vector, or floating-point rounding can put a different
    # row at position 0, which would list the customer as its own lookalike.
    ranked = np.argsort(-similarity_matrix[i], kind='stable')
    similar_customers = ranked[ranked != i][:top_k]
    # float(...) stores a plain Python float so the tuple's text form in the
    # CSV is a bare number rather than a NumPy scalar repr.
    lookalikes[customer_ids[i]] = [
        (customer_ids[j], float(similarity_matrix[i, j])) for j in similar_customers
    ]

#save recommendations
# One row per target customer (index = its CustomerID); each SimilarN cell holds
# one (CustomerID, score) tuple, written to Lookalike.csv in the working directory.
lookalike_df = pd.DataFrame.from_dict(lookalikes, orient='index', columns=['Similar1', 'Similar2', 'Similar3'])
lookalike_df.to_csv('Lookalike.csv')

In [20]:
from IPython.display import FileLink

# Render a clickable link to Lookalike.csv (the file written by the previous
# cell, resolved relative to the notebook's working directory) so it can be
# downloaded from the notebook interface.
display(FileLink('Lookalike.csv'))