**Assignment Tasks:**

**Task - 2:**  Lookalike Model

In [2]:
from google.colab import files
import pandas as pd
import io

def _find_uploaded_name(uploaded_files, name_fragment):
    """Return the first uploaded file name that contains ``name_fragment``.

    Parameters
    ----------
    uploaded_files : mapping of str -> bytes
        The mapping returned by ``files.upload()`` (file name -> raw content).
    name_fragment : str
        Substring that identifies the wanted file, e.g. ``'Customers'``.

    Raises
    ------
    FileNotFoundError
        If no uploaded file name contains ``name_fragment``. This replaces the
        opaque ``IndexError: list index out of range`` that indexing an empty
        match list would produce.
    """
    matches = [name for name in uploaded_files if name_fragment in name]
    if not matches:
        raise FileNotFoundError(
            f"No uploaded file name contains {name_fragment!r}; "
            f"uploaded files: {sorted(uploaded_files)}"
        )
    # First match wins, in upload order.
    return matches[0]


# Interactive step: the user picks the three CSV files to upload.
uploaded = files.upload()

print(uploaded.keys())


customer_file = _find_uploaded_name(uploaded, 'Customers')
product_file = _find_uploaded_name(uploaded, 'Products')
transaction_file = _find_uploaded_name(uploaded, 'Transactions')

# Each uploaded value is the file's content, wrapped in a buffer so pandas
# can parse it like a file.
customers = pd.read_csv(io.BytesIO(uploaded[customer_file]))
products = pd.read_csv(io.BytesIO(uploaded[product_file]))
transactions = pd.read_csv(io.BytesIO(uploaded[transaction_file]))

# Displays first few rows
customers.head(), products.head(), transactions.head()

Saving Transactions.csv to Transactions.csv
Saving Products.csv to Products.csv
Saving Customers.csv to Customers.csv
dict_keys(['Transactions.csv', 'Products.csv', 'Customers.csv'])


(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067   2024-04-25 7:38:55    

In [3]:
import pandas as pd


# 1. Checking for Duplicate Values
# Count fully duplicated rows in each of the three tables.
for label, frame in (
    ("Duplicate values in Customers:", customers),
    ("Duplicate values in Products:", products),
    ("Duplicate values in Transactions:", transactions),
):
    print(label, frame.duplicated().sum())


# 2. Checking for Null Values
# Per-column count of missing entries in each table.
for label, frame in (
    ("\nNull values in Customers:\n", customers),
    ("\nNull values in Products:\n", products),
    ("\nNull values in Transactions:\n", transactions),
):
    print(label, frame.isnull().sum())

# Drop (in place) any product row that has no ProductName.
products.dropna(subset=['ProductName'], inplace=True)

Duplicate values in Customers: 0
Duplicate values in Products: 0
Duplicate values in Transactions: 0

Null values in Customers:
 CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64

Null values in Products:
 ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64

Null values in Transactions:
 TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# 1. Feature construction:
# Frequency table: one row per CustomerID, one column per ProductID, each cell the
# number of transaction rows for that pair (the Quantity column is not used).
customer_product_matrix = pd.crosstab(transactions['CustomerID'], transactions['ProductID'])

customer_features = customers[['CustomerID', 'CustomerName', 'Region', 'SignupDate']]
customer_product_matrix = customer_product_matrix.merge(customer_features, on='CustomerID', how='left')

customer_product_matrix = customer_product_matrix.fillna(0)

# Setting 'CustomerID' as index
customer_product_matrix = customer_product_matrix.set_index('CustomerID')
# Only numeric columns feed the model. CustomerName, Region and SignupDate are
# loaded as text, so they are dropped here and similarity is based on the
# per-product purchase counts alone.
numeric_features = customer_product_matrix.select_dtypes(include=['number'])

# 2. Data Scaling:
# The scaler must be created in this cell: it was previously referenced without
# ever being defined, which raises NameError on a fresh kernel (Restart & Run All).
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_features.values)
scaled_df = pd.DataFrame(scaled_data, index=numeric_features.index, columns=numeric_features.columns)

# 3. Similarity Calculation:
# Square customer-by-customer matrix, labelled with CustomerID on both axes.
similarity_matrix = cosine_similarity(scaled_df)
similarity_df = pd.DataFrame(similarity_matrix, index=scaled_df.index, columns=scaled_df.index)

# 4. Lookalike Recommendation:

def get_top_lookalikes(customer_id, similarity_df, top_n=3):
    """
    Pick the customers most similar to ``customer_id``.

    Args:
        customer_id: Row label to look up in ``similarity_df``.
        similarity_df: Square DataFrame of pairwise similarity scores whose
            index and columns carry the same customer labels.
        top_n: Maximum number of lookalikes to return (default 3).

    Returns:
        A Series of similarity scores indexed by customer label, highest
        first, with ``customer_id`` itself removed.

    Raises:
        KeyError: If ``customer_id`` is not a label of ``similarity_df``.
    """
    ranked = similarity_df.loc[customer_id].sort_values(ascending=False)
    # A customer is not its own lookalike, so its entry is removed before truncating.
    return ranked.drop(customer_id).head(top_n)


# Maps each target customer ID to a list of {'customer_id', 'score'} dicts,
# ordered from most to least similar.
lookalike_results = {}
target_customers = customers['CustomerID'].head(20).tolist()  # Get the first 20 customer IDs
for customer_id in target_customers:
    # similarity_df only has rows for customers that appear in the transactions
    # table; a customer with no purchases gets an empty list instead of a KeyError.
    if customer_id not in similarity_df.index:
        lookalike_results[customer_id] = []
        continue
    lookalikes = get_top_lookalikes(customer_id, similarity_df)
    # Cast to a built-in float: the list is later serialised via str(), and a
    # NumPy scalar would be written as "np.float64(...)" under NumPy 2.x.
    lookalike_results[customer_id] = [
        {'customer_id': cust_id, 'score': float(score)}
        for cust_id, score in lookalikes.items()
    ]

# 5. Creating Lookalike.csv:

import csv

# One output row per target customer: its ID, then the Python list of its
# lookalike dicts (csv.writer stores the list as its string form in one cell).
with open('Lookalike.csv', 'w', newline='') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(['cust_id', 'lookalikes'])
    csv_out.writerows(
        [target_id, matches] for target_id, matches in lookalike_results.items()
    )

print("Lookalike model built and recommendations saved to Lookalike.csv")

Lookalike model built and recommendations saved to Lookalike.csv


In [11]:
import pandas as pd

# Load the exported recommendations back from disk as a sanity check.
lookalike_df = pd.read_csv(filepath_or_buffer='Lookalike.csv')

# Show the frame as plain text.
print(lookalike_df)

   cust_id                                         lookalikes
0    C0001  [{'customer_id': 'C0097', 'score': 0.392138445...
1    C0002  [{'customer_id': 'C0109', 'score': 0.383668663...
2    C0003  [{'customer_id': 'C0181', 'score': 0.408098377...
3    C0004  [{'customer_id': 'C0053', 'score': 0.361517044...
4    C0005  [{'customer_id': 'C0096', 'score': 0.427318363...
5    C0006  [{'customer_id': 'C0171', 'score': 0.462140894...
6    C0007  [{'customer_id': 'C0020', 'score': 0.490301040...
7    C0008  [{'customer_id': 'C0091', 'score': 0.317707103...
8    C0009  [{'customer_id': 'C0083', 'score': 0.559124507...
9    C0010  [{'customer_id': 'C0094', 'score': 0.466910188...
10   C0011  [{'customer_id': 'C0170', 'score': 0.449673621...
11   C0012  [{'customer_id': 'C0128', 'score': 0.429763248...
12   C0013  [{'customer_id': 'C0141', 'score': 0.321225913...
13   C0014  [{'customer_id': 'C0128', 'score': 0.736704617...
14   C0015  [{'customer_id': 'C0073', 'score': 0.533739806...
15   C00

In [12]:
from google.colab import files

# Hand the generated Lookalike.csv to google.colab's download helper
# (presumably triggers a browser download — confirm against the Colab docs).
files.download('Lookalike.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>