In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Directory holding the input CSVs. Kept in one constant so the notebook can be
# pointed at another location by editing a single line.
# NOTE(review): the path looks like a Colab Google Drive mount - confirm.
DATA_DIR = '/content/drive/MyDrive/zeotap-data'

# Load the raw customer and transaction tables.
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [None]:
# Show the column names of the customers table.
customers.columns

Index(['CustomerID', 'CustomerName', 'Region', 'SignupDate'], dtype='object')

In [None]:
# Show the column names of the transactions table.
transactions.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price'],
      dtype='object')

In [None]:
# One-hot encode the Region column with scikit-learn and append the resulting
# indicator columns to `customers`.
# NOTE(review): a later cell re-reads `customers` from the CSV, so the columns
# added here do not survive past that point.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_region = encoder.fit_transform(customers[['Region']])
encoded_region_df = pd.DataFrame(
    encoded_region,
    columns=encoder.get_feature_names_out(['Region']),
    # Reuse the customers index: concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows if `customers` were ever re-indexed.
    index=customers.index,
)
# Drop indicator columns left over from a previous run of this cell so that
# re-running it does not append duplicate Region_* columns.
customers = pd.concat(
    [customers.drop(columns=encoded_region_df.columns, errors='ignore'), encoded_region_df],
    axis=1,
)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer

# Re-read both CSVs, replacing any earlier in-memory `customers` / `transactions`.
customers = pd.read_csv('/content/drive/MyDrive/zeotap-data/Customers.csv')
transactions = pd.read_csv('/content/drive/MyDrive/zeotap-data/Transactions.csv')

# Per-customer summary of TotalValue: sum, count of non-null values, and mean.
agg_transactions = transactions.groupby('CustomerID', as_index=False).agg(
    total_spending=('TotalValue', 'sum'),
    purchase_frequency=('TotalValue', 'count'),
    avg_transaction_value=('TotalValue', 'mean'),
)

# Left join keeps every customer; those without transactions get NaN aggregates.
customers = customers.merge(agg_transactions, how='left', on='CustomerID')


In [None]:
# Parse the signup date once and derive month / year features from it.
signup_dates = pd.to_datetime(customers['SignupDate'])
customers['SignupDate'] = signup_dates
customers['SignupMonth'] = signup_dates.dt.month
customers['SignupYear'] = signup_dates.dt.year

# Replace missing transaction aggregates with the per-column median.
# (Mean imputation and StandardScaler / RobustScaler scaling were tried here
# earlier and are currently disabled.)
spend_cols = ['total_spending', 'purchase_frequency', 'avg_transaction_value']
imputer = SimpleImputer(strategy='median')
customers[spend_cols] = imputer.fit_transform(customers[spend_cols])

In [None]:
# --- Lookalike model: for each customer, the 3 most similar other customers
# --- by cosine similarity over the numeric feature columns.

# One-hot encode Region. dtype=float keeps the indicators numeric: recent pandas
# versions emit bool dummies by default, and select_dtypes(include=['number'])
# below would then silently drop them from the feature matrix.
# The guard lets this cell be re-run after 'Region' has already been encoded.
if 'Region' in customers.columns:
    customers = pd.get_dummies(customers, columns=['Region'], prefix=['Region'], dtype=float)

numerical_features = customers.drop(columns=['CustomerID', 'CustomerName', 'SignupDate']).select_dtypes(include=['number'])
numerical_features = numerical_features.fillna(numerical_features.mean())
# NOTE(review): features are used unscaled, so large-magnitude columns dominate
# the cosine similarity - consider standardising before this step.
similarity_matrix = cosine_similarity(numerical_features)

# Positional array of ids; rows of similarity_matrix are positional as well, so
# this avoids mixing positions with index labels.
customer_ids = customers['CustomerID'].to_numpy()

lookalike_map = {}
for idx, customer_id in enumerate(customer_ids):
    scores = similarity_matrix[idx]
    # Stable descending order (ties keep row order). The customer's own row is
    # removed explicitly instead of assuming it sorts first: a tie with another
    # row could otherwise list the customer as their own lookalike.
    ranked = [i for i in np.argsort(-scores, kind='stable') if i != idx][:3]
    # float() keeps plain Python numbers in the list so the CSV text does not
    # depend on NumPy's scalar repr.
    lookalike_map[customer_id] = [(customer_ids[i], round(float(scores[i]), 2)) for i in ranked]

lookalike_df = pd.DataFrame([
    {'Customer_id': cust_id, 'Lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_map.items()
])

# Only the first 20 customer ids (C0001..C0020) are written to the output file.
lookalike_df = lookalike_df[lookalike_df['Customer_id'].isin([f"C{i:04}" for i in range(1, 21)])]

lookalike_df.to_csv("Lookalike.csv", index=False)
print("Lookalike.csv has been created successfully!")


Lookalike.csv has been created successfully!


In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
# from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Clustering feature matrix: every column except the identifiers and the raw
# signup date.
# NOTE(review): features are not scaled before K-Means - confirm this is intended.
X = customers.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])

# Fill gaps with each column's median. A second mean-based fill would be a
# no-op (any column whose median is NaN also has a NaN mean), so it is omitted.
X = X.fillna(X.median())

# Last expression of the cell, so the remaining null count per column is displayed.
X.isnull().sum()

In [None]:
# Fit K-Means with four clusters on X (random_state fixed at 42) and keep the
# per-row cluster assignment.
n_clusters = 4
kmeans = KMeans(random_state=42, n_clusters=n_clusters)
kmeans.fit(X)
customers_labels = kmeans.labels_

In [None]:
# Silhouette score of the K-Means labelling computed over X.
silhoutte = silhouette_score(X, labels=customers_labels)
print(silhoutte)

0.49650969937144607
