# In [8]:
import pandas as pd
import itertools
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split



# --- Build one numeric feature row per customer ------------------------------
# Inputs are two CSV files in the working directory. Only the columns
# referenced below are required: CustomerID, CustomerName, Region, SignupDate
# (customers) and CustomerID, ProductID, Quantity, TotalValue (transactions).
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Inner join: only customers with at least one transaction survive.
customer_transactions = transactions.merge(customers, on="CustomerID", how="inner")

# Per-customer purchase aggregates (output columns keep the source names).
customer_features = (
    customer_transactions
    .groupby("CustomerID")
    .agg(
        Quantity=("Quantity", "sum"),        # total units bought
        TotalValue=("TotalValue", "sum"),    # total spend
        ProductID=("ProductID", "nunique"),  # count of distinct products
    )
    .reset_index()
)

# Attach the profile columns (everything in customers except the name).
customer_profiles = customers.drop(columns=["CustomerName"]).set_index("CustomerID")
customer_features = pd.merge(customer_features, customer_profiles, on="CustomerID", how="left")

# Replace the signup date with an integer age in days.
# NOTE: measured against the current clock, so the value changes between runs.
signup_dates = pd.to_datetime(customer_features["SignupDate"])
customer_features = customer_features.assign(
    DaysSinceSignup=(pd.Timestamp.now() - signup_dates).dt.days
).drop(columns=["SignupDate"])

# Region -> integer code (codes follow LabelEncoder's sorted class order).
le_region = LabelEncoder().fit(customer_features["Region"])
customer_features["Region"] = le_region.transform(customer_features["Region"])

# Zero-mean / unit-variance copy of the numeric features.
# NOTE(review): the visible code below builds its vectors from the unscaled
# `customer_features`, not from `scaled_features` -- confirm whether that is
# intended before relying on this array.
numeric_features = customer_features.drop(columns=["CustomerID"])
scaler = StandardScaler().fit(numeric_features)
scaled_features = scaler.transform(numeric_features)

# Create pairwise customer dataset
#
# One row per unordered customer pair (in itertools.combinations order):
#   Customer1, Customer2  - the two customer IDs
#   Similarity            - heuristic 0/1 training label (see below)
#   FeatureDiff_<i>       - absolute difference of the i-th feature column
customer_ids = customer_features["CustomerID"].tolist()
pairs = list(itertools.combinations(customer_ids, 2))

# Pull the numeric features out of the DataFrame once. Row k of this matrix
# belongs to customer_ids[k], so each pair is resolved by position instead of
# re-filtering the whole frame with a boolean mask for every pair.
# Assumes each CustomerID appears in exactly one row of customer_features.
feature_matrix = customer_features.drop(columns="CustomerID").to_numpy()
row_pairs = itertools.combinations(range(len(customer_ids)), 2)

pairwise_data = []
for (c1, c2), (row1, row2) in zip(pairs, row_pairs):
    vec1 = feature_matrix[row1]
    vec2 = feature_matrix[row2]
    feature_diff = abs(vec1 - vec2)  # Feature difference between two customers using vectors

    # Heuristic label: the pair counts as similar when the two feature sums
    # differ by less than 10% of the first customer's sum.
    # NOTE(review): the threshold is relative to Customer1 only, so the label
    # is not symmetric in (c1, c2); kept as-is to preserve the training data.
    total1 = vec1.sum()
    total2 = vec2.sum()
    pairwise_data.append({
        "Customer1": c1,
        "Customer2": c2,
        "Similarity": 1 if abs(total1 - total2) < 0.1 * total1 else 0,
        **{f"FeatureDiff_{i}": diff for i, diff in enumerate(feature_diff)},
    })

pairwise_df = pd.DataFrame(pairwise_data)

# Separate the model inputs (the FeatureDiff_* columns) from the 0/1 label;
# the two ID columns are bookkeeping only and are excluded from X.
non_feature_columns = ["Customer1", "Customer2", "Similarity"]
y = pairwise_df["Similarity"]
X = pairwise_df.drop(columns=non_feature_columns)

# Hold out 20% of the pairs; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training pairs.
# NOTE: X_test / y_test are not evaluated anywhere in the visible code.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Lookalike recommendations: for each of the first 20 customers, score every
# other customer with the trained model and keep the 3 highest-scoring ones.
# Result shape: {customer_id: [(other_id, score), ...]} with score rounded to
# 4 decimals and ties kept in customer_ids order (sorted() is stable).
recommendations = {}

# Extract the feature matrix once and map each ID to its row, instead of
# filtering the DataFrame with a boolean mask for every (customer, other) pair.
# Assumes each CustomerID appears in exactly one row of customer_features.
feature_matrix = customer_features.drop(columns="CustomerID").to_numpy()
row_of = {cid: row for row, cid in enumerate(customer_features["CustomerID"])}

# predict_proba has one column per class seen during fit, ordered like
# rf_model.classes_. Look up the column for label 1 explicitly so a model
# trained on a single class does not fail with an IndexError.
model_classes = list(rf_model.classes_)
positive_column = model_classes.index(1) if 1 in model_classes else None

for customer_id in customer_ids[:20]:
    other_ids = [other_id for other_id in customer_ids if other_id != customer_id]
    if not other_ids:
        # Nobody to compare against; same outcome as an empty candidate list.
        recommendations[customer_id] = []
        continue

    # Absolute feature differences against every other customer, one row each.
    other_rows = [row_of[other_id] for other_id in other_ids]
    feature_diff = abs(feature_matrix[other_rows] - feature_matrix[row_of[customer_id]])
    # Create a DataFrame for feature_diff with matching column names
    feature_diff_df = pd.DataFrame(feature_diff, columns=X.columns)

    # Probability of being similar, for all candidates in one batched call.
    if positive_column is None:
        # The model never saw a "similar" pair, so that probability is 0.
        scores = [0.0] * len(other_ids)
    else:
        scores = rf_model.predict_proba(feature_diff_df)[:, positive_column]

    # Convert to built-in float so that printing the tuples later yields plain
    # numbers rather than NumPy scalar reprs such as "np.float64(0.85)".
    similarities = [
        (other_id, round(float(score), 4))
        for other_id, score in zip(other_ids, scores)
    ]

    # Sort by similarity score and select the top 3
    top_3 = sorted(similarities, key=lambda x: x[1], reverse=True)[:3]
    recommendations[customer_id] = top_3

# Write the recommendations to Lookalike.csv: one row per customer, with the
# list of (other_id, score) tuples stored as its string representation.
recommendation_rows = [
    (customer, str(lookalikes))
    for customer, lookalikes in recommendations.items()
]
recommendations_df = pd.DataFrame(
    recommendation_rows,
    columns=["CustomerID", "Lookalikes"],
)

# index=False: the positional index carries no information worth saving.
recommendations_df.to_csv("Lookalike.csv", index=False)

print("Lookalike recommendations CSV generated!")


# Output: Lookalike recommendations CSV generated!
