In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
!pip install node2vec --progress-bar off

In [None]:
# Load the customer table (it carries the 'cluster' column used below) from Excel.
vitrina_path = Path('/Users/diananigmatullina/Downloads/vitrina_clusters.xlsx')
vitrina = pd.read_excel(vitrina_path)

# Creating graph

In [None]:
from sklearn.neighbors import NearestNeighbors
import networkx as nx


# Build an undirected customer graph: one node per row of the customer table.
G = nx.Graph()
window = vitrina

# Register every customer (keyed by its DataFrame index label) together with its cluster label.
for customer_id, row in window.iterrows():
    G.add_node(customer_id, cluster=row['cluster'])

n_neighbors = 10
knn_feature_columns = ['Number of bills', 'Average bill',
                       'Average number of goods in bill',
                       'Revenue', 'Discount']

# Connect each customer to its nearest neighbours, one cluster at a time.
for cluster_id in window['cluster'].unique():
    # Index labels of the customers that belong to this cluster
    cluster_customers = window[window['cluster'] == cluster_id].index

    # A cluster smaller than n_neighbors cannot be queried for that many neighbours.
    if len(cluster_customers) < n_neighbors:
        print(f"Not enough customers in cluster {cluster_id} to find {n_neighbors} neighbors.")
        continue

    cluster_features = window.loc[cluster_customers, knn_feature_columns]

    # Euclidean k-nearest-neighbour search restricted to this cluster
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
    nbrs.fit(cluster_features)
    distances, indices = nbrs.kneighbors(cluster_features)

    # `indices` holds positions inside this cluster; translate them back to index
    # labels and skip the pair where a customer is matched with itself.
    G.add_edges_from(
        (customer_index, cluster_customers[neighbor_index])
        for position, customer_index in enumerate(cluster_customers)
        for neighbor_index in indices[position]
        if customer_index != cluster_customers[neighbor_index]
    )


In [None]:
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import random

# Sample a subset of nodes and edges.
# random.sample() requires a real sequence; the graph's node/edge views are set-like
# and are rejected with a TypeError on Python 3.11+, so materialise them as lists first.
all_nodes = list(G.nodes())
all_edges = list(G.edges())

# Cap the sample sizes so a small graph does not raise
# "Sample larger than population or is negative".
sampled_nodes = random.sample(all_nodes, k=min(1000, len(all_nodes)))
sampled_edges = random.sample(all_edges, k=min(2000, len(all_edges)))

# Create a new graph with the sampled nodes and edges.
# NOTE: add_edges_from() also inserts any edge endpoint that is not among
# sampled_nodes, so the result can hold more nodes than len(sampled_nodes).
subgraph = nx.Graph()
subgraph.add_nodes_from(sampled_nodes)
subgraph.add_edges_from(sampled_edges)

# Node2vec

In [None]:
# Configure node2vec on the sampled subgraph.
node2vec = Node2Vec(
    subgraph,
    dimensions=64,
    walk_length=30,
    num_walks=200,
    workers=4,
)

# fit() returns the trained model; its vectors are read through `model.wv` below.
model = node2vec.fit()

In [None]:
# Collect the learned vector for every node of the sampled subgraph.
#
# node2vec converts node ids to strings before training, so the vocabulary keys are
# str(node). Passing an integer to `model.wv` makes gensim interpret it as a positional
# index into the vocabulary, which silently returns another node's vector (or reports
# the node as missing when the id exceeds the vocabulary size). Look up by string key,
# but keep the original node id as the dictionary key so it still matches `window.index`.
node_embeddings = {}
for node in subgraph.nodes():
    key = str(node)
    if key in model.wv:
        node_embeddings[node] = model.wv[key]
    else:
        print(f"Node {node} is not present in the embeddings model.")

In [None]:
# Cluster labels come from the customer table, whose index labels are the graph node ids.
labels = window['cluster']

# Keep only the embeddings whose node id has a label, then pull the labels in the same order.
valid_nodes = set(labels.index)
filtered_node_embeddings = {}
for node, embedding in node_embeddings.items():
    if node in valid_nodes:
        filtered_node_embeddings[node] = embedding
filtered_labels = labels.loc[list(filtered_node_embeddings.keys())]

# Hold out 20% of the nodes for evaluation (fixed seed for a repeatable split).
embedding_rows = list(filtered_node_embeddings.values())
X_train, X_test, y_train, y_test = train_test_split(
    embedding_rows,
    filtered_labels,
    test_size=0.2,
    random_state=42,
)

# Logistic regression that predicts the cluster label from the node embedding
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Score the held-out nodes
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Step 4: Generate Recommendations (Revised)
def generate_recommendations(model, user_embeddings, items, k=10):
    """Rank candidate items for one user and return the ``k`` best.

    NOTE(review): a later cell defines another ``generate_recommendations`` with a
    different signature; once that cell runs it shadows this function.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        user_embeddings: Input row(s) accepted by ``model.predict_proba``; the caller
            passes a single ``(1, n_features)`` row. Only the first row is scored.
        items: Candidate item identifiers. ``items[j]`` is paired with the j-th
            probability column (assumes the columns line up with ``items`` -- TODO confirm).
        k: Maximum number of recommendations to return.

    Returns:
        List of ``(item, score)`` tuples ordered by descending score, at most ``k`` long.
    """
    # One score per candidate: the probability row of the (single) user.
    # The previous ``[:, 1]`` slice took the class-1 column across rows, i.e. exactly
    # one value for a one-row input, so zip() cut the ranking down to a single item.
    item_scores = model.predict_proba(user_embeddings)[0]

    # Highest score first; sorted() is stable, so ties keep the order of `items`.
    ranked_items = sorted(zip(items, item_scores), key=lambda pair: pair[1], reverse=True)

    return ranked_items[:k]


# Candidate items are the item columns of the customer table. The original code passed
# `range(num_items)`, but `num_items` is never defined (NameError), and integer positions
# can never match the column names collected as ground truth below.
# NOTE(review): assumes item columns start at positional index 7 -- confirm against the sheet.
candidate_items = list(window.columns[7:])

# NOTE(review): `classifier` was fit on cluster labels, so its probability columns are
# per cluster; pairing them with item columns is a stand-in -- confirm the intended scorer.

# Initialize lists to store recommendations and ground truth
all_recommendations = []
all_ground_truth = []

# Iterate over each user in the test set
for user_index, user_vector in enumerate(X_test):
    # predict_proba expects a 2-D input, so reshape the single embedding to one row
    user_embedding = np.asarray(user_vector).reshape(1, -1)
    recommendations = generate_recommendations(classifier, user_embedding, candidate_items, k=10)

    # Keep only the item identifiers, in ranked order
    all_recommendations.append([item for item, _ in recommendations])

    # Ground truth: item columns with a positive value for this customer
    ground_truth_labels = window.loc[y_test.index[user_index], candidate_items]
    ground_truth = [item for item, count in ground_truth_labels.items() if count > 0]
    all_ground_truth.append(ground_truth)

In [None]:
from sklearn.metrics import average_precision_score, ndcg_score

# Calculate Mean Average Precision (MAP)
average_precisions = []
for ground_truth, recommendations in zip(all_ground_truth, all_recommendations):
    relevant_items = set(ground_truth)
    y_true = [1 if item in relevant_items else 0 for item in recommendations]

    if not any(y_true):
        # No relevant item was recommended: AP is 0. Handling it here also avoids
        # calling average_precision_score without any positive label.
        average_precisions.append(0.0)
        continue

    # The score must encode the recommendation order (first item = highest score).
    # Re-using y_true as the score ranks every hit above every miss, which makes
    # AP trivially perfect regardless of the actual ranking.
    y_score = [len(recommendations) - position for position in range(len(recommendations))]
    average_precisions.append(average_precision_score(y_true, y_score))

# Guard against an empty evaluation set (np.mean([]) would return nan with a warning)
mean_average_precision = np.mean(average_precisions) if average_precisions else 0.0

# Print or use the calculated metrics as needed
print("Mean Average Precision (MAP):", mean_average_precision)


# DeepWalk

In [None]:
import networkx as nx
from gensim.models import Word2Vec

import random

def random_walk(graph, node, walk_length):
    """Generate one uniform random walk starting at ``node``.

    Args:
        graph: Object exposing ``neighbors(n)``, an iterable of the neighbours of ``n``.
        node: Start node; always the first element of the returned walk.
        walk_length: Maximum number of nodes in the walk, start node included.

    Returns:
        List of visited nodes. It is shorter than ``walk_length`` when the walk
        reaches a node that has no neighbours.
    """
    path = [node]
    steps_left = walk_length - 1
    while steps_left > 0:
        candidates = list(graph.neighbors(path[-1]))
        if not candidates:
            # Dead end: nothing to step to, so the walk stops early
            break
        path.append(random.choice(candidates))
        steps_left -= 1
    return path

# DeepWalk corpus: from every node, start `num_walks` random walks of at most
# `walk_length` nodes each.
num_walks = 10
walk_length = 80
G = subgraph  # NOTE: from here on `G` refers to the sampled subgraph, not the full graph
walks = [
    random_walk(G, node, walk_length)
    for node in G.nodes()
    for _ in range(num_walks)
]


Embedding

In [None]:
from gensim.models import Word2Vec

# Train Word2Vec on the random walks; each walk plays the role of a sentence.
embedding_size = 100
window_size = 10
model = Word2Vec(
    sentences=walks,
    vector_size=embedding_size,
    window=window_size,
    min_count=0,
    sg=1,
    workers=4,
)

# Map every graph node to its learned vector
node_embeddings = {}
for node in G.nodes():
    node_embeddings[node] = model.wv[node]

Random Forest (RF)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Model inputs: the per-customer columns listed below
features = ['Number of bills', 'Average bill', 'Average number of goods in bill', 'Revenue', 'Discount']
X = window[features]

# Target: 1 when the customer has a positive value in at least one of these item
# columns, otherwise 0
items_columns = [
    '%TN_Автотовары', '%TN_Аксессуары', '%TN_Детские товары', '%TN_Игры, софт и развлечения',
    '%TN_Хобби, досуг', '%TN_Цифровая Техника', '%TN_Элитная техника'
]
y = window[items_columns].gt(0).any(axis=1).astype(int)

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training part
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict the binary target for the held-out customers
y_pred = rf_classifier.predict(X_test)

# Report accuracy followed by the per-class report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))

Building recommendations

In [None]:
# Assuming you have already defined and trained your random forest classifier (rf_classifier) and generated node embeddings and node order

def make_predictions_per_customer(node_embeddings, rf_classifier, node_order):
    """Run the classifier on the embedding of every requested node.

    Args:
        node_embeddings: Mapping from node to its embedding vector.
        rf_classifier: Fitted classifier exposing ``predict_proba``.
        node_order: Iterable of nodes to score; result keys follow this order.

    Returns:
        Dict mapping each node in ``node_order`` to the ``predict_proba`` output for its
        embedding (passed as a single-row 2-D array), or to ``None`` when the node has
        no embedding (a warning is printed in that case).
    """
    results = {}
    for node in node_order:
        if node not in node_embeddings:
            print(f"Warning: No embedding found for node {node}")
            results[node] = None
            continue
        # predict_proba works on 2-D input, so present the vector as one row
        row = np.reshape(node_embeddings[node], (1, -1))
        results[node] = rf_classifier.predict_proba(row)
    return results

# Step 4: Generate Recommendations
def generate_recommendations(predictions_per_customer, item_names, top_n=5):
    """Turn per-customer probability rows into top-N ``(item, probability)`` lists.

    NOTE: this definition replaces the earlier ``generate_recommendations`` in this
    notebook, which takes different arguments.

    Args:
        predictions_per_customer: Mapping from node to a 2-D probability array (only
            its first row is used) or ``None``.
        item_names: Sequence indexed by probability-column position.
        top_n: Maximum number of recommendations per customer.

    Returns:
        Dict mapping each node to a list of ``(item_name, probability)`` tuples in
        descending probability order, or to ``None`` when the node has no predictions.
    """
    ranked = {}
    for node, predictions in predictions_per_customer.items():
        if predictions is None:
            ranked[node] = None
            continue
        scores = predictions[0]
        # argsort is ascending; reverse it and keep the first top_n positions
        best_first = np.argsort(scores)[::-1][:top_n]
        ranked[node] = [(item_names[position], scores[position]) for position in best_first]
    return ranked

# Score the subgraph's customers in a deterministic (sorted) order
node_order = sorted(subgraph.nodes())

# NOTE(review): rf_classifier was fit on the 5 tabular feature columns, while
# node_embeddings holds 100-dimensional walk embeddings. The classifier probably has to
# be refit on embeddings before this call can succeed -- confirm.
predictions_per_customer = make_predictions_per_customer(node_embeddings, rf_classifier, node_order)

# `item_names` is not defined in any visible cell (NameError). The columns returned by
# predict_proba follow rf_classifier.classes_, so label each probability with its class.
item_names = list(rf_classifier.classes_)

recommendations_per_customer = generate_recommendations(predictions_per_customer, item_names, top_n=5)

# Print recommendations for each customer
for node, recommendations in recommendations_per_customer.items():
    if recommendations is None:
        print(f"No recommendations available for customer {node}")
        continue
    print(f"Recommendations for customer {node}:")
    for rank, (item, probability) in enumerate(recommendations, start=1):
        print(f"{rank}. {item} (Probability: {probability:.4f})")


In [None]:
# Per-customer relevance flags and recommended items, aligned by position
relevance_scores = []
recommended_items = []

# Define the number of recommendations to evaluate for each customer
num_recommendations = 3

# Iterate over each customer
for customer_id in subgraph.nodes():
    # Re-use the recommendations computed above. The previous call
    # generate_recommendations(customer_id, num_recommendations) matched neither
    # definition of that function in this notebook.
    customer_recommendations = recommendations_per_customer.get(customer_id)
    if not customer_recommendations:
        # Nothing to evaluate; the metric cells treat an empty list as a score of 0
        relevance_scores.append([])
        recommended_items.append([])
        continue

    top_items = [item for item, _ in customer_recommendations[:num_recommendations]]

    # Ground truth: names of the item columns with a positive value for this customer.
    # Graph nodes are `window` index labels (they were added from window.iterrows()),
    # so the row is looked up by label.
    purchases = window.loc[customer_id, items_columns]
    ground_truth = set(purchases[purchases > 0].index)

    # Relevance of THIS customer's recommendations, in ranked order. The original
    # compared against the accumulated list of all previous customers' lists.
    relevance = [1 if item in ground_truth else 0 for item in top_items]
    relevance_scores.append(relevance)
    recommended_items.append(top_items)

In [None]:
# Compute Mean Average Precision (MAP)
from sklearn.metrics import average_precision_score, ndcg_score

maps = []
for relevance in relevance_scores:
    if any(relevance):
        # Assumes each relevance list is ordered best recommendation first, so the
        # ranking score must DECREASE with position. The previous ascending
        # range(1, n + 1) scored the last item highest, i.e. evaluated the reverse ranking.
        rank_scores = list(range(len(relevance), 0, -1))
        maps.append(average_precision_score(relevance, rank_scores))
    else:
        # Empty list or no relevant item: average precision is 0
        maps.append(0)

# Avoid ZeroDivisionError when there is nothing to evaluate
mean_map = sum(maps) / len(maps) if maps else 0.0

In [None]:
# Report the MAP averaged over all evaluated customers
print("Mean Average Precision (MAP):", mean_map)

In [None]:
# Compute Normalized Discounted Cumulative Gain (NDCG)
# Re-import here so the cell still works after an earlier run clobbered the name.
from sklearn.metrics import ndcg_score

ndcgs = []
for relevance in relevance_scores:
    if not any(relevance):
        # Empty list or no relevant item: NDCG is 0
        ndcgs.append(0)
    elif len(relevance) == 1:
        # A single relevant document is trivially a perfect ranking; handled here
        # because ndcg_score may reject single-document input.
        ndcgs.append(1.0)
    else:
        # Assumes the list is ordered best recommendation first, so scores must
        # decrease with position (the previous ascending range reversed the ranking).
        rank_scores = list(range(len(relevance), 0, -1))
        # Keep the result in its own name: assigning to `ndcg_score` overwrote the
        # imported function, so the second iteration tried to call a float.
        ndcg_value = ndcg_score([relevance], [rank_scores])
        ndcgs.append(ndcg_value)

# Avoid ZeroDivisionError when there is nothing to evaluate
mean_ndcg = sum(ndcgs) / len(ndcgs) if ndcgs else 0.0
print("Mean Normalized Discounted Cumulative Gain (NDCG):", mean_ndcg)