In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import networkx as nx
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.image as mpimg
import random

In [None]:
# Workbook holding the `cluster`, `Phone_new` and `%TN_*` columns used below.
# NOTE(review): '/content/...' looks like a Colab-local path - confirm before running elsewhere.
DATA_PATH = '/content/vitrina_clusters.xlsx'
dataset = pd.read_excel(DATA_PATH)
# For quick experiments, restrict the frame to its first 400 rows:
# dataset = dataset[:400]

In [None]:
# Preview the first five rows as a sanity check on the load.
dataset.head(n=5)

In [None]:
# Number of rows per cluster label, largest cluster first.
cluster_sizes = dataset["cluster"].value_counts()
cluster_sizes

cluster
0    75394
5    12730
3      697
4       60
2        6
1        1
Name: count, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split


def _split_cluster(frame, cluster_id):
    """Return (rows of `cluster_id`, train part, test part) for an 80/20 split with seed 42."""
    subset = frame[frame['cluster'] == cluster_id]
    train_part, test_part = train_test_split(subset, test_size=0.2, random_state=42)
    return subset, train_part, test_part


# Naming: dfN / XN_* hold cluster label N-1 (df1 -> cluster 0, ..., df6 -> cluster 5).
# Clusters 1 and 2 (df2 / df3) are not split; the value counts printed above list
# only 1 and 6 rows for them.
df1, X1_train, X1_test = _split_cluster(dataset, 0)
df4, X4_train, X4_test = _split_cluster(dataset, 3)
df5, X5_train, X5_test = _split_cluster(dataset, 4)
df6, X6_train, X6_test = _split_cluster(dataset, 5)

In [None]:
def creating_graph(dataset, items=None, test_user_id="000-000"):
    """Build a weighted bipartite user-item graph from a purchases frame.

    Node layout (integer node ids, in insertion order):
      * 0                      - placeholder user for the test data; it gets no edges here
      * 1 .. len(items)        - one node per item column
      * len(items)+1 onwards   - one node per row of `dataset`

    Every row is connected to every item node, with the cell value stored as the
    edge attribute ``weight`` (zero-valued cells still produce an edge).
    NOTE(review): the random-walk code in this notebook never reads edge weights,
    so zero-weight edges are walked like any other - confirm this is intended.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``Phone_new`` column (user identifier) and one column per
        entry of `items`.
    items : sequence of str, optional
        Item column names. Defaults to the 16 ``%TN_*`` category columns.
    test_user_id : hashable, optional
        Key under which the placeholder node 0 is registered in the mapping.

    Returns
    -------
    (networkx.Graph, dict)
        The graph and a mapping from user id / item name to node id. If a user id
        occurs in several rows, the mapping keeps the node of the last such row.
    """
    if items is None:
        items = ['%TN_Автотовары', '%TN_Аксессуары', '%TN_Детские товары', '%TN_Игры, софт и развлечения', '%TN_Климат', '%TN_Крупная бытовая техника', '%TN_Мебель', '%TN_Мелкая бытовая техника', '%TN_Сделай сам', '%TN_Спорт и активный отдых', '%TN_ТВ-Аудио', '%TN_Товары для дома', '%TN_Услуги', '%TN_Хобби, досуг', '%TN_Цифровая Техника', '%TN_Элитная техника']
    items = list(items)

    G = nx.Graph()
    node2index = {}

    # Vertex for the test data set.
    G.add_node(0, type='user')
    node2index[test_user_id] = 0

    for item_node, item in enumerate(items, start=1):
        G.add_node(item_node, type='item')
        node2index[item] = item_node
    item_nodes = [node2index[item] for item in items]

    # Pull ids and weights out column-wise instead of using iterrows(): iterrows
    # builds one Series per row and upcasts the whole row to a common dtype, which
    # can silently change the type of the user id.
    user_ids = dataset['Phone_new'].tolist()
    weight_rows = dataset[items].to_numpy()

    first_user_node = len(items) + 1
    for user_node, (user_id, row_weights) in enumerate(zip(user_ids, weight_rows), start=first_user_node):
        G.add_node(user_node, type='user')
        node2index[user_id] = user_node
        G.add_weighted_edges_from(
            (user_node, item_node, weight)
            for item_node, weight in zip(item_nodes, row_weights)
        )

    return G, node2index


In [None]:
def biased_random_walk(G, start_node, walk_length, p=1, q=1):
    """Sample one node2vec-style second-order random walk.

    Unnormalised transition weights from the current node to each neighbour are:
      * ``1/p`` if the neighbour is the previously visited node (return),
      * ``1``   if the neighbour is also adjacent to the previous node (stay close),
      * ``1/q`` otherwise (move outward).
    The first step is uniform over the neighbours. Edge weights are not used.

    All randomness comes from the standard-library ``random`` module, so
    ``random.seed(...)`` alone makes a walk reproducible, and the returned nodes
    are the graph's own node objects (no numpy scalar/string coercion).

    Parameters
    ----------
    G : graph-like
        Must provide ``neighbors(node)`` and ``has_edge(u, v)``.
    start_node : hashable
        Node the walk starts from.
    walk_length : int
        Maximum number of nodes in the walk, including `start_node`.
    p, q : float
        Return and in-out parameters; both must be positive.

    Returns
    -------
    list
        Visited nodes. Shorter than `walk_length` when a node without neighbours
        is reached; always contains at least `start_node`.

    Raises
    ------
    ValueError
        If `p` or `q` is not positive.
    """
    if p <= 0 or q <= 0:
        raise ValueError("p and q must be positive")

    walk = [start_node]

    while len(walk) < walk_length:
        cur_node = walk[-1]
        cur_neighbors = list(G.neighbors(cur_node))

        if not cur_neighbors:
            # Dead end: nothing to step to.
            break

        if len(walk) == 1:
            # No previous node yet, so the first step is unbiased.
            walk.append(random.choice(cur_neighbors))
            continue

        prev_node = walk[-2]
        weights = []
        for neighbor in cur_neighbors:
            if neighbor == prev_node:
                # Return parameter
                weights.append(1 / p)
            elif G.has_edge(neighbor, prev_node):
                # Stay parameter
                weights.append(1)
            else:
                # In-out parameter
                weights.append(1 / q)

        # random.choices normalises the weights itself and returns the original object.
        walk.append(random.choices(cur_neighbors, weights=weights, k=1)[0])

    return walk

In [None]:
def generate_walks(G, num_walks, walk_length, p=1, q=1):
    """Collect `num_walks` biased random walks starting from every node of `G`.

    The node order is reshuffled (with the ``random`` module) before each round,
    and one walk per node is sampled with ``biased_random_walk``.

    Returns a list with ``num_walks * number_of_nodes`` walks, each a list of nodes.
    """
    node_order = list(G.nodes())
    all_walks = []
    for _round in range(num_walks):
        # New visiting order for every round.
        random.shuffle(node_order)
        all_walks.extend(
            biased_random_walk(G, start, walk_length, p, q) for start in node_order
        )
    return all_walks

In [None]:
#!pip install pygsp

In [None]:
def creating_model(G, num_walks=10, walk_length=20, p=9, q=1,
                   vector_size=128, window=5, epochs=10, workers=4):
    """Train a skip-gram Word2Vec model on random walks over `G`.

    Walks are generated with ``generate_walks`` and every node is converted to its
    string form, so the model vocabulary consists of ``str(node)`` keys.

    All walks are used, including very short ones: a node without neighbours
    yields a one-element walk, and ``min_count=0`` keeps it in the vocabulary
    (its vector is then not informed by any context).

    Parameters
    ----------
    G : networkx.Graph
        Graph to embed.
    num_walks, walk_length, p, q
        Passed to ``generate_walks``.
    vector_size, window, epochs, workers
        Passed to ``gensim.models.Word2Vec``.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model; vectors are available as ``model.wv[str(node)]``.
    """
    walks = generate_walks(G, num_walks=num_walks, walk_length=walk_length, p=p, q=q)

    # Word2Vec expects sentences of string tokens.
    sentences = [[str(node) for node in walk] for walk in walks]

    # Skip-gram (sg=1) with hierarchical softmax (hs=1); min_count=0 keeps every node.
    return Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=0,
        hs=1,
        sg=1,
        workers=workers,
        epochs=epochs,
    )

In [None]:
# One user-item graph (plus its name -> node-id mapping) per training split.
(G1, node_index1), (G4, node_index4), (G5, node_index5), (G6, node_index6) = [
    creating_graph(train_part)
    for train_part in (X1_train, X4_train, X5_train, X6_train)
]

In [None]:
# Only the cluster-5 graph (G6) is embedded in this run.
# To embed another cluster, call creating_model on G1, G4 or G5 in the same way
# (model1 / model4 / model5).
model6 = creating_model(G=G6)

In [None]:
# Word2Vec keys are the stringified node ids ("0", "1", ...), as produced by creating_model.
# embeddings1 / embeddings4 / embeddings5 can be built the same way once
# model1 / model4 / model5 have been trained.
index_embeddings6 = {node_id: model6.wv[node_id] for node_id in model6.wv.index_to_key}

# calculate_similarities looks item vectors up by column name (e.g. '%TN_...'),
# while get_user_embedding uses str(user_id). Register every original name from
# node_index6 as an alias of its node-id key so both kinds of lookup succeed.
# setdefault keeps the node-id key if a name happens to equal one.
embeddings6 = dict(index_embeddings6)
for node_name, node_idx in node_index6.items():
    index_key = str(node_idx)
    if index_key in index_embeddings6:
        embeddings6.setdefault(node_name, index_embeddings6[index_key])

In [None]:
def get_user_embedding(user_id, embeddings):
    """Look up the vector stored under ``str(user_id)``.

    Raises KeyError when the stringified id is not a key of `embeddings`.
    """
    key = str(user_id)
    return embeddings[key]

In [None]:
def get_rated_items(user_id, df, item, items):
    """Return the `item` column for the rows whose ``Phone_new`` equals `user_id`.

    `items` is accepted for signature compatibility and is not used.
    The result is a pandas Series (empty when the user has no rows).
    """
    is_user = df['Phone_new'] == user_id
    return df.loc[is_user, item]

In [None]:
def calculate_similarities(user_id, df, embeddings, items):
    """Cosine similarity between a user's embedding and every item not yet rated.

    An item counts as already rated when any row of `df` for this user has a
    positive value in that item's column; such items are left out of the result.
    A user with no rows in `df` therefore gets a score for every item.

    Parameters
    ----------
    user_id
        Compared against ``df['Phone_new']``; its embedding is read via
        ``get_user_embedding`` (key ``str(user_id)``).
    df : pandas.DataFrame
        Must contain ``Phone_new`` and one column per entry of `items`.
    embeddings : mapping
        Must contain ``str(user_id)`` and every candidate item name as keys.
    items : sequence of str
        Item column names.

    Returns
    -------
    list of (item_id, similarity)
        In the order of `items`; empty when every item is already rated.
    """
    rated_items = {
        item
        for item in items
        if (get_rated_items(user_id, df, item, items) > 0).any()
    }
    user_embedding = get_user_embedding(user_id, embeddings)

    candidates = [item_id for item_id in items if item_id not in rated_items]
    if not candidates:
        return []

    # One vectorised call instead of one cosine_similarity call per item.
    item_matrix = [embeddings[item_id] for item_id in candidates]
    similarities = cosine_similarity([user_embedding], item_matrix)[0]

    return list(zip(candidates, similarities))

In [None]:
def recommend_items(user_id, df, embeddings, items, num_items=5):
    """Return the `num_items` items most similar to the user, best first.

    Similarities come from ``calculate_similarities``; the result is a list of
    ``(item_id, similarity)`` tuples sorted by similarity in descending order.

    NOTE: cells further down this notebook define another ``recommend_items`` with
    a different signature, which replaces this one once they have been executed.
    """
    item_similarities = calculate_similarities(user_id, df, embeddings, items)
    ranked = sorted(item_similarities, key=lambda pair: pair[1], reverse=True)
    return ranked[:num_items]

In [None]:
# Item (category) columns of the dataset, in the column order used for scoring.
items = [
    '%TN_Автотовары',
    '%TN_Аксессуары',
    '%TN_Детские товары',
    '%TN_Игры, софт и развлечения',
    '%TN_Климат',
    '%TN_Крупная бытовая техника',
    '%TN_Мебель',
    '%TN_Мелкая бытовая техника',
    '%TN_Сделай сам',
    '%TN_Спорт и активный отдых',
    '%TN_ТВ-Аудио',
    '%TN_Товары для дома',
    '%TN_Услуги',
    '%TN_Хобби, досуг',
    '%TN_Цифровая Техника',
    '%TN_Элитная техника',
]


In [None]:
# Rank every item for user id 0. get_user_embedding turns the id into the key "0",
# which is the node id creating_graph gives to the placeholder test user.
# recommends1 / recommends4 / recommends5 need embeddings1 / 4 / 5, which are not
# built in this run.
recommends6 = recommend_items(0, df6, embeddings6, items, num_items=len(items))

In [None]:
# Only recommends6 is computed in this notebook; recommends1 / 4 / 5 are never
# assigned, so referencing them here raised NameError.
recommends6

In [None]:
# Item columns of the cluster-0 hold-out split.
X1_test.loc[:, items]

In [None]:
# map_at_k pairs score position j with true_relevance position j, so the scores
# must follow the column order of `items`. recommend_items returns them sorted by
# similarity, so they are mapped back by item name here. Items missing from the
# recommendation list get -inf and therefore rank last.
# NOTE: recommends1 exists only when the cluster-0 lines (model1, embeddings1,
# recommends1) are uncommented in the cells above.
score_by_item1 = dict(recommends1)
relevance_score1 = [score_by_item1.get(item, float('-inf')) for item in items]
relevance_score1

In [None]:
def map_at_k(scores, true_relevance, k):
    """Mean average precision at `k` over a batch of users.

    Parameters
    ----------
    scores : sequence of sequences of float
        ``scores[u][j]`` is the predicted score of item ``j`` for user ``u``.
    true_relevance : sequence of sequences of numbers
        ``true_relevance[u][j] > 0`` marks item ``j`` as relevant for user ``u``.
        Positions must line up with `scores`.
    k : int
        Number of top-scored items considered per user. Values larger than the
        number of scored items are clipped to that number.

    Returns
    -------
    float
        Mean of the per-user AP@k values. A user with no relevant items (or with a
        non-positive cutoff) contributes 0.0; an empty batch yields 0.0.
    """
    ap_scores = []

    for i in range(len(scores)):
        user_scores = scores[i]
        user_relevance = true_relevance[i]

        # Item positions ordered by descending score (stable for ties).
        ranked = sorted(range(len(user_scores)), key=lambda idx: user_scores[idx], reverse=True)
        cutoff = min(k, len(ranked))

        num_relevant_items = sum(1 for value in user_relevance if value > 0)
        if num_relevant_items == 0 or cutoff <= 0:
            # Nothing can be retrieved; avoids dividing by zero below.
            ap_scores.append(0.0)
            continue

        relevant_count = 0
        precision_sum = 0.0
        for rank, item_idx in enumerate(ranked[:cutoff], start=1):
            if user_relevance[item_idx] > 0:
                relevant_count += 1
                precision_sum += relevant_count / rank
        ap_scores.append(precision_sum / min(num_relevant_items, k))

    if not ap_scores:
        return 0.0

    # Calculate the MAP@K score
    return np.mean(ap_scores)

In [None]:
# MAP@10 of the fixed score vector against every row of the cluster-0 hold-out split.
relevance_frame = X1_test[items]
maps = []
for row_pos in range(len(relevance_frame)):
    row_values = relevance_frame.iloc[row_pos].values.tolist()
    # Binarise: any positive value counts as relevant.
    true_relevance = [1 if value > 0 else value for value in row_values]
    maps.append(map_at_k([relevance_score1], [true_relevance], 10))
np.mean(maps)

Neural network models: GAT and GCN


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import networkx as nx
import scipy.sparse as sp

class GraphAttentionLayer(nn.Module):
    """Single-head dense graph attention layer.

    For node features ``h = input @ W`` the unnormalised attention score is
    ``e[i, j] = LeakyReLU(a[:F] . h[i] + a[F:] . h[j])`` with ``F = out_features``.
    This equals applying ``a`` to the concatenation ``[h[i] || h[j]]``, but is
    computed from two (N, 1) projections so that no (N, N, 2F) tensor of node
    pairs has to be materialised.

    Parameters
    ----------
    in_features, out_features : int
        Input and output feature sizes.
    dropout : float
        Dropout probability applied to the attention matrix (training mode only).
    alpha : float
        Negative slope of the LeakyReLU.
    concat : bool
        If True the output goes through ELU; otherwise it is returned as is.
    """

    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.alpha = alpha
        self.concat = concat
        self.dropout = dropout

        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)

        self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, input, adj):
        """Return attended node features.

        `input` is an (N, in_features) tensor and `adj` an (N, N) tensor whose
        positive entries mark the neighbours each node may attend to. The result
        has shape (N, out_features). A row of `adj` without positive entries ends
        up with uniform attention over all nodes (every score is masked to the
        same value before the softmax).
        """
        h = torch.matmul(input, self.W)

        # Split `a` into the halves acting on the source and on the target node.
        src_score = torch.matmul(h, self.a[:self.out_features])   # (N, 1)
        dst_score = torch.matmul(h, self.a[self.out_features:])   # (N, 1)
        e = self.leakyrelu(src_score + dst_score.t())             # (N, N) via broadcasting

        # Non-edges get a very negative score so the softmax ignores them.
        zero_vec = -9e15*torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, h)

        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

class GAT(nn.Module):
    """Two-layer graph attention network.

    `nheads` attention heads (each `nfeat` -> `nhid`, with ELU) are concatenated
    and fed to one output attention layer (`nhid * nheads` -> `nclass`). The
    forward pass returns row-wise log-softmax values of shape (N, nclass).
    """

    def __init__(self, nfeat, nhid, nclass, dropout=0.6, alpha=0.2, nheads=8):
        super().__init__()
        self.dropout = dropout

        # Heads live in a plain list; add_module registers each one under
        # 'attention_<i>' so its parameters belong to the model.
        self.attentions = []
        for head_idx in range(nheads):
            head = GraphAttentionLayer(nfeat, nhid, dropout=dropout, alpha=alpha, concat=True)
            self.add_module(f'attention_{head_idx}', head)
            self.attentions.append(head)

        self.out_att = GraphAttentionLayer(nhid * nheads, nclass, dropout=dropout, alpha=alpha, concat=False)

    def forward(self, x, adj):
        """Apply dropout, the attention heads, dropout again and the output layer."""
        x = F.dropout(x, self.dropout, training=self.training)
        head_outputs = [head(x, adj) for head in self.attentions]
        x = torch.cat(head_outputs, dim=1)
        x = F.dropout(x, self.dropout, training=self.training)
        out = F.elu(self.out_att(x, adj))
        return F.log_softmax(out, dim=1)

# Example usage: embed the cluster-5 graph (G6) with a GAT.
G = G6
# Adjacency matrix of the graph; rows follow the order of G.nodes().
adj = nx.adjacency_matrix(G)

# Convert adjacency matrix to a PyTorch dense tensor
adj_tensor = torch.FloatTensor(adj.todense())

# Define node features (random in this example)
node_features = torch.randn(G.number_of_nodes(), 16) # Use the number of nodes from your graph

# Initialize the GAT model. No training step is run on it in this cell, so the
# outputs below come from randomly initialised weights.
model = GAT(nfeat=16, nhid=8, nclass=2, dropout=0.6, alpha=0.2, nheads=8)

# Forward pass in evaluation mode: in training mode dropout (p=0.6) would randomly
# zero features and attention weights, making the embeddings differ on every call.
model.eval()
with torch.no_grad():
    embeddings = model(node_features, adj_tensor)
# Map each original name (user id / item column) to the output row of its node id.
embeddings_dict = {node_id: embeddings[index].detach().numpy() for node_id, index in node_index6.items()}


from sklearn.metrics.pairwise import cosine_similarity

def recommend_items(user_id, embeddings_dict, top_n=5):
    """Return the `top_n` item nodes most cosine-similar to `user_id`.

    Item nodes are the string keys of `embeddings_dict` that start with ``'%'``
    (other keys, including non-string ones, are ignored).

    NOTE: this replaces the earlier five-argument ``recommend_items`` defined in
    this notebook.

    Parameters
    ----------
    user_id
        Key of the user's vector in `embeddings_dict`.
    embeddings_dict : dict
        Maps node name to its embedding vector.
    top_n : int
        Maximum number of items to return.

    Returns
    -------
    list of (item_name, similarity)
        Sorted by similarity, best first; empty when there are no item keys.
    """
    user_embedding = embeddings_dict[user_id]

    # Keep names and vectors in two parallel lists so that an index into the
    # similarity array always refers to the matching item name.
    item_ids = [
        node_id
        for node_id in embeddings_dict
        if node_id != user_id and isinstance(node_id, str) and node_id.startswith('%')
    ]
    if not item_ids:
        return []
    item_embeddings = [embeddings_dict[item_id] for item_id in item_ids]

    similarities = cosine_similarity([user_embedding], item_embeddings)[0]
    top_indices = np.argsort(similarities)[::-1][:top_n]

    return [(item_ids[i], similarities[i]) for i in top_indices]

# Example usage
user_id = '000-000'
recommended_items = recommend_items(user_id, embeddings_dict)

# map_at_k needs one numeric score per column of `items`, in column order.
# recommended_items is a ranked list of (name, score) tuples, so the scores are
# mapped back by item name; items that were not recommended get -inf (ranked last).
score_by_item = dict(recommended_items)
item_scores = [score_by_item.get(item, float('-inf')) for item in items]

# The graph and embeddings above come from cluster 5 (G6 / node_index6), so the
# matching hold-out split is X6_test.
relevance_frame = X6_test[items]
maps = []
for row_pos in range(len(relevance_frame)):
    row_values = relevance_frame.iloc[row_pos].values.tolist()
    # Binarise: any positive value counts as relevant.
    true_relevance = [1 if value > 0 else value for value in row_values]
    maps.append(map_at_k([item_scores], [true_relevance], 5))
np.mean(maps)




In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_items(user_id, embeddings_dict, top_n=10):
    """Return the `top_n` item nodes most cosine-similar to `user_id`.

    Item nodes are the string keys of `embeddings_dict` that start with ``'%'``
    (other keys, including non-string ones, are ignored).

    NOTE: this replaces the ``recommend_items`` definitions that appear earlier
    in this notebook.

    Parameters
    ----------
    user_id
        Key of the user's vector in `embeddings_dict`.
    embeddings_dict : dict
        Maps node name to its embedding vector.
    top_n : int
        Maximum number of items to return.

    Returns
    -------
    list of (item_name, similarity)
        Sorted by similarity, best first; empty when there are no item keys.
    """
    user_embedding = embeddings_dict[user_id]

    # Keep names and vectors in two parallel lists so that an index into the
    # similarity array always refers to the matching item name.
    item_ids = [
        node_id
        for node_id in embeddings_dict
        if node_id != user_id and isinstance(node_id, str) and node_id.startswith('%')
    ]
    if not item_ids:
        return []
    item_embeddings = [embeddings_dict[item_id] for item_id in item_ids]

    similarities = cosine_similarity([user_embedding], item_embeddings)[0]
    top_indices = np.argsort(similarities)[::-1][:top_n]

    return [(item_ids[i], similarities[i]) for i in top_indices]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import networkx as nx
import scipy.sparse as sp


class GraphConvolution(nn.Module):
    """Bias-free graph convolution: ``adj @ (input @ weight)``.

    `weight` has shape (in_features, out_features) and is Xavier-uniform
    initialised. The adjacency matrix is used exactly as passed in.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(in_features, out_features, dtype=torch.float32))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, input, adj):
        """Project the node features, then aggregate them over `adj`."""
        projected = input @ self.weight
        return adj @ projected

class GCN(nn.Module):
    """Two stacked GraphConvolution layers (`nfeat` -> `nhid` -> `nclass`).

    ReLU is applied after the first layer; the forward pass returns row-wise
    log-softmax values of the second layer's output.
    """

    def __init__(self, nfeat, nhid, nclass):
        super().__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)

    def forward(self, x, adj):
        """Run both graph convolutions over `adj` and normalise with log-softmax."""
        hidden = F.relu(self.gc1(x, adj))
        logits = self.gc2(hidden, adj)
        return F.log_softmax(logits, dim=1)



# Embed the cluster-5 graph (G6) with a two-layer GCN.
Gr = G6
adj = nx.adjacency_matrix(Gr)

# Dense float32 tensor holding the adjacency matrix.
adj_tensor = torch.tensor(adj.toarray(), dtype=torch.float32)

# Random 16-dimensional features, one row per graph node.
node_features = torch.randn(Gr.number_of_nodes(), 16)

# The model is used as initialised; no training step is run in this cell.
model = GCN(nfeat=16, nhid=8, nclass=2)

# Forward pass: one output row per node.
embeddings = model(node_features, adj_tensor)

# Map each original name (user id / item column) to the output row of its node id.
embeddings_dict = {
    node_name: embeddings[row].detach().numpy()
    for node_name, row in node_index6.items()
}

# Recommend for the placeholder test user.
user_id = "000-000"
recommended_items = recommend_items(user_id, embeddings_dict)
recommended_items

In [None]:
# map_at_k needs one numeric score per column of `items`, in column order.
# recommended_items is a ranked list of (name, score) tuples, so the scores are
# mapped back by item name; items that were not recommended get -inf (ranked last).
score_by_item = dict(recommended_items)
item_scores = [score_by_item.get(item, float('-inf')) for item in items]

# The embeddings in the previous cell come from the cluster-5 graph (G6), so the
# matching hold-out split is X6_test. map_at_k treats any positive value as
# relevant, so the rows are passed without binarising them first.
relevance_frame = X6_test[items]
maps = [
    map_at_k([item_scores], [relevance_frame.iloc[row_pos].values.tolist()], 10)
    for row_pos in range(len(relevance_frame))
]
np.mean(maps)