In [36]:
import spacy
import networkx as nx
import numpy as np
import os
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import MessagePassing, global_add_pool
from sklearn.model_selection import train_test_split
import json
import csv
import pandas as pd


### Utility functions

In [30]:
### function to read the files 

def read_file(path, number):
    whole_data = []

    with open(path, 'r') as f:
        for i in range(number):
            line = f.readline()
            data = json.loads(line)
            if 'reviewText' in data:
                value = data['reviewText'].replace('\n', ' ')
                whole_data.append(value)
    
    return whole_data

In [31]:
### function that writes the csv file

def write_csv_file(path, data, number, label):
    data = [x for x in data if len(x.split()) > 10]

    with open(path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['reviewText', 'label'])
        for i in range(number):
            writer.writerow([data[i], label])

    return None

### Generating the database

Fashion, label 0 

In [32]:
path = 'amazon_data/AMAZON_FASHION_5.json' 
fashion_data = read_file(path, 2000)
write_csv_file('multi_class_data/fashion.csv', fashion_data, 1000, 0)

Music, label 1 

In [33]:
path = 'amazon_Data/Digital_Music_5.json'
music_data = read_file(path, 2000)
write_csv_file('multi_class_data/music.csv', music_data, 1000, 1)

Sports, label 2 

In [34]:
path = 'amazon_data/Sports_and_Outdoors_5.json'
sport_data = read_file(path, 2000)
write_csv_file('multi_class_data/sport.csv', sport_data, 1000, 2)

Pet Supplies, label 3

In [35]:
path = 'amazon_data/Pet_Supplies_5.json'
pet_data = read_file(path, 2000)
write_csv_file('multi_class_data/pet.csv', pet_data, 1000, 3)

### Loading the database

In [39]:
df_fashion = pd.read_csv('multi_class_data/fashion.csv')
df_music = pd.read_csv('multi_class_data/music.csv')
df_sport = pd.read_csv('multi_class_data/sport.csv')
df_pet = pd.read_csv('multi_class_data/pet.csv')

df = pd.concat([df_fashion, df_music, df_sport, df_pet], ignore_index=True)
df = df.sample(frac=1, random_state = 42).reset_index(drop=True)

In [40]:
df.head()

Unnamed: 0,reviewText,label
0,the best sneakers by far! I had never owned a ...,0
1,The quality of the items were good and they we...,3
2,"These shoes are extremely comfortable, and fit...",0
3,"They worked extremely well, this is the only p...",3
4,Convenient packaging and reasonable pricing. N...,2


### Preprocessing

In [41]:
# Load a pre-trained word embedding model
nlp = spacy.load('en_core_web_md')

In [42]:
def biagram_depending_on_link(review, label):
    
    weights_each_type = {'ADJ': 3, 'ADV': 2, 'NOUN': 1, 'VERB': 4, 'ADP': 1, 'DET': 1, 'NUM': 1, 'PUNCT': 1, 'PRON': 1, 'PROPN': 1, 'SCONJ': 1, 'SYM': 1, 'X': 1, 'PART': 1, 'CCONJ': 1, 'INTJ': 1, 'AUX': 1, 'SPACE': 1, '': 1}

    ## sentences preprocessing
    doc = nlp(review)
    sentences = [sent for sent in doc.sents]
    sentences = [{token.text.lower() : (token.pos_, token.dep_) for token in sent if not token.is_stop and token.is_alpha} for sent in sentences]

    ## get the biagrams
    biagrams = []
    for sent in sentences:
        for i in range(len(sent)-1):
            biagrams.append((list(sent.keys())[i], list(sent.keys())[i+1]))

    
    ### concatenate all the sentences in one
    sent = {}
    for sentence in sentences:
        sent.update(sentence)

    ### dico of how many times a biagram appears in the review
    dico_biagrams = {}
    for biagram in biagrams:
        if biagram not in dico_biagrams and (biagram[1], biagram[0]) not in dico_biagrams:
            dico_biagrams[biagram] = weights_each_type[sent[biagram[0]][0]] + weights_each_type[sent[biagram[1]][0]]
        elif biagram in dico_biagrams:
            dico_biagrams[biagram] +=  weights_each_type[sent[biagram[0]][0]] + weights_each_type[sent[biagram[1]][0]]
        elif (biagram[1], biagram[0]) in dico_biagrams:
            dico_biagrams[(biagram[1], biagram[0])] += weights_each_type[sent[biagram[0]][0]] + weights_each_type[sent[biagram[1]][0]]
        

    list_of_words = [word for sent in sentences for word in sent]
    list_of_words = list(set(list_of_words))
    ## create graph 
    G = nx.Graph()
    ## nodes as words 
    G.add_nodes_from(list_of_words)

    ## add edges
    for biagram in dico_biagrams.keys():
        G.add_edge(biagram[0], biagram[1], weight = dico_biagrams[biagram])

    
    # Get the node features
    node_features = []
    for node in G.nodes():
        node_features.append(nlp.vocab[node].vector)
    node_features = np.array(node_features)
    # Get the edges
    edges = []
    for edge in G.edges():
        edges.append([list(G.nodes()).index(edge[0]), list(G.nodes()).index(edge[1])])
    edges = np.array(edges)
    ## edge_attr 
    edges_attr  = []
    for edge in G.edges():
        edges_attr.append([G.edges[edge]['weight']])
    edges_attr = np.array(edges_attr)
    
    # Get the label
    label_value = int(label)


    # Create a PyTorch Geometric Data object
    x = torch.tensor(node_features, dtype=torch.float)
    edge_index = torch.tensor(edges.T, dtype=torch.long)
    y = torch.tensor(label_value, dtype=torch.float)
    edge_attr = torch.tensor(edges_attr, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, edge_attr = edge_attr, y=y)
    
    return data

In [56]:
list_of_reviews = [] 

for i in range(800):
    data = biagram_depending_on_link(df['reviewText'][i], df['label'][i])
    list_of_reviews.append(data)

In [57]:
list_of_reviews[0]

Data(x=[11, 300], edge_index=[2, 9], edge_attr=[9, 1], y=0.0)

In [58]:
train_data, test_data = train_test_split(list_of_reviews, test_size=0.2, random_state=42)

In [60]:
for i,review in enumerate(train_data):
    if review.edge_index.shape < torch.Size([2]):
        print(review.edge_index.shape)
        print(i)
        print(review.y)
        print(review.x.shape)
        train_data.pop(i)
print(len(train_data))

for i,review in enumerate(test_data):
    if review.edge_index.shape < torch.Size([2]):
        print(review.edge_index.shape)
        test_data.pop(i)

torch.Size([0])
263
tensor(3.)
torch.Size([7, 300])
639


### is the data balanced?

In [62]:
def get_numbers(data):
    labels = [d.y.item() for d in data]
    fashion = labels.count(0)   
    music = labels.count(1)
    sport = labels.count(2)
    pet = labels.count(3)

    return fashion, music, sport, pet

fashion, music, sport, pet = get_numbers(train_data)

print(f'Number of fashion reviews: {fashion}')
print(f'Number of music reviews: {music}')
print(f'Number of sport reviews: {sport}')
print(f'Number of pet reviews: {pet}')


Number of fashion reviews: 160
Number of music reviews: 175
Number of sport reviews: 160
Number of pet reviews: 144


In [63]:

import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, global_add_pool, global_mean_pool  

'''
Graph SAGE: SAmpling and aggreGatE, 
Samples only a subset of neighboring nodes at different depth layers, 
and then the aggregator takes neighbors of the previous layers and aggregates them
'''
class GraphSAGE(torch.nn.Module):
  """GraphSAGE"""
  def __init__(self, num_node_features, hidden_dim, num_classes):
    super().__init__()
    self.sage1 = SAGEConv(num_node_features, hidden_dim*2)
    self.sage2 = SAGEConv(hidden_dim*2, hidden_dim)
    self.sage3 = SAGEConv(hidden_dim, hidden_dim)
    self.sage4 = SAGEConv(hidden_dim, num_classes)
    self.optimizer = torch.optim.Adam(self.parameters(),
                                      lr=0.0001,
                                        weight_decay=5e-4)
                                      
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  def forward(self, x, edge_index):
    ## layer 1 
    h = self.sage1(x, edge_index)
    h = torch.relu(h)
    h = F.dropout(h, p=0.2, training=self.training)

    ## layer 2

    h = self.sage2(h, edge_index)
    h = torch.relu(h)
    h = F.dropout(h, p=0.2, training=self.training)

    # layer 3 
    h = self.sage3(h, edge_index)
    h = torch.relu(h)
    h = F.dropout(h, p=0.5, training=self.training)

     # layer 4
    h = self.sage3(h, edge_index)
    h = torch.relu(h)
    h = F.dropout(h, p=0.5, training=self.training)

     # layer 5
    h = self.sage3(h, edge_index)
    h = torch.relu(h)
    h = F.dropout(h, p=0.2, training=self.training)

    ## layer 6
    h = self.sage4(h, edge_index)
    h = global_mean_pool(h, torch.zeros(h.size(0), dtype=torch.long).to(self.device))
    return h, F.log_softmax(h, dim=1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



def train(model, loader, device):
    model.train()
    optimizer = model.optimizer
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        _, out = model(data.x.float(), data.edge_index)
        loss = F.nll_loss(out, data.y.long())
        loss.backward()
        total_loss += loss.item() * data.num_graphs
        optimizer.step()
    return total_loss / len(loader.dataset)
    
    
# Define the testing loop
def test(model, loader, device):
    model.eval()
    correct = 0
    for data in loader:
        data = data.to(device)
        _, out = model(data.x.float(), data.edge_index)
        pred = out.argmax(dim=1)
        correct += pred.eq(data.y.long()).sum().item()
    return correct / len(loader.dataset)



# Define the model and optimizer
model = GraphSAGE(300, 300, 4).to(device)

# Train the model
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)


for epoch in range(0, 10):
    loss = train(model, train_loader, device)
    train_acc = test(model, train_loader, device)
    test_acc = test(model, test_loader, device)
    print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}", f"Test Acc: {test_acc:.4f}")


print(f'\nGraphSage test accuracy: {test(model, test_loader, device)*100:.2f}%\n')



Epoch: 000, Loss: 1.2134, Train Acc: 0.6041 Test Acc: 0.6000
Epoch: 001, Loss: 0.5845, Train Acc: 0.8811 Test Acc: 0.8562
Epoch: 002, Loss: 0.2824, Train Acc: 0.9452 Test Acc: 0.9000
Epoch: 003, Loss: 0.1652, Train Acc: 0.9609 Test Acc: 0.9125
Epoch: 004, Loss: 0.1215, Train Acc: 0.9875 Test Acc: 0.8938
Epoch: 005, Loss: 0.0816, Train Acc: 0.9969 Test Acc: 0.9187
Epoch: 006, Loss: 0.0586, Train Acc: 0.9953 Test Acc: 0.9062
Epoch: 007, Loss: 0.0455, Train Acc: 0.9937 Test Acc: 0.9125
Epoch: 008, Loss: 0.0601, Train Acc: 0.9906 Test Acc: 0.9000
Epoch: 009, Loss: 0.0147, Train Acc: 1.0000 Test Acc: 0.8938

GraphSage test accuracy: 89.38%

