# Imports

In [1]:
import spacy
import networkx as nx
import numpy as np
import os
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import MessagePassing, global_add_pool
from sklearn.model_selection import train_test_split
import pandas as pd 


## Language model (pre-trained spaCy word vectors)

In [2]:
# Pre-trained spaCy pipeline 'en_core_web_md'. preprocess_review() uses it for
# sentence splitting (doc.sents), stop-word flags and the per-word vectors that
# become node features.
nlp = spacy.load('en_core_web_md')

# Loading data

In [4]:
# Read the labelled reviews, shuffle every row with a fixed seed and renumber
# the index from 0 so positional and label-based access agree.
df = (
    pd.read_csv('data/train.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,review,label
0,This movie is another Christian propaganda fil...,0.0
1,A woman who hates cats (Alice Krige) and her s...,1.0
2,"Beast Wars is a show that is over-hyped, overp...",0.0
3,"An excellent example of ""cowboy noir"", as it's...",1.0
4,"Ok, basically this is a popcorn sci-fi movie, ...",1.0


# Preprocessing 

In [6]:
def preprocess_review(review, label, graph_visu=False):
    """Convert one review into a word graph wrapped in a PyG ``Data`` object.

    Nodes are the distinct alphabetic, non-stop-word tokens of the review.
    Two words are linked when they occur together in at least one sentence
    (weight ``1 + number of shared sentences``); in addition the last word of
    each sentence is linked to the first word of the following sentence
    (weight 1) so that consecutive sentences stay connected.

    Args:
        review: Raw review text, parsed with the module-level ``nlp`` pipeline.
        label: Class label; converted with ``int()`` and stored as a float tensor.
        graph_visu: When True, draw the graph with matplotlib before returning.

    Returns:
        ``Data(x, edge_index, y)`` where ``x`` has one row of ``nlp`` word
        vectors per node, ``edge_index`` has shape ``(2, num_edges)`` and ``y``
        has shape ``(1,)``.

    Notes:
        * Edge weights are stored on the networkx graph only; they are not
          exported to the returned ``Data`` object.
        * Each undirected edge is written once to ``edge_index``.
          NOTE(review): PyG layers treat ``edge_index`` as directed, so messages
          only flow one way along each edge -- confirm this is intended.
    """
    # Sentence-level tokenisation: keep alphabetic tokens that are not stop
    # words, and drop sentences that end up empty.
    doc = nlp(review)
    sentences = []
    for sent in doc.sents:
        kept = [
            token.text
            for token in sent
            if token.text.isalpha() and not nlp.vocab[token.text].is_stop
        ]
        if kept:
            sentences.append(kept)

    # word -> set of sentence indices in which it occurs (insertion-ordered).
    word_sentences = {}
    for sent_idx, sent in enumerate(sentences):
        for word in sent:
            word_sentences.setdefault(word, set()).add(sent_idx)

    G = nx.Graph()
    G.add_nodes_from(word_sentences)

    # Bridge consecutive sentences: last word of one -> first word of the next.
    for current, following in zip(sentences, sentences[1:]):
        previous_word = current[-1]
        next_word = following[0]
        if previous_word != next_word:
            G.add_edge(previous_word, next_word, weight=1)

    # Co-occurrence edges. The relation is symmetric, so visiting each
    # unordered pair once yields the same graph (and the same edge order) as
    # visiting every ordered pair.
    words = list(word_sentences)
    for pos, word in enumerate(words):
        for word2 in words[pos + 1:]:
            common_sentences = word_sentences[word] & word_sentences[word2]
            if common_sentences:
                G.add_edge(word, word2, weight=1 + len(common_sentences))

    if graph_visu:
        plt.figure(figsize=(10, 10))
        nx.draw(G, with_labels=True, font_weight='bold')
        plt.show()

    # Node features: one word vector per node, in graph node order.
    node_order = list(G.nodes())
    if node_order:
        node_features = np.array([nlp.vocab[word].vector for word in node_order])
        x = torch.tensor(node_features, dtype=torch.float)
    else:
        # No usable words: keep x two-dimensional so downstream layers still
        # see a (num_nodes, embedding_dim) tensor.
        x = torch.zeros((0, nlp.vocab.vectors_length), dtype=torch.float)

    # Edges: map node names to row indices with a single dict instead of
    # rebuilding list(G.nodes()) and scanning it twice per edge.
    node_position = {word: idx for idx, word in enumerate(node_order)}
    edges = np.array(
        [[node_position[u], node_position[v]] for u, v in G.edges()],
        dtype=np.int64,
    ).reshape(-1, 2)  # keeps shape (0, 2) when the graph has no edges
    edge_index = torch.tensor(edges.T, dtype=torch.long)

    y = torch.tensor([int(label)], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, y=y)


In [7]:
### Get only the first 200 reviews (fewer if the file is shorter)
N_REVIEWS = 200

# Positional slicing avoids a KeyError when the frame has fewer than
# N_REVIEWS rows and does not depend on the index labels.
list_of_reviews = [
    preprocess_review(review, label)
    for review, label in zip(df['review'].iloc[:N_REVIEWS], df['label'].iloc[:N_REVIEWS])
]
    

In [8]:
# Inspect the first graph: x is (num_nodes, embedding_dim), edge_index is
# (2, num_edges) and y holds the single label.
list_of_reviews[0]

Data(x=[57, 300], edge_index=[2, 228], y=[1])

### Data split

In [9]:
### Split the graphs into training (80%) and test (20%) sets with a fixed seed
# Note: all 200 graphs come from the training file. This split is only a small
# sanity check that the graphs are compatible with the model, not a real evaluation.
train_data, test_data = train_test_split(list_of_reviews, test_size=0.2, random_state=42)

In [10]:
### Is the data balanced ?

def get_num_pos_neg(data):
    """Count positive and negative samples.

    Args:
        data: Iterable of objects whose ``y.item()`` returns the label.

    Returns:
        Tuple ``(pos, neg)``: how many labels equal 1 and how many equal 0.
        Labels with any other value are counted in neither.
    """
    pos = 0
    neg = 0
    for sample in data:
        value = sample.y.item()
        if value == 1:
            pos += 1
        elif value == 0:
            neg += 1
    return pos, neg

# Count the labels of the training split to check the class balance.
pos,neg = get_num_pos_neg(train_data)

print(f'Number of positive reviews in the training set: {pos}')
print(f'Number of negative reviews in the training set: {neg}')


Number of positive reviews in the training set: 74
Number of negative reviews in the training set: 86


# Model

In [11]:
### This model is just an example to test the graphs
class GraphNet(MessagePassing):
    """Minimal message-passing layer: linear projection, then sum aggregation.

    Node features are first mapped from ``in_channels`` to ``out_channels``
    and then propagated along ``edge_index`` with ``aggr="add"``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__(aggr="add")
        self.lin = torch.nn.Linear(in_channels, out_channels)

    def forward(self, x, edge_index):
        """Project ``x`` and propagate the result along ``edge_index``."""
        projected = self.lin(x)
        return self.propagate(edge_index, x=projected)

    def message(self, x_j):
        """Send the neighbour features unchanged."""
        return x_j

    def update(self, aggr_out):
        """Return the aggregated messages unchanged."""
        return aggr_out

class Classifier(torch.nn.Module):
    """Two ``GraphNet`` layers followed by sum pooling and ``log_softmax``.

    Args:
        in_channels: Size of the input node features.
        hidden_channels: Output size of the first layer.
        out_channels: Output size of the second layer (one score per class).
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super(Classifier, self).__init__()
        self.conv1 = GraphNet(in_channels, hidden_channels)
        self.conv2 = GraphNet(hidden_channels, out_channels)

    def forward(self, x, edge_index, batch=None):
        """Return per-graph log-probabilities.

        Args:
            x: Node feature tensor.
            edge_index: Edge list passed to both layers.
            batch: Optional tensor assigning each node to a graph. When
                omitted, every node is assigned to graph 0, i.e. the input is
                treated as a single graph.

        Returns:
            Tensor of log-probabilities with one row per graph.
        """
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        if batch is None:
            # Build the index on the same device as x; a CPU index would fail
            # as soon as the inputs live on a GPU.
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)
        x = global_add_pool(x, batch)
        return F.log_softmax(x, dim=1)

# Seed PyTorch's global RNG so the weight initialisation below (and any later
# draws from that RNG) repeat on every Restart & Run All.
torch.manual_seed(42)

# Define the model and optimizer
# 300 input features per node, 128 hidden units, 2 output classes.
model = Classifier(300, 128, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Define the training loop
def train(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module called as ``model(x, edge_index)`` returning log-probabilities.
        loader: Iterable of batches exposing ``to``, ``x``, ``edge_index``,
            ``y`` and ``num_graphs``; must also provide ``dataset``.
        optimizer: Optimizer stepping the model parameters.
        device: Device every batch is moved to.

    Returns:
        Sum of ``loss * num_graphs`` over all batches divided by ``len(loader.dataset)``.
    """
    model.train()
    running_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        log_probs = model(batch.x.float(), batch.edge_index)
        batch_loss = F.nll_loss(log_probs, batch.y.long())
        batch_loss.backward()
        optimizer.step()
        # The loss value was fixed by the forward pass, so reading it after
        # the step gives the same number.
        running_loss += batch_loss.item() * batch.num_graphs
    return running_loss / len(loader.dataset)
    
    
# Define the testing loop
def test(model, loader, device):
    model.eval()
    correct = 0
    for data in loader:
        data = data.to(device)
        out = model(data.x.float(), data.edge_index)
        pred = out.argmax(dim=1)
        correct += pred.eq(data.y.long()).sum().item()
    return correct / len(loader.dataset)


# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# train() and test() move every batch to `device`; the model parameters must
# live there too, otherwise the first forward pass fails on a CUDA machine.
model = model.to(device)
# batch_size=1: Classifier.forward is called without a batch vector and then
# pools all nodes it receives into a single graph.
train_loader = DataLoader(train_data, batch_size=1 , shuffle=True)
test_loader = DataLoader(test_data, batch_size=1 , shuffle=False)
# range(1, 30) runs epochs 1..29.
for epoch in range(1, 30):
    loss = train(model, train_loader, optimizer, device)
    train_acc = test(model, train_loader, device)
    test_acc = test(model, test_loader, device)
    print(f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}", f"Test Acc: {test_acc:.4f}")





Epoch: 001, Loss: 92.8282, Train Acc: 0.7125 Test Acc: 0.6000
Epoch: 002, Loss: 34.5446, Train Acc: 0.5938 Test Acc: 0.5500
Epoch: 003, Loss: 66.2533, Train Acc: 0.5188 Test Acc: 0.5500
Epoch: 004, Loss: 27.8972, Train Acc: 0.8000 Test Acc: 0.5500
Epoch: 005, Loss: 39.8170, Train Acc: 0.8313 Test Acc: 0.6000
Epoch: 006, Loss: 14.9998, Train Acc: 0.9000 Test Acc: 0.5750
Epoch: 007, Loss: 32.1674, Train Acc: 0.8875 Test Acc: 0.5500
Epoch: 008, Loss: 2.7835, Train Acc: 0.9062 Test Acc: 0.6000
Epoch: 009, Loss: 4.5466, Train Acc: 0.9375 Test Acc: 0.6250
Epoch: 010, Loss: 10.0300, Train Acc: 0.9187 Test Acc: 0.6250
Epoch: 011, Loss: 6.2774, Train Acc: 0.9688 Test Acc: 0.6750
Epoch: 012, Loss: 0.2388, Train Acc: 0.9875 Test Acc: 0.6750
Epoch: 013, Loss: 0.1808, Train Acc: 0.9875 Test Acc: 0.6500
Epoch: 014, Loss: 0.1165, Train Acc: 0.9875 Test Acc: 0.7000
Epoch: 015, Loss: 0.1542, Train Acc: 1.0000 Test Acc: 0.7250
Epoch: 016, Loss: 0.0107, Train Acc: 0.9750 Test Acc: 0.6500
Epoch: 017, Loss