In [1]:
### imports 

import spacy
import networkx as nx
import numpy as np
import os
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import MessagePassing, global_add_pool
from sklearn.model_selection import train_test_split
import pandas as pd 



In [2]:
# Load a pre-trained word embedding model
nlp = spacy.load('en_core_web_md')

In [3]:
df = pd.read_csv('data/train.csv')
df = df.sample(frac=1, random_state = 42).reset_index(drop=True)
df.head()

Unnamed: 0,review,label
0,This movie is another Christian propaganda fil...,0.0
1,A woman who hates cats (Alice Krige) and her s...,1.0
2,"Beast Wars is a show that is over-hyped, overp...",0.0
3,"An excellent example of ""cowboy noir"", as it's...",1.0
4,"Ok, basically this is a popcorn sci-fi movie, ...",1.0


### Function 2 : Les biagrams 

In [4]:
def biagram_preprocessing_of_text(review, label): 

    ## sentences preprocessing
    doc = nlp(review)
    sentences = [sent for sent in doc.sents]
    sentences = [[token.text.lower() for token in sent if not token.is_stop and token.is_alpha] for sent in sentences]
    sentences = [[word for word in sent if len(word) > 1] for sent in sentences]

    ## get the biagrams
    biagrams = []
    for sent in sentences:
        for i in range(len(sent)-1):
            biagrams.append((sent[i], sent[i+1]))
    
    ### dico of how many times a biagram appears in the review
    dico_biagrams = {}
    for biagram in biagrams:
        if biagram not in dico_biagrams and (biagram[1], biagram[0]) not in dico_biagrams:
            dico_biagrams[biagram] = 1
        elif biagram in dico_biagrams:
            dico_biagrams[biagram] += 1
        elif (biagram[1], biagram[0]) in dico_biagrams:
            dico_biagrams[(biagram[1], biagram[0])] += 1
        

    list_of_words = [word for sent in sentences for word in sent]
    list_of_words = list(set(list_of_words))
    ## create graph 
    G = nx.Graph()
    ## nodes as words 
    G.add_nodes_from(list_of_words)

    ## add edges
    for biagram in dico_biagrams.keys():
        G.add_edge(biagram[0], biagram[1], weight = dico_biagrams[biagram])

    
    # Get the node features
    node_features = []
    for node in G.nodes():
        node_features.append(nlp.vocab[node].vector)
    node_features = np.array(node_features)
    # Get the edges
    edges = []
    for edge in G.edges():
        edges.append([list(G.nodes()).index(edge[0]), list(G.nodes()).index(edge[1])])
    edges = np.array(edges)
    ## get edges 
    edges_attr  = []
    for edge in G.edges():
        edges_attr.append([G.edges[edge]['weight']])
    edges_attr = np.array(edges_attr)
    
    # Get the label
    label_value = int(label)


    # Create a PyTorch Geometric Data object
    x = torch.tensor(node_features, dtype=torch.float)
    edge_index = torch.tensor(edges.T, dtype=torch.long)
    y = torch.tensor(label_value, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, edge_attr = torch.tensor(edges_attr, dtype=torch.float), y=y) ### here carefull, by adding the weights, the results are worse ! 
    
    return data

### Attention graphs ( does not work, takes forever to run, do not use !)

I had done previous tests by getting the attention weights from a bert and not one layer and it took more time so I abandonned that and this is my attempt at attention 

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionLayer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads=1):
        super(AttentionLayer, self).__init__()
        self.attention = nn.MultiheadAttention(input_dim, num_heads)
        self.linear = nn.Linear(input_dim, hidden_dim)
        self.dropout = nn.Dropout(0.1)
        
    def forward(self, x):
        # Permute the tensor to have shape (sentence_length, batch_size, input_dim)
        x = x.permute(1, 0, 2)
        # Apply linear transformation to get query, key, and value tensors
        q = self.linear(x)
        k = self.linear(x)
        v = self.linear(x)
        # Apply multi-head attention
        attn_output, _ = self.attention(q, k, v)
        # Reshape the output and apply dropout
        attn_output = attn_output.permute(1, 0, 2)
        attn_output = self.dropout(attn_output)
        # Apply linear transformation to get hidden representations
        hidden = self.linear(attn_output)
        # Apply dropout and return the output
        output = self.dropout(hidden)
        return output


def attention_preprocessing(review, label) :
    embedding_dim = 768
    hidden_dim = 768   
    input_dim = embedding_dim


    doc = nlp(review)
    sentences = [sent for sent in doc.sents]
    sentences = [[token.text.lower() for token in sent if not token.is_stop and token.is_alpha] for sent in sentences]
    sentences = [[word for word in sent if len(word) > 1] for sent in sentences]

    G = nx.Graph()
    list_of_words = [word for sent in sentences for word in sent]
    list_of_words = list(set(list_of_words))

    ## nodes as words 
    G.add_nodes_from(list_of_words)
    ## add edges 
    for sent in sentences:
        tokens = sent 
        input_tensor = torch.randn(1, len(tokens), input_dim)
        embeddings = input_tensor.transpose(0, 1)
        output = AttentionLayer(input_dim, hidden_dim, num_heads=1)(embeddings)
        output = output.transpose(0, 1)
        output = torch.mean(output, dim=2)

        ## loop in tokens and add edges with the attention weights
        for i in range(len(tokens)):
            for j in range(len(tokens)):
                if i != j:
                    if (tokens[i], tokens[j]) not in G.edges() and (tokens[j], tokens[i]) not in G.edges():
                        if output[0][i].item() > 0:
                            G.add_edge(tokens[i], tokens[j], weight=output[0][i].item())

    
    
    # Get the node features
    node_features = []
    for node in G.nodes():
        node_features.append(nlp.vocab[node].vector)
    node_features = np.array(node_features)
    # Get the edges
    edges = []
    for edge in G.edges():
        edges.append([list(G.nodes()).index(edge[0]), list(G.nodes()).index(edge[1])])
    edges = np.array(edges)

    ## get edge_attr
    edges_attr  = []
    for edge in G.edges():
        edges_attr.append([G.edges[edge]['weight']])
    edges_attr = np.array(edges_attr)


    # Get the label
    label_value = int(label)

    # Create a PyTorch Geometric Data object
    x = torch.tensor(node_features, dtype=torch.float)
    edge_index = torch.tensor(edges.T, dtype=torch.long)
    y = torch.tensor(label_value, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, edge_attr = torch.tensor(edges_attr, dtype=torch.float), y=y)


    return data

### Work in progress, create graphs depending on adjectifs... 

The idea is to join bigrams and this method to create graphs with bigrams + wights on what type of connection it is, and not number of time that connection is present. 

In [6]:

def adjectif_preprocessing(review, label, look_for = 'ADJ') : 
    doc = nlp(review) 


    sentences = [sent for sent in doc.sents]
    sentences = [{token.text.lower() : (token.pos_, token.dep_) for token in sent if not token.is_stop and token.is_alpha} for sent in sentences]
    
    G = nx.Graph()
    ## get the keys of the dictionary
    list_of_words = [word for sent in sentences for word in sent.keys()]
    ## get the unique words
    list_of_adverbs = []
    for sent in sentences:
        for word in sent:
            if sent[word][0] == look_for:
                list_of_adverbs.append(word)


    
    list_of_word_set = list(set(list_of_adverbs))

    ## nodes as words 
    G.add_nodes_from(list_of_word_set)
    ## add edges 

    ## create edges between each word in list_of_word
    for word in list_of_word_set:
        for word2 in list_of_word_set:
            if word != word2:
                G.add_edge(word, word2, weight=1)
    


        # Get the node features, the node feature are the word embeddings
    node_features = []
    for node in G.nodes():
        node_features.append(nlp.vocab[node].vector)
    node_features = np.array(node_features)

    # Get the edges
    edges = []
    for edge in G.edges():
        edges.append([list(G.nodes()).index(edge[0]), list(G.nodes()).index(edge[1])])
    edges = np.array(edges)

    ## get edge_attr
    edges_attr  = []
    for edge in G.edges():
        edges_attr.append([G.edges[edge]['weight']])
    edges_attr = np.array(edges_attr)

    
    # Get the label
    label_value = int(label)

    # Create a PyTorch Geometric Data object
    x = torch.tensor(node_features, dtype=torch.float)
    edge_index = torch.tensor(edges.T, dtype=torch.long)
    y = torch.tensor(label_value, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, edge_attr= torch.tensor(edges_attr, dtype=torch.float), y=y)


    return data


### Good preprocessing !! 

In [7]:
def biagram_depending_on_link(review, label):

    weights_each_type = {'ADJ': 3, 'ADV': 2, 'NOUN': 1, 'VERB': 4, 'ADP': 1, 'DET': 1, 'NUM': 1, 'PUNCT': 1, 'PRON': 1, 'PROPN': 1, 'SCONJ': 1, 'SYM': 1, 'X': 1, 'PART': 1, 'CCONJ': 1, 'INTJ': 1, 'AUX': 1, 'SPACE': 1, '': 1}

    ## sentences preprocessing
    doc = nlp(review)
    sentences = [sent for sent in doc.sents]
    sentences = [{token.text.lower() : (token.pos_, token.dep_) for token in sent if not token.is_stop and token.is_alpha} for sent in sentences]

    ## get the biagrams
    biagrams = []
    for sent in sentences:
        for i in range(len(sent)-1):
            biagrams.append((list(sent.keys())[i], list(sent.keys())[i+1]))

    
    ### concatenate all the sentences in one
    sent = {}
    for sentence in sentences:
        sent.update(sentence)

    ### dico of how many times a biagram appears in the review
    dico_biagrams = {}
    for biagram in biagrams:
        if biagram not in dico_biagrams and (biagram[1], biagram[0]) not in dico_biagrams:
            dico_biagrams[biagram] = weights_each_type[sent[biagram[0]][0]] + weights_each_type[sent[biagram[1]][0]]
        elif biagram in dico_biagrams:
            dico_biagrams[biagram] +=  weights_each_type[sent[biagram[0]][0]] + weights_each_type[sent[biagram[1]][0]]
        elif (biagram[1], biagram[0]) in dico_biagrams:
            dico_biagrams[(biagram[1], biagram[0])] += weights_each_type[sent[biagram[0]][0]] + weights_each_type[sent[biagram[1]][0]]
        

    list_of_words = [word for sent in sentences for word in sent]
    list_of_words = list(set(list_of_words))
    ## create graph 
    G = nx.Graph()
    ## nodes as words 
    G.add_nodes_from(list_of_words)

    ## add edges
    for biagram in dico_biagrams.keys():
        G.add_edge(biagram[0], biagram[1], weight = dico_biagrams[biagram])

    
    # Get the node features
    node_features = []
    for node in G.nodes():
        node_features.append(nlp.vocab[node].vector)
    node_features = np.array(node_features)
    # Get the edges
    edges = []
    for edge in G.edges():
        edges.append([list(G.nodes()).index(edge[0]), list(G.nodes()).index(edge[1])])
    edges = np.array(edges)
    ## edge_attr 
    edges_attr  = []
    for edge in G.edges():
        edges_attr.append([G.edges[edge]['weight']])
    edges_attr = np.array(edges_attr)
    
    # Get the label
    label_value = int(label)


    # Create a PyTorch Geometric Data object
    x = torch.tensor(node_features, dtype=torch.float)
    edge_index = torch.tensor(edges.T, dtype=torch.long)
    y = torch.tensor(label_value, dtype=torch.float)
    edge_attr = torch.tensor(edges_attr, dtype=torch.float)
    data = Data(x=x, edge_index=edge_index, edge_attr = edge_attr, y=y)
    
    return data

### Tests

In [8]:
### Test on 10 reviews 
list_of_reviews = [] 

for i in range(10):
    data = biagram_preprocessing_of_text(df['review'][i], df['label'][i])
    list_of_reviews.append(data)
    

In [9]:
list_of_reviews

[Data(x=[57, 300], edge_index=[2, 58], edge_attr=[58, 1], y=0.0),
 Data(x=[49, 300], edge_index=[2, 44], edge_attr=[44, 1], y=1.0),
 Data(x=[54, 300], edge_index=[2, 47], edge_attr=[47, 1], y=0.0),
 Data(x=[173, 300], edge_index=[2, 186], edge_attr=[186, 1], y=1.0),
 Data(x=[40, 300], edge_index=[2, 36], edge_attr=[36, 1], y=1.0),
 Data(x=[75, 300], edge_index=[2, 70], edge_attr=[70, 1], y=0.0),
 Data(x=[38, 300], edge_index=[2, 32], edge_attr=[32, 1], y=0.0),
 Data(x=[33, 300], edge_index=[2, 28], edge_attr=[28, 1], y=1.0),
 Data(x=[50, 300], edge_index=[2, 47], edge_attr=[47, 1], y=0.0),
 Data(x=[43, 300], edge_index=[2, 41], edge_attr=[41, 1], y=0.0)]

In [10]:
### Test on 10 reviews 
list_of_reviews = [] 

for i in range(10):
    data = adjectif_preprocessing(df['review'][i], df['label'][i])
    list_of_reviews.append(data)

In [11]:
list_of_reviews

[Data(x=[6, 300], edge_index=[2, 15], edge_attr=[15, 1], y=0.0),
 Data(x=[8, 300], edge_index=[2, 28], edge_attr=[28, 1], y=1.0),
 Data(x=[5, 300], edge_index=[2, 10], edge_attr=[10, 1], y=0.0),
 Data(x=[38, 300], edge_index=[2, 703], edge_attr=[703, 1], y=1.0),
 Data(x=[6, 300], edge_index=[2, 15], edge_attr=[15, 1], y=1.0),
 Data(x=[15, 300], edge_index=[2, 105], edge_attr=[105, 1], y=0.0),
 Data(x=[7, 300], edge_index=[2, 21], edge_attr=[21, 1], y=0.0),
 Data(x=[6, 300], edge_index=[2, 15], edge_attr=[15, 1], y=1.0),
 Data(x=[11, 300], edge_index=[2, 55], edge_attr=[55, 1], y=0.0),
 Data(x=[6, 300], edge_index=[2, 15], edge_attr=[15, 1], y=0.0)]

In [12]:
### Test on 2 reviews 
list_of_reviews = [] 

for i in range(2):
    data = attention_preprocessing(df['review'][i], df['label'][i])
    list_of_reviews.append(data)

list_of_reviews

[Data(x=[57, 300], edge_index=[2, 205], edge_attr=[205, 1], y=0.0),
 Data(x=[49, 300], edge_index=[2, 352], edge_attr=[352, 1], y=1.0)]

In [13]:
list_of_reviews = [] 

for i in range(5):
    data = biagram_depending_on_link(df['review'][i], df['label'][i])
    list_of_reviews.append(data)

list_of_reviews

[Data(x=[57, 300], edge_index=[2, 55], edge_attr=[55, 1], y=0.0),
 Data(x=[49, 300], edge_index=[2, 44], edge_attr=[44, 1], y=1.0),
 Data(x=[54, 300], edge_index=[2, 47], edge_attr=[47, 1], y=0.0),
 Data(x=[173, 300], edge_index=[2, 185], edge_attr=[185, 1], y=1.0),
 Data(x=[40, 300], edge_index=[2, 36], edge_attr=[36, 1], y=1.0)]