In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer, AdamW
# from stanza import ConstituencyParser
import stanza


# Load the pretrained BERT model and tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the constituency parser
# parser = ConstituencyParser()
parser = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')


# Define the neural network
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = bert_model
        self.attention1 = nn.TransformerEncoderLayer(d_model=768, nhead=8)
        self.attention2 = nn.TransformerEncoderLayer(d_model=768, nhead=8)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        wordpiece_embeddings = outputs[0]

        # Group WordPiece embeddings into constituent embeddings
        constituent_embeddings = []
        for i, sentence in enumerate(input_ids):
            tree = parser(sentence.tolist())
            constituents = []
            for node in tree.traverse():
                if node.label == 'NP' or node.label == 'VP':
                    constituents.append((node.start, node.end))

            constituent_embedding_list = []
            for start, end in constituents:
                constituent_embedding = torch.sum(wordpiece_embeddings[i][start:end], dim=0)
                constituent_embedding_list.append(constituent_embedding)

            constituent_embeddings.append(torch.stack(constituent_embedding_list))

        # Add two additional attention layers
        attention_output1 = self.attention1(constituent_embeddings)
        attention_output2 = self.attention2(attention_output1)

        # Pool constituent embeddings
        pooled_output = self.pool(attention_output2)

        # Classify
        logits = self.classifier(pooled_output)
        output = F.softmax(logits, dim=1)

        return output

# Define the training loop
model = NeuralNetwork()
optimizer = AdamW(model.parameters())

for epoch in range(10):
    for i, (data, target) in enumerate(dataloader):
        input_ids = data['input_ids']
        attention_mask = data['attention_mask']

        outputs = model(input_ids, attention_mask)
        loss = F.cross_entropy(outputs, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print(f'Epoch: {epoch}, Iteration: {i}, Loss: {loss.item():.4f}')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
2023-11-17 07:16:13 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_met

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-17 07:16:15 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2023-11-17 07:16:15 INFO: Using device: cpu
2023-11-17 07:16:15 INFO: Loading: tokenize
2023-11-17 07:16:15 INFO: Loading: pos
2023-11-17 07:16:16 INFO: Loading: constituency
2023-11-17 07:16:17 INFO: Done loading processors!


NameError: name 'dataloader' is not defined

In [10]:
# pip install stanza

In [44]:
from torch.utils.data import Dataset

class ConstituencyParsingDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]['sentence']
        target = self.data[idx]['target']

        # Tokenize the sentence using a pretrained BERT tokenizer
        encoded_inputs = bert_tokenizer(sentence, padding='max_length', truncation=True)
        input_ids = encoded_inputs['input_ids']
        attention_mask = encoded_inputs['attention_mask']

        # Preprocess the constituency parse tree
#         constituent_embeddings = preprocess_constituency_parse(target)

        return {'input_ids': input_ids,
                'attention_mask': attention_mask}

#                 'constituent_embeddings': constituent_embeddings}


In [33]:
import pandas as pd

df = pd.read_table("D:\OneDrive - NITT\Custom_Download\output_training.txt", header=None, dtype={0: str}).rename(
    columns={0: "pk", 1: "label", 2: "text"}
)
df

Unnamed: 0,pk,label,text
0,00001,0,Le infrastrutture come fattore di competitivit...
1,00002,0,Negli ultimi anni la dinamica dei polo di attr...
2,00003,0,Il raggiungimento e il mantenimento di posizio...
3,00004,0,Quest'ultimo è funzione di variabili struttura...
4,00005,1,"Il contesto milanese, se da un lato è stato te..."
...,...,...,...
9753,10170,0,Scrooge era il suo unico esecutore testamentar...
9754,10171,1,"Anzi il nostro Scrooge, che per verità il tris..."
9755,10172,0,Il ricordo dei funerali mi fa tornare al punto...
9756,10173,0,Non c'è dunque dubbio che Marley era morto.


In [34]:
df = df[:200]

In [36]:
df.drop(columns='pk',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns='pk',inplace=True)


In [40]:
df.rename({'label':'target','text':'sentence'},axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({'label':'target','text':'sentence'},axis=1,inplace=True)


In [43]:
df.target.value_counts()

0    148
1     52
Name: target, dtype: int64

In [46]:
from torch.utils.data import TensorDataset, DataLoader

In [53]:
dataloader = DataLoader(ConstituencyParsingDataset(df), batch_size=32)

In [58]:
for i, (data) in enumerate(dataloader):
    print(data)

KeyError: 0

In [12]:
import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
doc = nlp('This is a test')
for sentence in doc.sentences:
    print(sentence.constituency)

2023-11-17 06:57:59 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/pos/combined_charlm.pt:   0%|  …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/constituency/ptb3-revised_charl…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/pretrain/conll17.pt:   0%|     …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/backward_charlm/1billion.pt:   …

2023-11-17 06:58:44 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| constituency | ptb3-revised_charlm |

2023-11-17 06:58:44 INFO: Using device: cpu
2023-11-17 06:58:44 INFO: Loading: tokenize
2023-11-17 06:58:44 INFO: Loading: pos
2023-11-17 06:58:45 INFO: Loading: constituency
2023-11-17 06:58:46 INFO: Done loading processors!


(ROOT (S (NP (DT This)) (VP (VBZ is) (NP (DT a) (NN test)))))


In [23]:
tree = nlp('This there a test')
for sentence in tree.sentences:
    print(sentence.constituency.children)

((NP (NP (DT This)) (NP (EX there)) (NP (DT a) (NN test))),)


In [30]:
tree.sentences[0].constituency

(ROOT (NP (NP (DT This)) (NP (EX there)) (NP (DT a) (NN test))))

In [73]:
for i in list(tree.sentences[0].constituency):
    print(i)

TypeError: 'Tree' object is not iterable

In [None]:
constituents = []
for node in tree.traverse():
    if node.label == 'NP' or node.label == 'VP':
        constituents.append((node.start, node.end))