# Extract and read dataset

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) 
# will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/new-york-times-relation-extraction-dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/new-york-times-relation-extraction-dataset/dataset/train.json
/kaggle/input/new-york-times-relation-extraction-dataset/dataset/test.json
/kaggle/input/new-york-times-relation-extraction-dataset/dataset/valid.json


In [2]:
# Function to extract relations from a JSON object
def extract_relations(json_line):
    relations = []
    sent_text = json_line['sentText']
    for relation in json_line['relationMentions']:
        em1_text = relation['em1Text']
        em2_text = relation['em2Text']
        label = relation['label']
        relations.append((sent_text, em1_text, em2_text, label))
    return relations

In [3]:
import json

# Define the file path
input_file = '/kaggle/input/new-york-times-relation-extraction-dataset/dataset/train.json'

# Initialize a list to hold the data
data = []

# Initialize a counter for the number of lines processed
count = 0

# Read the JSON file
with open(input_file, 'r') as file:
    for line in file:
        json_line = json.loads(line)  # Load JSON object from the line
        relations = extract_relations(json_line)  # Extract relations
        data.extend(relations)  # Add extracted relations to the data list
        count += 1  # Increment the counter

In [4]:
# Print the number of lines processed and the first few extracted relations
print(f"Processed {count} lines.")
print(f"First few extracted relations: {data[:5]}")

Processed 56196 lines.
First few extracted relations: [('Massachusetts ASTON MAGNA Great Barrington ; also at Bard College , Annandale-on-Hudson , N.Y. , July 1-Aug .', 'Annandale-on-Hudson', 'Bard College', '/location/location/contains'), ('North Carolina EASTERN MUSIC FESTIVAL Greensboro , June 25-July 30 .', 'North Carolina', 'Greensboro', '/location/location/contains'), ("It will be the final movie credited to Debra Hill , a film producer and native of Haddonfield , who produced '' Halloween '' and was considered a pioneering woman in film .", 'Debra Hill', 'Haddonfield', '/people/person/place_of_birth'), ("In a 3-0 victory over the Crew on Saturday in Columbus , Ohio , goalkeeper Zach Wells stopped Kyle Martino 's penalty kick , only the third unsuccessful penalty in 20 attempts in M.L.S. this season .", 'Ohio', 'Columbus', '/location/location/contains'), ("The United States ambassador to Mexico , Tony Garza , said in a statement that he had directed the American Consulate in Nuev

In [5]:
# Convert the extracted data to a pandas DataFrame
df = pd.DataFrame(data, columns=['sentence', 'entity1', 'entity2', 'label'])

In [7]:
print(df.head(1))
print(df.info())

                                            sentence              entity1  \
0  Massachusetts ASTON MAGNA Great Barrington ; a...  Annandale-on-Hudson   

        entity2                        label  
0  Bard College  /location/location/contains  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94222 entries, 0 to 94221
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  94222 non-null  object
 1   entity1   94222 non-null  object
 2   entity2   94222 non-null  object
 3   label     94222 non-null  object
dtypes: object(4)
memory usage: 2.9+ MB
None


In [8]:
#Change the first letter to an empty string in the label column
df['label'] = df['label'].str[1:]
df['label'] = df['label'].str.replace('/', '_')
df.head()

Unnamed: 0,sentence,entity1,entity2,label
0,Massachusetts ASTON MAGNA Great Barrington ; a...,Annandale-on-Hudson,Bard College,location_location_contains
1,North Carolina EASTERN MUSIC FESTIVAL Greensbo...,North Carolina,Greensboro,location_location_contains
2,It will be the final movie credited to Debra H...,Debra Hill,Haddonfield,people_person_place_of_birth
3,In a 3-0 victory over the Crew on Saturday in ...,Ohio,Columbus,location_location_contains
4,"The United States ambassador to Mexico , Tony ...",Mexico,Nuevo Laredo,location_location_contains


In [9]:
# Print the unique labels in the label column
unique_labels = df['label'].unique()
sorted(unique_labels)

['business_company_advisors',
 'business_company_founders',
 'business_company_industry',
 'business_company_major_shareholders',
 'business_company_place_founded',
 'business_company_shareholder_major_shareholder_of',
 'business_person_company',
 'location_administrative_division_country',
 'location_country_administrative_divisions',
 'location_country_capital',
 'location_location_contains',
 'location_neighborhood_neighborhood_of',
 'people_deceased_person_place_of_death',
 'people_ethnicity_geographic_distribution',
 'people_ethnicity_people',
 'people_person_children',
 'people_person_ethnicity',
 'people_person_nationality',
 'people_person_place_lived',
 'people_person_place_of_birth',
 'people_person_profession',
 'people_person_religion',
 'sports_sports_team_location',
 'sports_sports_team_location_teams']

In [10]:
#filter similar labels and compare
df[df['label'] == 'location_administrative_division_country'].iloc[0]

sentence    Quebec , Canada 's second most populous provin...
entity1                                               Ontario
entity2                                                Canada
label                location_administrative_division_country
Name: 11, dtype: object

In [11]:
df[df['label'] == 'location_country_administrative_divisions'].iloc[0]

sentence    Quebec , Canada 's second most populous provin...
entity1                                                Canada
entity2                                               Ontario
label               location_country_administrative_divisions
Name: 13, dtype: object

In [12]:
# Display the sentences with more than one label
sentence_label_counts = df.groupby('sentence')['label'].nunique()

# Filter to find sentences that have more than one unique label
multi_label_sentences = sentence_label_counts[sentence_label_counts > 1]


multi_label_sentences_list = df[df['sentence'].isin(multi_label_sentences.index)]
multi_label_sentences_list.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47602 entries, 8 to 94221
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  47602 non-null  object
 1   entity1   47602 non-null  object
 2   entity2   47602 non-null  object
 3   label     47602 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [13]:
# Define pairs of similar labels
similar_labels_pairs = [
    ('business_company_major_shareholders', 'business_company_shareholder_major_shareholder_of'),
    ('location_administrative_division_country', 'location_country_administrative_divisions'),
    ('sports_sports_team_location', 'sports_sports_team_location_teams')
]

# Initialize a list to store sentences where these pairs occur
sentences_with_similar_labels = []

# Loop through each pair and find sentences where both labels are present
for label1, label2 in similar_labels_pairs:
    sentences_with_both_labels = df[df['label'].isin([label1, label2])]['sentence']
    common_sentences = sentences_with_both_labels.value_counts()[sentences_with_both_labels.value_counts() > 1].index.tolist()
    sentences_with_similar_labels.extend(common_sentences)

# Remove duplicates from the list
sentences_with_similar_labels = list(set(sentences_with_similar_labels))

# Display the sentences with the similar labels
similar_label_examples = df[df['sentence'].isin(sentences_with_similar_labels)]
similar_label_examples.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21214 entries, 11 to 94202
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  21214 non-null  object
 1   entity1   21214 non-null  object
 2   entity2   21214 non-null  object
 3   label     21214 non-null  object
dtypes: object(4)
memory usage: 828.7+ KB


In [14]:
# Create a dictionary to map old labels to new consolidated labels
label_mapping = {
    'business_company_shareholder_major_shareholder_of': 'business_company_major_shareholders',
    'location_country_administrative_divisions': 'location_administrative_division_country',
    'sports_sports_team_location_teams': 'sports_sports_team_location'
}

# Apply the mapping to the dataframe
df['label'] = df['label'].replace(label_mapping)

# Verify that the labels have been grouped correctly
df[df['sentence'].isin(sentences_with_similar_labels)]


Unnamed: 0,sentence,entity1,entity2,label
11,"Quebec , Canada 's second most populous provin...",Ontario,Canada,location_administrative_division_country
12,"Quebec , Canada 's second most populous provin...",Canada,Ontario,location_location_contains
13,"Quebec , Canada 's second most populous provin...",Canada,Ontario,location_administrative_division_country
19,A French court sentenced six Algerian-French m...,Paris,France,location_administrative_division_country
20,A French court sentenced six Algerian-French m...,France,Paris,location_location_contains
...,...,...,...,...
94198,"The United Arab Emirates deserves a serious , ...",Dubai,United Arab Emirates,location_administrative_division_country
94199,"The United Arab Emirates deserves a serious , ...",United Arab Emirates,Dubai,location_administrative_division_country
94200,It 's easy to imagine how the Bush administrat...,United Arab Emirates,Dubai,location_administrative_division_country
94201,It 's easy to imagine how the Bush administrat...,Dubai,United Arab Emirates,location_administrative_division_country


In [15]:
unique_labels = df['label'].unique()
sorted(unique_labels)
#21 labels

['business_company_advisors',
 'business_company_founders',
 'business_company_industry',
 'business_company_major_shareholders',
 'business_company_place_founded',
 'business_person_company',
 'location_administrative_division_country',
 'location_country_capital',
 'location_location_contains',
 'location_neighborhood_neighborhood_of',
 'people_deceased_person_place_of_death',
 'people_ethnicity_geographic_distribution',
 'people_ethnicity_people',
 'people_person_children',
 'people_person_ethnicity',
 'people_person_nationality',
 'people_person_place_lived',
 'people_person_place_of_birth',
 'people_person_profession',
 'people_person_religion',
 'sports_sports_team_location']

# Preprocess

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

In [17]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Initialize the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', local_files_only=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
# 1. Preprocess the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
label_encoder = {label: i for i, label in enumerate(df['label'].unique())}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
# Check the maximum length of the tokenized labels
label_lengths = [len(tokenizer.encode(label, add_special_tokens=True)) for label in df['label'].unique()]
max_label_length = max(label_lengths)
print(f"Maximum label length: {max_label_length}")

Maximum label length: 13


In [20]:
def preprocess_data(row):
    sentence = row['sentence']
    # Tokenize sentence
    tokens = tokenizer.tokenize(sentence)

    # Encode entities positions
    encoded_sentence = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    ).to(device)

    return {
        'input_ids': encoded_sentence['input_ids'].squeeze(),
        'attention_mask': encoded_sentence['attention_mask'].squeeze(),
        'label': torch.tensor(label_encoder[row['label']])
    }

In [21]:
#apply preprocess to dataset
train_data = train_df.apply(preprocess_data, axis=1).tolist()
val_data = val_df.apply(preprocess_data, axis=1).tolist()

In [22]:
# 2. Create custom dataset class
class RelationshipDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [24]:
# Create dataset and dataloader
train_dataset = RelationshipDataset(train_data)
val_dataset = RelationshipDataset(val_data)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [26]:
class RelationshipExtractionModel(torch.nn.Module):
    def __init__(self, num_labels):
        #the final layer (classifier of num_labels) that performs the classification is new and 
        # hasn't been trained yet, hence it is "newly initialized."
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
        

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        return logits

In [29]:
import torch
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

In [28]:
# 4. Set up training loop
num_labels = len(df['label'].unique())
model = RelationshipExtractionModel(num_labels)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Training configuration
num_epochs = 5  # Increased number of epochs
learning_rate = 2.5e-5

In [35]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

RelationshipExtractionModel(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, o

In [31]:
# Optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=3, gamma=0.1)  
# Reduce LR every 3 epochs by a factor of 0.1

# Training Loop

In [37]:
from sklearn.metrics import accuracy_score, f1_score

def evaluate(model, dataloader):
    model.eval()
    total_preds = []
    total_labels = []
    total_val_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            loss = torch.nn.functional.cross_entropy(outputs, labels)
            total_val_loss += loss.item()

            _, preds = torch.max(outputs, dim=1)
            total_preds.extend(preds.cpu().tolist())
            total_labels.extend(labels.cpu().tolist())

    avg_val_loss = total_val_loss / len(dataloader)
    val_accuracy = accuracy_score(total_labels, total_preds)
    val_f1 = f1_score(total_labels, total_preds, average='weighted')  # or 'macro', 'micro', depending on your needs

    return avg_val_loss, val_accuracy, val_f1

In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = torch.nn.functional.cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'train_loss': total_loss / (progress_bar.n + 1)})

    # Validation step after each epoch
    avg_val_loss, val_accuracy, val_f1 = evaluate(model, val_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} completed. Avg Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")

    # Step the learning rate scheduler
    scheduler.step()

Epoch 1/5:   0%|          | 0/4712 [00:00<?, ?it/s]

Epoch 1/5 completed. Avg Val Loss: 0.6370, Val Accuracy: 0.7079, Val F1: 0.7037


Epoch 2/5:   0%|          | 0/4712 [00:00<?, ?it/s]

Epoch 2/5 completed. Avg Val Loss: 0.5864, Val Accuracy: 0.7158, Val F1: 0.6995


Epoch 3/5:   0%|          | 0/4712 [00:00<?, ?it/s]

Epoch 3/5 completed. Avg Val Loss: 0.6137, Val Accuracy: 0.7155, Val F1: 0.6963


Epoch 4/5:   0%|          | 0/4712 [00:00<?, ?it/s]

Epoch 4/5 completed. Avg Val Loss: 0.5908, Val Accuracy: 0.7146, Val F1: 0.6942


Epoch 5/5:   0%|          | 0/4712 [00:00<?, ?it/s]

In [None]:
import torch

# Save the model's state dict
model_save_path = "model.pth"
torch.save(model.state_dict(), model_save_path)

print(f"Model saved to {model_save_path}")

## Test

In [None]:
from transformers import BartTokenizer, AutoModelForSeq2SeqLM

# Initialize the tokenizer and model
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large')
model.to(device)  # Make sure the model is on the correct device

In [None]:
def predict_relationship(sentence):
    # Tokenize the input sentence
    encoded_sentence = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Move tensors to the appropriate device
    input_ids = encoded_sentence['input_ids'].to(device)
    attention_mask = encoded_sentence['attention_mask'].to(device)

    # Generate output using the model (generating a text output that could describe the relationship)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,  # Adjust max_length based on expected output length
            num_beams=4,    # Beam search for better results, adjust as needed
            early_stopping=True
        )

    # Decode the generated tokens to get the predicted relationship
    predicted_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return predicted_sentence



In [None]:
# Example usage:
sentence = "IBM is an American multinational technology company headquartered in Armonk"
predicted_relationship = predict_relationship(sentence)
print(f"Predicted relationship: {predicted_relationship}")