In [1]:
import json
import os
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the preprocessed train/test examples from JSON.
# The encoding is pinned to UTF-8 (the same way the CoNLL-U file is opened
# elsewhere in this notebook) so the result does not depend on the platform's
# default text encoding.
print("Loading data...")
with open('data/processed/train/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('data/processed/test/test.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

print(f"Train examples: {len(train_data)}")
print(f"Test examples: {len(test_data)}")

Loading data...
Train examples: 8000
Test examples: 2717
Train examples: 8000
Test examples: 2717


In [4]:
# Create simplified DataFrames for modeling
def prepare_dataframe(data):
    """Flatten a list of JSON examples into a modeling DataFrame.

    Each example must provide the keys 'id', 'text', 'relation' (a dict with
    'type' and, for non-'Other' relations, 'direction'), 'entities' (a list of
    dicts with 'text', 'start_token' and 'end_token') and 'tokens'. The key
    'comment' is optional.

    Returns a DataFrame with one row per example. Fields of a missing first or
    second entity are filled with None.
    """
    def entity_value(entities, position, key):
        # Field `key` of the entity at `position`, or None when the example
        # has fewer entities than that.
        return entities[position][key] if len(entities) > position else None

    records = []
    for example in data:
        relation = example['relation']
        # 'Other' carries no direction; every other type is suffixed with it.
        if relation['type'] == 'Other':
            label = 'Other'
        else:
            label = f"{relation['type']}{relation['direction']}"

        record = {
            'id': example['id'],
            'text': example['text'],
            'relation_label': label,
            'relation_type': relation['type'],
        }

        entities = example['entities']
        # Column order matters: text of both entities first, then the spans.
        record['entity1_text'] = entity_value(entities, 0, 'text')
        record['entity2_text'] = entity_value(entities, 1, 'text')
        record['entity1_start'] = entity_value(entities, 0, 'start_token')
        record['entity1_end'] = entity_value(entities, 0, 'end_token')
        record['entity2_start'] = entity_value(entities, 1, 'start_token')
        record['entity2_end'] = entity_value(entities, 1, 'end_token')

        tokens = example['tokens']
        record['num_tokens'] = len(tokens)
        record['tokens'] = tokens  # full token info kept for advanced features
        record['entities'] = entities
        record['comment'] = example.get('comment')

        records.append(record)

    return pd.DataFrame(records)

# Flatten both JSON splits into DataFrames and report their dimensions.
train_df = prepare_dataframe(train_data)
test_df = prepare_dataframe(test_data)

train_shape, test_shape = train_df.shape, test_df.shape
column_names = list(train_df.columns)

print(f"\nTrain DataFrame shape: {train_shape}")
print(f"Test DataFrame shape: {test_shape}")
print(f"\nColumns: {column_names}")

# Last expression of the cell: rich preview of the first rows.
train_df.head(3)


Train DataFrame shape: (8000, 14)
Test DataFrame shape: (2717, 14)

Columns: ['id', 'text', 'relation_label', 'relation_type', 'entity1_text', 'entity2_text', 'entity1_start', 'entity1_end', 'entity2_start', 'entity2_end', 'num_tokens', 'tokens', 'entities', 'comment']


Unnamed: 0,id,text,relation_label,relation_type,entity1_text,entity2_text,entity1_start,entity1_end,entity2_start,entity2_end,num_tokens,tokens,entities,comment
0,1,The system as described above has its greatest...,"Component-Whole(e2,e1)",Component-Whole,configuration,elements,12,13,15,16,17,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'pos...","[{'entity_id': 'e1', 'text': 'configuration', ...","Not a collection: there is structure here, org..."
1,2,The child was carefully wrapped and bound into...,Other,Other,child,cradle,1,2,9,10,16,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'pos...","[{'entity_id': 'e1', 'text': 'child', 'token_i...",
2,3,The author of a keygen uses a disassembler to ...,"Instrument-Agency(e2,e1)",Instrument-Agency,author,disassembler,1,2,7,8,16,"[{'id': 0, 'text': 'The', 'lemma': 'the', 'pos...","[{'entity_id': 'e1', 'text': 'author', 'token_...",


In [5]:
# Inspect how the training examples are spread over relation types
# (undirected) and relation labels (with direction).
type_counts = train_df['relation_type'].value_counts()
label_column = train_df['relation_label']

print("Relation Type Distribution (Train):")
print(type_counts)
print(f"\nTotal unique relation labels (with direction): {label_column.nunique()}")
print("\nRelation label distribution (top 10):")
print(label_column.value_counts().head(10))

Relation Type Distribution (Train):
relation_type
Other                 1410
Cause-Effect          1003
Component-Whole        941
Entity-Destination     845
Product-Producer       717
Entity-Origin          716
Member-Collection      690
Message-Topic          634
Content-Container      540
Instrument-Agency      504
Name: count, dtype: int64

Total unique relation labels (with direction): 19

Relation label distribution (top 10):
relation_label
Other                        1410
Entity-Destination(e1,e2)     844
Cause-Effect(e2,e1)           659
Member-Collection(e2,e1)      612
Entity-Origin(e1,e2)          568
Message-Topic(e1,e2)          490
Component-Whole(e2,e1)        471
Component-Whole(e1,e2)        470
Instrument-Agency(e2,e1)      407
Product-Producer(e2,e1)       394
Name: count, dtype: int64


## 1. Label Encoding
Encode relation labels into integers for classification models

In [6]:
# Encode labels (use relation_label for full directionality, or relation_type for undirected)
label_encoder = LabelEncoder()

# Fit on the training labels only; the test labels are merely transformed,
# so a label that never occurs in training raises instead of getting a new id.
train_df['label_id'] = label_encoder.fit_transform(train_df['relation_label'])
test_df['label_id'] = label_encoder.transform(test_df['relation_label'])

# Build both lookup directions from the encoder's class order.
label2id = {label: idx for idx, label in enumerate(label_encoder.classes_)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"Number of classes: {len(label2id)}")
print(f"\nLabel mappings (first 10):")
for label, idx in list(label2id.items())[:10]:
    print(f"  {idx:2d}: {label}")

# Save label mappings.
# This is the first write into the model_split directory, so create it if
# needed; otherwise a fresh checkout fails here with FileNotFoundError.
label_mappings_path = 'data/processed/train/model_split/label_mappings.pkl'
os.makedirs(os.path.dirname(label_mappings_path), exist_ok=True)
with open(label_mappings_path, 'wb') as f:
    pickle.dump({'label2id': label2id, 'id2label': id2label}, f)
print("\n Label mappings saved to data/processed/train/model_split/label_mappings.pkl")

Number of classes: 19

Label mappings (first 10):
   0: Cause-Effect(e1,e2)
   1: Cause-Effect(e2,e1)
   2: Component-Whole(e1,e2)
   3: Component-Whole(e2,e1)
   4: Content-Container(e1,e2)
   5: Content-Container(e2,e1)
   6: Entity-Destination(e1,e2)
   7: Entity-Destination(e2,e1)
   8: Entity-Origin(e1,e2)
   9: Entity-Origin(e2,e1)

 Label mappings saved to data/processed/train/model_split/label_mappings.pkl


## 2. Entity Marking Strategies
Different ways to mark entities in text for models

In [None]:
def _find_entity_span(text, entity_text, reserved=None):
    """Return the (start, end) span of the first usable match of entity_text in text.

    Whole-word matches are preferred; a plain substring match is the fallback.
    Matches overlapping the `reserved` span are skipped. Returns None when the
    entity is missing (None, NaN, empty string) or cannot be found.
    """
    if not isinstance(entity_text, str) or not entity_text:
        return None

    def is_free(start, end):
        return reserved is None or end <= reserved[0] or start >= reserved[1]

    # Lookarounds instead of \b so entities that start or end with a
    # non-word character are still matched correctly.
    whole_word = re.compile(r'(?<!\w)' + re.escape(entity_text) + r'(?!\w)')
    for match in whole_word.finditer(text):
        if is_free(match.start(), match.end()):
            return match.span()

    # Fallback: plain substring search, as the original implementation did.
    start = text.find(entity_text)
    while start != -1:
        end = start + len(entity_text)
        if is_free(start, end):
            return (start, end)
        start = text.find(entity_text, start + 1)
    return None


def add_entity_markers(text, entity1_text, entity2_text, strategy='brackets'):
    """
    Add entity markers to text using different strategies

    Strategies:
    - 'brackets': [E1] text [/E1] and [E2] text [/E2]
    - 'tags': <e1> text </e1> and <e2> text </e2>

    Any other strategy returns the text unchanged.

    Both entities are located in the ORIGINAL text before any marker is
    inserted, so the second entity can never be matched inside the first
    entity or inside its markup (e.g. when both share the same surface form).
    An occurrence that is a whole word is preferred over one embedded in a
    longer word ("child" vs. "children"). An entity that is None/empty, or
    that cannot be found without overlapping the first entity, is left
    unmarked instead of raising.

    Returns the marked-up string.
    """
    marker_styles = {
        'brackets': (('[E1]', '[/E1]'), ('[E2]', '[/E2]')),
        'tags': (('<e1>', '</e1>'), ('<e2>', '</e2>')),
    }
    if strategy not in marker_styles:
        return text
    first_markers, second_markers = marker_styles[strategy]

    first_span = _find_entity_span(text, entity1_text)
    second_span = _find_entity_span(text, entity2_text, reserved=first_span)

    insertions = []
    if first_span is not None:
        insertions.append((first_span, first_markers))
    if second_span is not None:
        insertions.append((second_span, second_markers))

    # Insert right-to-left so earlier offsets stay valid after each insertion.
    marked = text
    for (start, end), (open_tag, close_tag) in sorted(
        insertions, key=lambda item: item[0][0], reverse=True
    ):
        marked = marked[:start] + open_tag + marked[start:end] + close_tag + marked[end:]

    return marked

# Add a bracket-marked version of every sentence to both DataFrames.
def _mark_row(row):
    """Bracket-mark the two entities of a single DataFrame row."""
    return add_entity_markers(row['text'], row['entity1_text'], row['entity2_text'], 'brackets')

train_df['text_with_markers'] = train_df.apply(_mark_row, axis=1)
test_df['text_with_markers'] = test_df.apply(_mark_row, axis=1)

# Print the first three training sentences before and after marking.
print("Examples with entity markers:\n")
for position in range(3):
    example = train_df.iloc[position]
    print(f"Original: {example['text']}")
    print(f"Marked:   {example['text_with_markers']}")
    print(f"Relation: {example['relation_label']}\n")

Examples with entity markers:

Original: The system as described above has its greatest application in an arrayed configuration of antenna elements.
Marked:   The system as described above has its greatest application in an arrayed [E1]configuration[/E1] of antenna [E2]elements[/E2].
Relation: Component-Whole(e2,e1)

Original: The child was carefully wrapped and bound into the cradle by means of a cord.
Marked:   The [E1]child[/E1] was carefully wrapped and bound into the [E2]cradle[/E2] by means of a cord.
Relation: Other

Original: The author of a keygen uses a disassembler to look at the raw assembly code.
Marked:   The [E1]author[/E1] of a keygen uses a [E2]disassembler[/E2] to look at the raw assembly code.
Relation: Instrument-Agency(e2,e1)



## 3. Train/Validation Split
Create validation set from training data

In [8]:
# Hold out 15% of the training data for validation.
# The split is stratified on the undirected relation_type so both parts keep
# the same relation-type proportions; the fixed seed makes it reproducible.
split_options = dict(
    test_size=0.15,
    random_state=42,
    stratify=train_df['relation_type'],
)
train_split, val_split = train_test_split(train_df, **split_options)

for split_title, split_frame in (
    ("Training set", train_split),
    ("Validation set", val_split),
    ("Test set", test_df),
):
    print(f"{split_title}: {len(split_frame)} examples")

# Compare the most frequent relation types of the two parts.
print("\nRelation distribution in splits:")
for heading, split_frame in (("\nTrain:", train_split), ("\nValidation:", val_split)):
    print(heading)
    print(split_frame['relation_type'].value_counts(normalize=True).head())

Training set: 6800 examples
Validation set: 1200 examples
Test set: 2717 examples

Relation distribution in splits:

Train:
relation_type
Other                 0.176324
Cause-Effect          0.125441
Component-Whole       0.117647
Entity-Destination    0.105588
Entity-Origin         0.089559
Name: proportion, dtype: float64

Validation:
relation_type
Other                 0.175833
Cause-Effect          0.125000
Component-Whole       0.117500
Entity-Destination    0.105833
Product-Producer      0.090000
Name: proportion, dtype: float64


## 4. Save Preprocessed Data
Save cleaned DataFrames for easy loading in model training scripts

In [9]:
# Pickles keep every column (including the nested token/entity lists) with
# its original Python types, so they are the fast, lossless format.
pickle_targets = [
    (train_split, 'data/processed/train/model_split/train_split.pkl'),
    (val_split, 'data/processed/train/model_split/val_split.pkl'),
    (test_df, 'data/processed/test/test_df.pkl'),
]
for frame_to_save, pickle_path in pickle_targets:
    frame_to_save.to_pickle(pickle_path)

# The CSVs carry only the flat columns (no tokens / entities lists).
simple_cols = ['id', 'text', 'text_with_markers', 'relation_label', 'relation_type',
               'entity1_text', 'entity2_text', 'label_id', 'num_tokens']

csv_targets = [
    (train_split, 'data/processed/train/model_split/train_split.csv'),
    (val_split, 'data/processed/train/model_split/val_split.csv'),
    (test_df, 'data/processed/test/test_df.csv'),
]
for frame_to_save, csv_path in csv_targets:
    frame_to_save[simple_cols].to_csv(csv_path, index=False)

saved_report = [
    "Preprocessed data saved:",
    "- data/processed/train/model_split/train_split.pkl & .csv",
    "- data/processed/train/model_split/val_split.pkl & .csv",
    "- data/processed/train/model_split/label_mappings.pkl",
    "- data/processed/test/test_df.pkl & .csv",
]
print("\n".join(saved_report))

Preprocessed data saved:
- data/processed/train/model_split/train_split.pkl & .csv
- data/processed/train/model_split/val_split.pkl & .csv
- data/processed/train/model_split/label_mappings.pkl
- data/processed/test/test_df.pkl & .csv


## 5. Quick Data Summary

In [10]:
# Final overview of the prepared splits and the columns available downstream.
banner = "=" * 80
num_classes = len(label2id)
mean_tokens = train_split['num_tokens'].mean()

summary_lines = [
    banner,
    "DATA READY FOR MODELING",
    banner,
    f"\n Train: {len(train_split)} examples",
    f" Validation: {len(val_split)} examples",
    f" Test: {len(test_df)} examples",
    f" Number of classes: {num_classes}",
    f" Average tokens per sentence: {mean_tokens:.1f}",
    "\n Features available:",
    " - text: original sentence",
    " - text_with_markers: sentence with entity markers [E1]...[/E1]",
    " - tokens: full token info (lemma, POS, feats, deps, etc.)",
    " - entities: entity spans with positions",
    f" - label_id: encoded relation label (0-{num_classes - 1})",
    "\n" + banner,
]
print("\n".join(summary_lines))

DATA READY FOR MODELING

 Train: 6800 examples
 Validation: 1200 examples
 Test: 2717 examples
 Number of classes: 19
 Average tokens per sentence: 19.3

 Features available:
 - text: original sentence
 - text_with_markers: sentence with entity markers [E1]...[/E1]
 - tokens: full token info (lemma, POS, feats, deps, etc.)
 - entities: entity spans with positions
 - label_id: encoded relation label (0-18)

 Ready for:
 - Transformer models (BERT, RoBERTa): use 'text_with_markers'
 - Traditional ML (SVM, RF): extract features from 'tokens'
 - Neural networks (LSTM, CNN): use tokenized 'text'



# Splitting the CoNLL-U File Using the Same Splits as the Training Data

In [11]:
# Parse the full training CoNLL-U file; the sentences are later assigned to
# the train/validation parts by their position in the file.

import conllu

print("Reading train.conllu with conllu library...")
with open('data/processed/train/train.conllu', 'r', encoding='utf-8') as conllu_file:
    raw_conllu = conllu_file.read()
sentences = conllu.parse(raw_conllu)

# The two counts should agree for the position-based split to make sense.
print(f"Total sentences in CoNLL-U: {len(sentences)}")
print(f"Total in train_df: {len(train_df)}")

Reading train.conllu with conllu library...
Total sentences in CoNLL-U: 8000
Total in train_df: 8000


In [12]:
# Containers for the CoNLL-U sentences of each part.
# Sentences are matched to DataFrame rows by position: sentence i of the
# CoNLL-U file is taken to correspond to row i of train_df.
train_sentences, val_sentences = [], []

# Row labels of the original train_df that ended up in each part.
train_indices = {*train_split.index.tolist()}
val_indices = {*val_split.index.tolist()}

print(f"\nTrain indices: {len(train_indices)}")
print(f"Val indices: {len(val_indices)}")


Train indices: 6800
Val indices: 1200


In [13]:
# Split the CoNLL-U sentences according to the train/validation DataFrames.
#
# Both lists are rebuilt from scratch here, so re-running this cell cannot
# append the same sentences a second time.
#
# Each list follows the row order of its DataFrame: train_test_split shuffles
# the rows, and iterating the CoNLL-U file in file order would leave sentence
# i of the saved file unrelated to row i of the saved DataFrame. Sentence
# position == original train_df index label is the alignment this notebook
# relies on.
n_sentences = len(sentences)
train_sentences = [sentences[pos] for pos in train_split.index if 0 <= pos < n_sentences]
val_sentences = [sentences[pos] for pos in val_split.index if 0 <= pos < n_sentences]

# Report CoNLL-U sentences that belong to neither part.
assigned_positions = set(train_split.index.tolist()) | set(val_split.index.tolist())
for idx in range(n_sentences):
    if idx not in assigned_positions:
        print(f"Warning: Index {idx} not found in train or val splits")

print(f"\nTrain CoNLL-U sentences: {len(train_sentences)}")
print(f"Val CoNLL-U sentences: {len(val_sentences)}")


Train CoNLL-U sentences: 6800
Val CoNLL-U sentences: 1200


In [14]:
# Save split CoNLL-U files
train_conllu_path = 'data/processed/train/model_split/train_split.conllu'
val_conllu_path = 'data/processed/train/model_split/val_split.conllu'


def _write_conllu(path, sentence_list):
    """Write the sentences to `path`, each followed by exactly one blank line.

    The conllu README shows serialize() output already ending in a blank line,
    so joining the serialized sentences with an additional '\n\n' leaves
    several empty lines between sentences. Trailing newlines are normalised
    here so the result is the same whatever serialize() appends.
    """
    with open(path, 'w', encoding='utf-8') as f:
        for sent in sentence_list:
            f.write(sent.serialize().rstrip('\n') + '\n\n')


_write_conllu(train_conllu_path, train_sentences)
_write_conllu(val_conllu_path, val_sentences)

print(f"\n CoNLL-U files saved:")
print(f"  - {train_conllu_path}")
print(f"  - {val_conllu_path}")

# Verify the splits match (size check only: one sentence per DataFrame row)
print(f"\n Verification:")
print(f"  DataFrame train: {len(train_split)} | CoNLL-U train: {len(train_sentences)}")
print(f"  DataFrame val: {len(val_split)} | CoNLL-U val: {len(val_sentences)}")
assert len(train_split) == len(train_sentences), "Train split mismatch!"
assert len(val_split) == len(val_sentences), "Val split mismatch!"
print("   All splits match!")


 CoNLL-U files saved:
  - data/processed/train/model_split/train_split.conllu
  - data/processed/train/model_split/val_split.conllu

 Verification:
  DataFrame train: 6800 | CoNLL-U train: 6800
  DataFrame val: 1200 | CoNLL-U val: 1200
   All splits match!
