# Relation Extraction System using Sequential Transfer Learning

This notebook processes text data to derive the relationships between the entities in each input text. Instead of relying only on existing relation extraction methods, I incorporate a transfer learning model that has learned from a large pool of unstructured data and is then fine-tuned on labeled data for the target task. <br>
This improves accuracy and overall performance compared with those methods. This work is based on the paper "Neural Sequential Transfer Learning for Relation Extraction" by Christoph Benedikt Alt, M.Sc. (ORCID: 0000-0002-0500-8250). Credit for the underlying ideas goes to the author of the paper.

### Import Libraries

In [8]:
import pandas as pd
import numpy as np
import spacy
import os

from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

### Load a spaCy Model for Entity Recognition & Tokenization

In [11]:
# Shared spaCy pipeline: the cells below call it to tokenize text and to read named entities (doc.ents).
nlp = spacy.load("en_core_web_sm")

### Load the dataset

In [14]:
# Function to read and preprocess text files
def load_and_preprocess_files(folder_path, encoding="utf-8", pipeline=None):
    """Read every ``.txt`` file in a folder and return its tokens joined by spaces.

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory to scan. Only its direct entries are considered (no recursion).
    encoding : str, default "utf-8"
        Encoding used to decode the files. Passing it explicitly keeps the
        result independent of the platform's locale default.
    pipeline : callable, optional
        Callable that maps a string to an iterable of objects exposing a
        ``.text`` attribute. When omitted, the notebook-level spaCy pipeline
        ``nlp`` is used.

    Returns
    -------
    list of str
        One string per ``.txt`` file, in filename-sorted order; each string is
        the file's token texts joined with single spaces.
    """
    if pipeline is None:
        pipeline = nlp

    texts = []
    # os.listdir yields names in arbitrary order; sorting makes the row order
    # (and therefore any seeded split built on it) reproducible across runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding=encoding) as file:
            content = file.read()
        # The file is already closed here; tokenization does not need the handle.
        doc = pipeline(content)
        texts.append(' '.join(token.text for token in doc))

    return texts

In [16]:
# Folder holding the .txt documents to load.
# NOTE(review): absolute, machine-specific path — adjust to your local copy before running.
folder_path = 'C:/Personal/ML/Relation Extraction Project/data/textfiles'

In [18]:
# Tokenize every .txt file in the folder; yields one space-joined token string per file.
texts = load_and_preprocess_files(folder_path)

In [20]:
# One row per loaded file, with the tokenized content in a single 'text' column.
data = pd.DataFrame({'text': texts})

In [22]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,text
0,\n This paper introduces a system for catego...
1,\n This paper presents a new approach to sta...
2,\n This paper describes a domain independent...
3,"\n In this paper , we describe the pronomi..."
4,\n In our current research into the design of ...


### Initialize a BERT Tokenizer

In [25]:
# Pretrained tokenizer for "bert-base-uncased"; preprocess_sentence uses it to turn text into vocabulary ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

### Pre-processing

In [28]:
def preprocess_sentence(sentence):
    """Tokenize a text with spaCy and BERT and describe its named entities.

    Parameters
    ----------
    sentence : str
        Raw input text (it may span several sentences).

    Returns
    -------
    dict
        ``tokens``
            Token texts from the spaCy pipeline ``nlp``.
        ``token_ids``
            Vocabulary ids produced by the BERT ``tokenizer`` for the same raw
            text. NOTE(review): these come from a different tokenizer than
            ``tokens``, so the two sequences are not aligned index-for-index —
            confirm what downstream code expects.
        ``entities``
            ``(text, start, end, label)`` tuples taken from ``doc.ents``;
            ``start``/``end`` are spaCy token indices.
        ``position_embeddings``
            One row per spaCy token; row ``i`` holds ``i - start`` for every
            entity, in entity order. Rows are empty when no entity was found.
        ``entity_labels``
            Distinct entity labels, sorted alphabetically.
    """
    # Process the sentence with spaCy
    doc = nlp(sentence)

    # Get tokenized sentence and entities
    tokens = [token.text for token in doc]
    entities = [(ent.text, ent.start, ent.end, ent.label_) for ent in doc.ents]

    # Convert tokens to IDs
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))

    # Offsets of every token relative to the start of each entity
    entity_starts = [start for _, start, _, _ in entities]
    pos_embedding = [
        [i - start for start in entity_starts]
        for i in range(len(tokens))
    ]

    # Sorted so the output does not depend on set iteration order, which can
    # differ between interpreter runs for strings.
    entity_labels = sorted({label for _, _, _, label in entities})

    # Return results
    return {
        "tokens": tokens,
        "token_ids": token_ids,
        "entities": entities,
        "position_embeddings": pos_embedding,
        "entity_labels": entity_labels
    }

In [30]:
# Example: run the preprocessing on a short multi-sentence paragraph and inspect the returned dictionary.
sentence = """Apple Inc. is a technology company headquartered in Cupertino, California. 
Founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in 1976, 
it has become one of the leading innovators in consumer electronics. 
Apple is known for its iPhone, iPad, and Mac products. 
The company is also involved in the development of software and services such as iCloud and Apple Music. 
In recent years, Apple has faced scrutiny over issues related to data privacy and labor practices in its supply chain.
"""
preprocess_sentence(sentence)

{'tokens': ['Apple',
  'Inc.',
  'is',
  'a',
  'technology',
  'company',
  'headquartered',
  'in',
  'Cupertino',
  ',',
  'California',
  '.',
  '\n',
  'Founded',
  'by',
  'Steve',
  'Jobs',
  ',',
  'Steve',
  'Wozniak',
  ',',
  'and',
  'Ronald',
  'Wayne',
  'in',
  '1976',
  ',',
  '\n',
  'it',
  'has',
  'become',
  'one',
  'of',
  'the',
  'leading',
  'innovators',
  'in',
  'consumer',
  'electronics',
  '.',
  '\n',
  'Apple',
  'is',
  'known',
  'for',
  'its',
  'iPhone',
  ',',
  'iPad',
  ',',
  'and',
  'Mac',
  'products',
  '.',
  '\n',
  'The',
  'company',
  'is',
  'also',
  'involved',
  'in',
  'the',
  'development',
  'of',
  'software',
  'and',
  'services',
  'such',
  'as',
  'iCloud',
  'and',
  'Apple',
  'Music',
  '.',
  '\n',
  'In',
  'recent',
  'years',
  ',',
  'Apple',
  'has',
  'faced',
  'scrutiny',
  'over',
  'issues',
  'related',
  'to',
  'data',
  'privacy',
  'and',
  'labor',
  'practices',
  'in',
  'its',
  'supply',
  'chain'

### Split the data

In [33]:
# Hold out 20% of the documents; the fixed random_state repeats the same split for the same input order.
X_train, X_test = train_test_split(data['text'], test_size=0.2, random_state=42)

### Transform the data

In [36]:
# TF-IDF vectorizer with the library's default settings.
vect = TfidfVectorizer()

In [38]:
# Fit the vectorizer on the training documents only, then vectorize them.
X_train_vect = vect.fit_transform(X_train)

In [40]:
# Vectorize the test documents with the already-fitted vectorizer (transform only, no refit).
X_test_vect = vect.transform(X_test)

In [42]:
# Project the TF-IDF vectors onto two principal components.
# random_state pins the projection if scikit-learn selects a randomized SVD solver
# (it has no effect on the exact solvers), matching the seeded train/test split.
pca = PCA(n_components=2, random_state=42)
# The sparse TF-IDF matrices are densified with .toarray() before being handed to PCA;
# the projection is fitted on the training set only and reused for the test set.
X_train_red = pca.fit_transform(X_train_vect.toarray())
X_test_red = pca.transform(X_test_vect.toarray())

In [48]:
# NOTE(review): notebook convention is to keep all imports in the first code cell — consider moving this one there.
import xml.etree.ElementTree as ET

# Load the XML file
# NOTE(review): absolute, machine-specific path — adjust to your local copy before running.
tree = ET.parse('C:/Personal/ML/Relation Extraction Project/data/raw_data/CVPR_2003_30_abs.txt.xml')
# Root element of the parsed document; extract_data walks it to find the sentences.
root = tree.getroot()

# Function to extract tokens, NER, and dependencies
def extract_data(root):
    """Collect per-sentence tokens, named entities and basic dependencies.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root of a parsed annotation document. Sentences are ``<sentence>``
        elements whose ``<token>`` descendants carry a ``<word>`` child and,
        optionally, an ``<NER>`` child; dependency arcs are ``<dep>`` elements
        under ``<dependencies type="basic-dependencies">``.

    Returns
    -------
    tuple of (list, list, list)
        Three parallel lists with one entry per sentence:

        * tokens -- list of word strings,
        * entities -- list of ``(word, ner_tag)`` for tokens whose tag is
          present and not ``'O'``,
        * dependencies -- list of ``(governor, dependent, relation)``; a
          missing ``<governor>``/``<dependent>`` child yields ``None``.
    """
    tokens = []
    entities = []
    dependencies = []

    for sentence in root.findall(".//sentence"):
        token_elems = sentence.findall(".//token")
        if not token_elems:
            # ".//sentence" matches any element with that tag; one without
            # tokens is not an annotated sentence (presumably e.g. a sentence
            # index inside a coreference mention — confirm against the data).
            continue

        sent_tokens = []
        sent_entities = []

        # Extract tokens and entities
        for token in token_elems:
            word = token.findtext("word")
            if word is None:
                # No <word> child: nothing to record for this token.
                continue
            sent_tokens.append(word)
            # A missing or empty <NER> is treated like 'O' (no entity)
            # instead of raising AttributeError.
            ner = token.findtext("NER") or 'O'
            if ner != 'O':
                sent_entities.append((word, ner))

        # Extract dependencies (basic representation only)
        sent_dependencies = [
            (dep.findtext("governor"), dep.findtext("dependent"), dep.get("type"))
            for dep in sentence.findall(".//dependencies[@type='basic-dependencies']//dep")
        ]

        tokens.append(sent_tokens)
        entities.append(sent_entities)
        dependencies.append(sent_dependencies)

    return tokens, entities, dependencies

# Run the extraction over the parsed XML document
tokens, entities, dependencies = extract_data(root)

# Show what was extracted for the first sentence, one labelled line per kind
samples = (
    ("Sample Tokens:", tokens),
    ("Sample Entities:", entities),
    ("Sample Dependencies:", dependencies),
)
for label, per_sentence in samples:
    print(label, per_sentence[0])


Sample Tokens: ['This', 'paper', 'presents', 'a', 'novel', 'representation', 'for', 'three-dimensional', 'objects', 'in', 'terms', 'of', 'affine-invariant', 'image', 'patches', 'and', 'their', 'spatial', 'relationships', '.']
Sample Entities: []
Sample Dependencies: [('ROOT', 'presents', 'root'), ('paper', 'This', 'det'), ('presents', 'paper', 'nsubj'), ('representation', 'a', 'det'), ('representation', 'novel', 'amod'), ('presents', 'representation', 'dobj'), ('representation', 'for', 'prep'), ('objects', 'three-dimensional', 'amod'), ('for', 'objects', 'pobj'), ('representation', 'in', 'prep'), ('in', 'terms', 'pobj'), ('terms', 'of', 'prep'), ('patches', 'affine-invariant', 'amod'), ('patches', 'image', 'nn'), ('of', 'patches', 'pobj'), ('patches', 'and', 'cc'), ('relationships', 'their', 'poss'), ('relationships', 'spatial', 'amod'), ('patches', 'relationships', 'conj'), ('presents', '.', 'punct')]
