In [None]:
import torch
from transformers import BertTokenizer, BertModel

In [None]:
# The tokenizer and the encoder are both built from one checkpoint name so
# the vocabulary always matches the pre-trained weights.
model_name = 'bert-base-uncased'
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

In [None]:
def embed_text(input_text, tokenizer, model, max_length=20, add_special_tokens=False):
    """Return the model's last-layer hidden state for each input position.

    The text is split into word pieces, truncated or right-padded to exactly
    ``max_length`` positions, and passed through ``model`` once with gradient
    tracking disabled.

    Args:
        input_text: Raw string to embed.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
            ``pad_token_id`` is used for padding when available, and
            ``cls_token_id`` / ``sep_token_id`` are read only when
            ``add_special_tokens`` is true.
        model: Callable accepting ``(input_ids, attention_mask=...)`` whose
            result exposes ``last_hidden_state``.
        max_length: Number of positions fed to the model. Longer inputs are
            silently truncated, shorter ones are padded. The default of 20 is
            a notebook convenience, not a model limit.
        add_special_tokens: When true, wrap the word pieces in the tokenizer's
            CLS/SEP ids before padding. Every word then sits one row later in
            the result. Defaults to False so existing row indices stay valid.

    Returns:
        ``outputs.last_hidden_state`` with the leading batch axis squeezed
        out: one row per input position, padding positions included.

    Raises:
        ValueError: If ``max_length`` leaves no room for any position (or for
            the two special tokens when they are requested).
    """
    min_length = 2 if add_special_tokens else 1
    if max_length < min_length:
        raise ValueError(
            f"max_length must be at least {min_length}, got {max_length}"
        )

    # Tokenization
    tokens = tokenizer.tokenize(input_text)
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))

    # Truncation (reserving two slots for CLS/SEP when they are requested)
    if add_special_tokens:
        token_ids = (
            [tokenizer.cls_token_id]
            + token_ids[:max_length - 2]
            + [tokenizer.sep_token_id]
        )
    else:
        token_ids = token_ids[:max_length]

    # Prefer the tokenizer's own padding id; fall back to 0 when it has none
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # Right-pad to max_length; the mask is 1 on real tokens and 0 on padding
    n_pad = max_length - len(token_ids)
    padded_token_ids = token_ids + [pad_id] * n_pad
    mask_values = [1] * len(token_ids) + [0] * n_pad

    # Convert to PyTorch tensors with a batch axis of size one
    input_ids = torch.tensor([padded_token_ids])
    attention_mask = torch.tensor([mask_values])

    # Model inference without building an autograd graph
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    return outputs.last_hidden_state.squeeze(0)

In [None]:
def context_words(input_text, tokenizer, window_size, target_word="walk"):
    """Return the tokens surrounding the first occurrence of a target token.

    Args:
        input_text: Raw string to tokenize.
        tokenizer: Object providing ``tokenize(text)`` that returns a list of
            token strings.
        window_size: Number of tokens to keep on each side of the target.
            The window is clipped at the start and end of the sequence.
        target_word: Token to look for, compared against the tokenizer's
            output exactly. Defaults to ``"walk"``.

    Returns:
        A list holding the target token together with up to ``window_size``
        neighbours on each side, in their original order.

    Raises:
        ValueError: If ``window_size`` is negative or ``target_word`` does
            not appear among the tokens.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    # Tokenization
    tokens = tokenizer.tokenize(input_text)

    # Find the position of the target word (first occurrence only)
    try:
        word_index = tokens.index(target_word)
    except ValueError:
        raise ValueError(
            f"target word {target_word!r} not found in tokens {tokens!r}"
        ) from None

    # Clip the window to the sequence; the end bound is exclusive
    start_pos = max(0, word_index - window_size)
    end_pos = min(len(tokens), word_index + window_size + 1)

    return tokens[start_pos:end_pos]


In [None]:
# Sentences whose word embeddings are compared in the cells below
input_text1 = "I love to go for a walk in the park. Parks are made for walking."
input_text2 = "parks are made for walking"

# Embed both sentences, in order, with the shared tokenizer/model pair
contextual_embeddings1, contextual_embeddings2 = (
    embed_text(sentence, tokenizer, model)
    for sentence in (input_text1, input_text2)
)


In [None]:
# Locate the words to compare by searching the word-piece sequence instead of
# hard-coding positions. The embeddings were produced without [CLS]/[SEP], so
# a position in the tokenizer output is also the row of the embedding matrix.
word_index1 = tokenizer.tokenize(input_text1).index("walk")

# Get the contextualized representation for the word
word_embedding1 = contextual_embeddings1[word_index1]

# NOTE(review): the second sentence is compared on "walking" (its last word
# piece); it contains "parks" but no stand-alone "park" token - confirm that
# "walking" is the intended word.
word_index2 = tokenizer.tokenize(input_text2).index("walking")
word_embedding2 = contextual_embeddings2[word_index2]

# Print the contextualized representation
# print("Contextualized representation of the word:", word_embedding1)

In [None]:
from scipy.spatial.distance import cosine
# Move both word vectors to NumPy for SciPy
rep_1_np, rep_2_np = word_embedding1.numpy(), word_embedding2.numpy()

# `cosine` comes from scipy.spatial.distance, so subtract it from 1 to turn
# the distance into a similarity
similarity = 1 - cosine(rep_1_np, rep_2_np)

# Report the score
print("Cosine similarity:", similarity)

In [None]:
# Context window around the target word in the first sentence
words = context_words(input_text1, tokenizer, window_size=2)
print(words)

In [None]:
import spacy
from transformers import BertTokenizer, BertModel

# Build the BERT tokenizer and encoder from the 'bert-base-uncased' checkpoint.
# NOTE(review): this repeats the load done near the top of the notebook with
# the same checkpoint name - confirm whether the second load is still needed.
tokenizer, model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    BertModel.from_pretrained('bert-base-uncased'),
)

In [None]:
# Load the spaCy model for dependency parsing and named entity recognition
# Sentence under analysis, and how many tokens to keep on each side of a
# target word when a context window is extracted
text = "Italian government officials must maintain integrity in their actions."
window_size = 3

# spaCy pipeline used below for entities and the dependency parse
nlp = spacy.load('en_core_web_sm')

In [None]:
# Run only the pipeline's tokenizer over the sentence and keep the token strings
tokens = [piece.text for piece in nlp.tokenizer(text)]

In [None]:
# Tokenize the text with BERT
#tokens = tokenizer.tokenize(text)
# Convert tokens to BERT input format
#input_ids = tokenizer.convert_tokens_to_ids(tokens)
# Add special tokens [CLS] and [SEP]
#input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]

In [None]:
# Obtain contextualized representations using BERT
#with torch.no_grad():
#    inputs = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
#    outputs = model(inputs)
#    contextualized_reps = outputs.last_hidden_state.squeeze(0)  # Remove batch dimension
    

In [None]:
# Run the spaCy pipeline, then flatten the named entities and the dependency
# arcs into plain tuples: (text, label) and (token, relation, head)
doc = nlp(text)
entities = [(ent.text, ent.label_) for ent in doc.ents]
dependency_parse = [(tok.text, tok.dep_, tok.head.text) for tok in doc]

# Gather up to `window_size` tokens on each side of the first "integrity".
# The word itself is left out, and the list stays empty if it never occurs.
integrity_context = []
for i, token in enumerate(tokens):
    if token != 'integrity':
        continue
    left = tokens[max(0, i - window_size): i]
    right = tokens[i+1: i+window_size+1]
    integrity_context = left + right
    break

print("Integrity context:", integrity_context)
print("Named Entities:", entities)
print("Dependency Parse:", dependency_parse)
#print("Contextualized representations:", contextualized_reps)

In [None]:
import spacy

# Load the English language model
nlp = spacy.load('en_core_web_sm')

# Example text
text = "government officials must maintain integrity in their actions."

# Process the text using the language model
doc = nlp(text)

# Find the grammatical subject that mentions "government". In a phrase such
# as "government officials" the word is expected to be parsed as a modifier
# of the subject head rather than as the nsubj token itself, so the whole
# subtree of each nsubj token is searched (a subtree includes the token).
subject_token = None
for token in doc:
    if token.dep_ == "nsubj" and any(
        member.text.lower() == "government" for member in token.subtree
    ):
        subject_token = token
        break

# A direct object attaches to the verb, not to the subject, so look for a
# "dobj" among the children of the subject's head.
linked_word = None
if subject_token is not None:
    for child in subject_token.head.children:
        if child.dep_ == "dobj":
            linked_word = child
            break

if linked_word is not None:
    print("Linked Word:", linked_word.text)
    print("Dependency Relation:", linked_word.dep_)
else:
    # Say so explicitly instead of printing nothing when the search fails
    print("No direct object linked to a 'government' subject was found")


In [None]:
# Word whose direct dependents are counted
desired_context = "government"

# Number of tokens attached to `desired_context` by anything other than "conj"
count = 0

for token in doc:
    # The context word itself is neither printed nor counted
    if token.text.lower() == desired_context:
        continue

    print(f'{token} {token.head}')

    # Count the token when its head is the context word and the arc is not a conjunction
    head_is_context = token.head.text.lower() == desired_context
    if head_is_context and token.dep_ != "conj":
        count += 1

print("Count of words with context '{}': {}".format(desired_context, count))

In [None]:


# Example sentence
text = "The government officials maintain integrity in their actions."

# Target word
target_word = "integrity"

# Process the text using the language model
doc = nlp(text)

# Collect every subject token, every token whose head is a subject, and the
# children of both. Each token is recorded once, keyed by its position,
# because the same token can be reached both as a match and as a child.
subject_tokens = []
seen_positions = set()
for token in doc:
    if token.dep_ == "nsubj" or token.head.dep_ == "nsubj":
        for candidate in [token, *token.children]:
            if candidate.i not in seen_positions:
                seen_positions.add(candidate.i)
                subject_tokens.append(candidate)

print(subject_tokens)

# Position of the first occurrence of the target word in the document.
# Distances must be measured between token positions (`Token.i`); the
# vocabulary entry's `orth` is an ID for the string, not a position.
target_index = next(
    (tok.i for tok in doc if tok.text.lower() == target_word.lower()),
    None,
)

# Find the subject token closest to the target word (earliest one wins ties)
closest_subject_token = None
min_distance = float('inf')

if target_index is not None:
    for subject_token in subject_tokens:
        distance = abs(subject_token.i - target_index)
        if distance < min_distance:
            closest_subject_token = subject_token
            min_distance = distance

if closest_subject_token is None:
    # Either the sentence has no subject tokens or the target word is absent
    print("No subject token found for '{}'".format(target_word))
else:
    print("Closest subject token to '{}' is '{}'".format(target_word, closest_subject_token.text))

