In [39]:
import pandas as pd
import nltk
import ast
import sys
from pathlib import Path
from nltk.metrics import jaccard_distance
from nltk.util import ngrams
from typing import List, Set
from scipy.stats import pearsonr
import numpy as np
from nltk.chunk import RegexpParser
import copy
from math import log

from nltk.corpus import wordnet_ic
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError  # Import WordNetError
import pandas as pd
import stanza


# Fetch the default English Stanza models.
stanza.download('en')
# Shared Stanza pipeline for the whole notebook. tokenize_pretokenized=True because
# the cells below call it with sentences that are already lists of tokens.
nlp_stanza = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse', tokenize_pretokenized=True)
# NLTK resources: WordNet, the Open Multilingual WordNet data and the
# information-content (IC) files read just below.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('wordnet_ic')

# IC table loaded from 'ic-brown.dat'; used later for Lin similarity and IC weighting.
brown_ic = wordnet_ic.ic('ic-brown.dat')

# Tagger model needed by the nltk.pos_tag calls further down
nltk.download('averaged_perceptron_tagger_eng')

# Add the parent of the current working directory to the Python path so that
# the project-local `Preprocessing` package can be imported
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 13.1MB/s]                    
2024-12-10 17:11:28 INFO: Downloaded file to C:\Users\maric\stanza_resources\resources.json
2024-12-10 17:11:28 INFO: Downloading default packages for language: en (English) ...
2024-12-10 17:11:30 INFO: File exists: C:\Users\maric\stanza_resources\en\default.zip
2024-12-10 17:11:35 INFO: Finished downloading models and saved to C:\Users\maric\stanza_resources
2024-12-10 17:11:35 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 26.9MB/s]                    
2024-12-10 17:11:35 INFO: Downloaded file to C:\Users\maric\stanza_resources\resources.json
2024-12-10 17:11:36 INFO: Loading these

In [40]:
# Load the training dataset. As the preview below shows, columns '0' and '1'
# hold each sentence's tokens as a stringified Python list and 'gs' holds a
# numeric score (presumably the gold-standard similarity - confirm).
train_token_df = pd.read_csv('../Preprocessing/STS_train.csv')

# Preview the first rows of the raw frame
train_token_df.head()

Unnamed: 0,0,1,gs
0,"['But', 'other', 'sources', 'close', 'to', 'th...","['But', 'other', 'sources', 'close', 'to', 'th...",4.0
1,"['Micron', 'has', 'declared', 'its', 'first', ...","['Micron', ""'s"", 'numbers', 'also', 'marked', ...",3.75
2,"['The', 'fines', 'are', 'part', 'of', 'failed'...","['Perry', 'said', 'he', 'backs', 'the', 'Senat...",2.8
3,"['The', 'American', 'Anglican', 'Council', ','...","['The', 'American', 'Anglican', 'Council', ','...",3.4
4,"['The', 'tech-loaded', 'Nasdaq', 'composite', ...","['The', 'technology-laced', 'Nasdaq', 'Composi...",2.4


In [None]:
# Turn the first two columns from stringified lists into actual lists of strings.
# The frame is sized from the loaded data itself: `n` is only assigned in a later
# cell, so relying on it here breaks a fresh-kernel "Restart & Run All".

train_df = pd.DataFrame(columns=['0','1','gs'], index=range(len(train_token_df)))
train_df.iloc[:, :2] = train_token_df.iloc[:, :2].map(ast.literal_eval)
train_df.loc[:, 'gs'] = train_token_df.loc[:, 'gs']

train_df.head()

Unnamed: 0,0,1,gs
0,"[But, other, sources, close, to, the, sale, sa...","[But, other, sources, close, to, the, sale, sa...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, 's, numbers, also, marked, the, first...",3.75
2,"[The, fines, are, part, of, failed, Republican...","[Perry, said, he, backs, the, Senate, 's, effo...",2.8
3,"[The, American, Anglican, Council, ,, which, r...","[The, American, Anglican, Council, ,, which, r...",3.4
4,"[The, tech-loaded, Nasdaq, composite, rose, 20...","[The, technology-laced, Nasdaq, Composite, Ind...",2.4


In [42]:
# Build the project preprocessor and run its three cleaning steps in order:
# punctuation removal, lower-casing, then dropping empty strings.
preprocessor = TextPreprocessor()
for clean_step in (preprocessor.remove_punctuation,
                   preprocessor.convert_to_lowercase,
                   preprocessor.remove_empty_strings):
    train_df = clean_step(train_df)

# POS-tag every token list; each cell becomes a list of (token, tag) pairs.
# `n` (number of sentence pairs) is reused by the cells below.
n = len(train_df)
train_df_POS = pd.DataFrame(columns=['0', '1'])

for row in range(n):
    for column in ('0', '1'):
        train_df_POS.loc[row, column] = nltk.pos_tag(train_df.loc[row, column])

train_df_POS.head()


Unnamed: 0,0,1
0,"[(but, CC), (other, JJ), (sources, NNS), (clos...","[(but, CC), (other, JJ), (sources, NNS), (clos..."
1,"[(micron, NN), (has, VBZ), (declared, VBN), (i...","[(micron, NN), (s, NN), (numbers, NNS), (also,..."
2,"[(the, DT), (fines, NNS), (are, VBP), (part, N...","[(perry, NN), (said, VBD), (he, PRP), (backs, ..."
3,"[(the, DT), (american, JJ), (anglican, NN), (c...","[(the, DT), (american, JJ), (anglican, NN), (c..."
4,"[(the, DT), (tech-loaded, JJ), (nasdaq, NN), (...","[(the, DT), (technology-laced, JJ), (nasdaq, N..."


* N-GRAMS OVERLAP REMOVING SOME FUNCTION WORDS

In [None]:
# Function words (prepositions, conjunctions, determiners) carry less meaning than
# content words, so dropping them may reduce noise in the overlap features.
# POS tags are what let us recognise them: each token was tagged in the previous cell.

function_words_tag = {'IN', 'CC', 'DT', 'PDT', 'WDT'}

# Work on a deep copy so the tagged frame above stays intact
train_df_POS_bis = copy.deepcopy(train_df_POS)

# Keep only the (token, tag) pairs whose tag is not a function-word tag
for i in range(n):
    for column in ('0', '1'):
        tagged_tokens = train_df_POS_bis.at[i, column]
        train_df_POS_bis.at[i, column] = [
            pair for pair in tagged_tokens if pair[1] not in function_words_tag
        ]

train_df_POS_bis.head()



Unnamed: 0,0,1
0,"[(other, JJ), (sources, NNS), (close, RB), (to...","[(other, JJ), (sources, NNS), (close, RB), (to..."
1,"[(Micron, NNP), (has, VBZ), (declared, VBN), (...","[(Micron, NNP), (s, NN), (numbers, NNS), (also..."
2,"[(fines, NNS), (are, VBP), (part, NN), (failed...","[(Perry, NNP), (said, VBD), (he, PRP), (backs,..."
3,"[(American, JJ), (Anglican, NNP), (Council, NN...","[(American, JJ), (Anglican, NNP), (Council, NN..."
4,"[(tech-loaded, JJ), (Nasdaq, NNP), (composite,...","[(technology-laced, JJ), (Nasdaq, NNP), (Compo..."


In [9]:
# Rebuild plain token lists (tags dropped) from the filtered (token, tag) pairs
train_df_1 = pd.DataFrame(columns=['0','1','gs'])

for row in range(n):
    for column in ('0', '1'):
        train_df_1.loc[row, column] = [pair[0] for pair in train_df_POS_bis.loc[row, column]]

# Carry the score column over unchanged
train_df_1['gs'] = train_df['gs']
train_df_1.head(10)

Unnamed: 0,0,1,gs
0,"[other, sources, close, to, sale, said, Vivend...","[other, sources, close, to, sale, said, Vivend...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, s, numbers, also, marked, first, quar...",3.75
2,"[fines, are, part, failed, Republican, efforts...","[Perry, said, he, backs, Senate, s, efforts, i...",2.8
3,"[American, Anglican, Council, represents, Epis...","[American, Anglican, Council, represents, Epis...",3.4
4,"[tech-loaded, Nasdaq, composite, rose, 20.96, ...","[technology-laced, Nasdaq, Composite, Index, I...",2.4
5,"[Amgen, shares, gained, 93, cents, 1.45, perce...","[Shares, Allergan, were, up, 14, cents, 78.40,...",1.333
6,"[U.S, prosecutors, have, arrested, more, 130, ...","[More, 130, people, have, been, arrested, 17, ...",4.6
7,"[Chavez, said, investigators, feel, confident,...","[Albuquerque, Mayor, Martin, Chavez, said, inv...",3.8
8,"[Authorities, said, scientist, properly, quara...","[scientist, also, quarantined, himself, home, ...",4.2
9,"[support, will, come, free, software, upgrade,...","[upgrade, will, be, available, free, download,...",2.6


In [10]:
def n_gram_overlap(tokens1: List[str], tokens2: List[str], n: int) -> float:
    """
    Computes the n-gram overlap between two tokenized sentences.

    The score is the Dice-style ratio ``2 * |A & B| / (|A| + |B|)`` where ``A``
    and ``B`` are the sets of distinct n-grams of each sentence.

    Parameters:
        tokens1 (List[str]): Tokenized first sentence as a list of strings.
        tokens2 (List[str]): Tokenized second sentence as a list of strings.
        n (int): The size of n-grams.

    Returns:
        float: The n-gram overlap ratio in [0, 1]; 0.0 when the sentences share
        no n-gram (including when either one is shorter than ``n``).
    """
    def generate_ngrams(tokens: List[str], n: int) -> Set[tuple]:
        """
        Generates the set of distinct n-grams for a given list of tokens.

        N-grams are kept as tuples rather than space-joined strings, so tokens
        that themselves contain a space cannot make two different n-grams
        look identical.
        """
        return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

    # Generate n-grams for both token lists
    ngrams_s1 = generate_ngrams(tokens1, n)
    ngrams_s2 = generate_ngrams(tokens2, n)

    # Number of n-grams the two sentences have in common
    shared = len(ngrams_s1 & ngrams_s2)

    # No common n-gram: also covers the case where both sets are empty
    if shared == 0:
        return 0.0

    return 2 * shared / (len(ngrams_s1) + len(ngrams_s2))


In [11]:
# n-gram size -> feature column holding the overlap for that size
ngram_columns = {
    1: 'POS_tagging_unigrams',
    2: 'POS_tagging_bigrams',
    3: 'POS_tagging_trigrams',
}
syntactic_features = pd.DataFrame(columns=list(ngram_columns.values()))

# Overlap of the function-word-free token lists, one row per sentence pair
for row in range(n):
    for size, column in ngram_columns.items():
        syntactic_features.loc[row, column] = n_gram_overlap(
            train_df_1.loc[row, '0'], train_df_1.loc[row, '1'], size
        )

# The cell-by-cell fill leaves object columns; convert them to numeric
# (anything unparsable becomes NaN).
for column in ngram_columns.values():
    syntactic_features[column] = pd.to_numeric(syntactic_features[column], errors='coerce')

syntactic_features.head()
    

Unnamed: 0,POS_tagging_unigrams,POS_tagging_bigrams,POS_tagging_trigrams
0,0.702703,0.594595,0.514286
1,0.571429,0.421053,0.352941
2,0.5,0.25,0.090909
3,0.777778,0.764706,0.75
4,0.230769,0.0,0.0


* SYNTACTIC ROLES SIMILARITY

In [None]:
# CLASSIFY WORDS INTO SETS OF 'p' (predicates), 's' (subjects) OR 'o' (objects) AND CONVERT THE WORDS INTO LEMMAS

def classify_syntactic_roles(doc):
    """
    Group the words of a parsed document into predicate, subject and object chunks.

    For every sentence in ``doc.sentences`` the dependency label (``deprel``) of
    each word decides its role:
      * 'root'                        -> predicate
      * 'nsubj' / 'nsubjpass'         -> subject (subjects sharing a head are grouped)
      * 'obj' / 'dobj' / 'obl' / 'case' -> object (each starts its own set)
      * 'conj'                        -> joins the set that contains its head
      * 'ccomp'                       -> predicate, when its head is a predicate
    Object sets are then merged (see the rules inline) and every word is
    replaced by its lemma.

    Args:
        doc: Parsed document exposing ``sentences``; each sentence exposes
            ``words`` whose items have ``text``, ``lemma``, ``deprel`` and a
            1-based ``head`` index (0 for the root).

    Returns:
        dict: ``{'p': [...], 's': [...], 'o': [...]}`` where each value is a list
        of sets of lemmas. Chunks of all sentences are accumulated; an empty
        document yields three empty lists.
    """
    roles = {'p': [], 's': [], 'o': []}  # Initialize lists for multiple sets of p, s, o
    for sentence in doc.sentences:
        # Initialize temporary sets for predicates, subjects, and objects
        temp_p = set()  # predicates
        subject_sets = []  # List of sets for subjects
        object_sets = []  # List of sets for objects

        # Process each word in the sentence
        for word in sentence.words:
            if word.deprel == 'root':  # The predicate (main verb)
                temp_p.add(word.text)
            # NOTE(review): UD v2 parsers usually label passive subjects 'nsubj:pass'
            # rather than 'nsubjpass' - confirm which labels the pipeline emits.
            elif word.deprel in ['nsubj', 'nsubjpass']:  # Subject
                # Check if it belongs to an existing set
                head_text = sentence.words[word.head - 1].text if word.head > 0 else None
                added = False
                for s_set in subject_sets:
                    if any(head_text == sentence.words[s.head - 1].text for s in sentence.words if s.text in s_set):
                        s_set.add(word.text)
                        added = True
                        break
                if not added:
                    subject_sets.append({word.text})  # Create a new set
            elif word.deprel in ['obj', 'dobj', 'obl', 'case']:  # Object
                object_sets.append({word.text})  # Create a set for this object
            elif word.deprel == 'conj':  # Conjunction linking words
                head = sentence.words[word.head - 1].text  # The word it is conjoined with
                # If head is a subject, add the conjunct word to the subject set
                for s_set in subject_sets:
                    if head in s_set:
                        s_set.add(word.text)
                        break
                # If head is an object, add the conjunct word to the object set
                for obj_set in object_sets:
                    if head in obj_set:
                        obj_set.add(word.text)
                        break
                # If head is a predicate, add the conjunct word to the predicate set
                if head in temp_p: # coordinated structure
                    temp_p.add(word.text)
            elif word.deprel == 'ccomp': # subordinate clauses
                head = sentence.words[word.head - 1].text  # The governing word of the clause
                if head in temp_p:
                    temp_p.add(word.text)

        # EXTRA: Merge object sets based on the specified rules.
        # Each set is popped in turn and either folded into a later set or kept.
        merged_object_sets = []
        while object_sets:
            current_set = object_sets.pop(0)
            merged = False
            for other_set in object_sets:
                # Rule 1: Check if any 'case' word's head is in another set
                for word in current_set:
                    for other_word in sentence.words:
                        if other_word.text == word and other_word.deprel == 'case':
                            head_word = sentence.words[other_word.head - 1] if other_word.head > 0 else None
                            if head_word and any(head_word.text in s for s in object_sets):
                                other_set.update(current_set)
                                merged = True
                                break
                    if merged:
                        break
                if merged:
                    break
                # Rule 2: Check if two words in different sets share the same head
                for word1 in current_set:
                    for word2 in other_set:
                        word1_head = next((w.head for w in sentence.words if w.text == word1), None)
                        word2_head = next((w.head for w in sentence.words if w.text == word2), None)
                        if word1_head and word2_head and word1_head == word2_head:
                            other_set.update(current_set)
                            merged = True
                            break
                if merged:
                    break
            if not merged:
                merged_object_sets.append(current_set)

        # Lemma of the first word carrying each surface form in THIS sentence.
        # Lemmatising here, per sentence, keeps chunks of earlier sentences from
        # being looked up against the wrong sentence.
        lemma_by_text = {}
        for token in sentence.words:
            lemma_by_text.setdefault(token.text, token.lemma)

        # Each predicate forms its own singleton chunk
        for predicate in temp_p:
            roles['p'].append({lemma_by_text[predicate]})

        # Subjects and objects keep their grouping
        for s_set in subject_sets:
            roles['s'].append({lemma_by_text[text] for text in s_set})

        for obj_set in merged_object_sets:
            roles['o'].append({lemma_by_text[text] for text in obj_set})

    # Return only after every sentence has been processed (and also for an
    # empty document, which now yields empty role lists instead of None).
    return roles


In [43]:
# function to obtain a synset from a lemma
def extract_synset(w):
    """
    Return the first WordNet synset of lemma ``w``, or None when there is none.

    The lemma is POS-tagged on its own with ``nltk.pos_tag``; the resulting tag
    is mapped to a WordNet part of speech (noun, adjective, verb, adverb) and
    used to restrict the synset lookup. None is returned when the tag has no
    WordNet counterpart or when WordNet has no synset for the lemma with that
    part of speech (e.g. proper nouns).
    """
    # Penn Treebank tag -> WordNet part of speech
    wordnet_pos_by_tag = {
        'NN': 'n', 'NNS': 'n',
        'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
        'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
        'RB': 'r', 'RBR': 'r', 'RBS': 'r',
    }

    tag = nltk.pos_tag([w])[0][1]
    if tag not in wordnet_pos_by_tag:
        # The lemma has no WordNet-compatible tag
        return None

    candidates = wn.synsets(w, wordnet_pos_by_tag[tag])
    # First synset of the lookup, if any
    return candidates[0] if candidates else None

# function to compute the similarity between two chunks
def chunksim(c1,c2):
    """
    Similarity between two chunks (collections of lemmas) based on Lin similarity.

    Every lemma of ``c1`` is compared with every lemma of ``c2``; pairs whose
    synsets exist and share the same part of speech contribute their Lin
    similarity (computed with ``brown_ic``) to a running total. The total is
    normalised by the size of each chunk and the harmonic mean of the two
    normalised values is returned.

    Args:
        c1, c2: Iterables of lemmas with a length (sets in this notebook).

    Returns:
        0 when no pair contributed any similarity, otherwise the harmonic mean
        described above.
    """
    # Resolve each lemma to its synset once, instead of once per pair:
    # the lookup (POS tagging + WordNet query) does not depend on the other chunk.
    synsets_1 = [extract_synset(lemma) for lemma in c1]
    synsets_2 = [extract_synset(lemma) for lemma in c2]

    # Accumulate Lin similarity over all comparable pairs
    sim_score=0
    for synset_l1 in synsets_1:
        if synset_l1 is None:
            continue
        for synset_l2 in synsets_2:
            # Lin similarity is only computed between synsets of the same POS
            if synset_l2 is None or synset_l1.pos()!=synset_l2.pos():
                continue
            try:
                sim_score += synset_l1.lin_similarity(synset_l2, brown_ic)
            except WordNetError:
                # Pair that WordNet cannot score with this IC file: skip it
                continue

    if sim_score==0:
        return 0

    ckn1=sim_score/len(c1)
    ckn2=sim_score/len(c2)

    return 2*ckn1*ckn2/(ckn1+ckn2) #harmonic mean
    


In [None]:
# function to obtain the chunk similarity for the pair of chunks of the same type with maximum similarity
def compute_similarity_chunks(predicates_0, predicates_1):
    """
    Average the best chunk-to-chunk similarities between two lists of chunks.

    A similarity matrix is filled with ``chunksim`` (rows: chunks of the first
    sentence, columns: chunks of the second). Then, min(rows, columns) times,
    the current global maximum is recorded and
      * its column is zeroed when there are more rows than columns,
      * its row is zeroed otherwise.
    The mean of the recorded maxima is returned.

    Returns 0 when either list is empty.
    """
    n_rows = len(predicates_0)
    n_cols = len(predicates_1)

    if n_rows == 0 or n_cols == 0:
        return 0

    # Pairwise chunk similarities
    scores = np.zeros((n_rows, n_cols))
    for r, chunk_0 in enumerate(predicates_0):
        for c, chunk_1 in enumerate(predicates_1):
            scores[r, c] = chunksim(chunk_0, chunk_1)

    # One pick per chunk of the shorter side
    more_rows_than_cols = n_rows > n_cols
    best = np.zeros(n_cols if more_rows_than_cols else n_rows)

    for pick in range(best.size):
        # Location and value of the current global maximum
        top_row, top_col = np.unravel_index(np.argmax(scores), scores.shape)
        best[pick] = scores[top_row, top_col]

        # Blank out the column (more rows) or the row (otherwise) just used
        if more_rows_than_cols:
            scores[:, top_col] = 0
        else:
            scores[top_row, :] = 0

    return np.mean(best)

In [15]:
# compute the chunk similarity for the dataset

# syntactic role key -> feature column
role_columns = {'p': 'chunk_sim_p', 's': 'chunk_sim_s', 'o': 'chunk_sim_o'}

chunk_sim_df = pd.DataFrame(columns=[*role_columns.values(), 'total_sim_chunks'], index=range(n))

for k in range(n):
    # Parse each (already tokenized) sentence and split its words into p / s / o chunks
    roles_0 = classify_syntactic_roles(nlp_stanza([train_df.loc[k,'0']]))
    roles_1 = classify_syntactic_roles(nlp_stanza([train_df.loc[k,'1']]))

    # One similarity per role type ...
    for role, column in role_columns.items():
        chunk_sim_df.loc[k, column] = compute_similarity_chunks(roles_0[role], roles_1[role])

    # ... and their mean as the overall chunk similarity of the pair
    chunk_sim_df.loc[k,'total_sim_chunks'] = chunk_sim_df[list(role_columns.values())].iloc[k].mean(axis=0)

# Append the four chunk features to the feature table
for column in chunk_sim_df.columns:
    syntactic_features[column] = chunk_sim_df[column]

syntactic_features.head()

Unnamed: 0,POS_tagging_unigrams,POS_tagging_bigrams,POS_tagging_trigrams,chunk_sim_p,chunk_sim_s,chunk_sim_o,total_sim_chunks
0,0.702703,0.594595,0.514286,1.0,0.5,0.174853,0.558284
1,0.571429,0.421053,0.352941,0.0,0.0,0.722495,0.240832
2,0.5,0.25,0.090909,0.0,0.0,0.0,0.0
3,0.777778,0.764706,0.75,1.0,0.333333,0.360534,0.564623
4,0.230769,0.0,0.0,0.0,0.0,0.304668,0.101556


* SYNTACTIC DEPENDENCIES OVERLAP

In [44]:
def get_dependency_relations(tokens):
    """
    List the lemma-level dependency relations found by the shared Stanza pipeline.

    Args:
        tokens: Input handed as-is to the module-level ``nlp_stanza`` pipeline
            (the notebook passes a list containing one list of tokens).

    Returns:
        list: One dictionary per non-root word, in document order, with:
            - 'type': dependency relation type
            - 'governor_lemma': lemma of the governing (head) word
            - 'dependent_lemma': lemma of the dependent word
    """
    parsed = nlp_stanza(tokens)

    # `head` is a 1-based index into the sentence; 0 marks the root, which has
    # no governor and is therefore left out.
    return [
        {
            'type': dependent.deprel,
            'governor_lemma': sentence.words[dependent.head - 1].lemma,
            'dependent_lemma': dependent.lemma,
        }
        for sentence in parsed.sentences
        for dependent in sentence.words
        if dependent.head != 0
    ]

In [45]:
# compute the max IC between the governing word and the dependent from a list of dependency relations and sum all of them

def compute_max_ic_sum_with_wordnet_ic(dependency_relations):
    """
    Computes the sum of the maximum information content (IC) for each dependency relation
    from a list of dependency relations, using WordNet's IC resource.

    Args:
        dependency_relations (list of dicts): List of dependency relations, each represented by a dictionary
                                               with 'governor_lemma' and 'dependent_lemma'.

    Returns:
        float: The sum of maximum IC values for all dependency relations
        (0 for an empty list).
    """
    # Per-call caches: the total frequency of a POS table and the IC of a lemma
    # do not change between relations, so each is computed only once.
    pos_totals = {}
    ic_by_lemma = {}

    # function to compute IC using WordNet's IC
    def compute_ic(word):
        synset = extract_synset(word)

        # Check if synset is None
        if synset is None:
            return 0  # Return default value for None synsets

        # Extract the part of speech
        pos = synset.pos()  # 'n', 'v', etc.
        if pos not in brown_ic:
            return 0  # POS not supported in IC data

        # Normalize the frequency to compute probability
        try:
            raw_count = brown_ic[pos][synset.offset()]
            if pos not in pos_totals:
                pos_totals[pos] = sum(brown_ic[pos].values())  # Total frequency for this POS
            probability = raw_count / pos_totals[pos]
            if probability <= 0 or probability > 1:
                return 0
            return -log(probability)
        except KeyError:
            return 0  # Synset not found in IC data

    def cached_ic(word):
        # Lemmas repeat a lot across relations (every governor of several dependents)
        if word not in ic_by_lemma:
            ic_by_lemma[word] = compute_ic(word)
        return ic_by_lemma[word]

    # Process each dependency relation and compute the sum of max IC
    max_ic_sum = 0
    for relation in dependency_relations:
        # Keep the larger IC of the governing and dependent lemmas
        max_ic_sum += max(cached_ic(relation['governor_lemma']),
                          cached_ic(relation['dependent_lemma']))

    return max_ic_sum

In [None]:
# IC-weighted overlap of dependency relations, one value per sentence pair
sim_dep_overlap = pd.DataFrame(columns=['sim_dependencies'], index=range(n))


for k in range(n):
    relations_0 = get_dependency_relations([train_df.loc[k,'0']])
    relations_1 = get_dependency_relations([train_df.loc[k,'1']])
    # Compute the intersection: relations of sentence 0 that also occur in sentence 1
    relations_intersection = [item for item in relations_0 if item in relations_1]
    max_dependency_relations_0 = compute_max_ic_sum_with_wordnet_ic(relations_0)
    max_dependency_relations_1 = compute_max_ic_sum_with_wordnet_ic(relations_1)
    max_dependency_intersection = compute_max_ic_sum_with_wordnet_ic(relations_intersection)
    # Coverage of each sentence's IC mass by the shared relations; 0 when a
    # sentence has no IC mass at all (avoids dividing by zero).
    if max_dependency_relations_0 == 0:
        wdrc_1_0 = 0
    else:
        wdrc_1_0 = max_dependency_intersection/max_dependency_relations_0
    if max_dependency_relations_1 == 0:
        wdrc_0_1 = 0
    else:
        wdrc_0_1 = max_dependency_intersection/max_dependency_relations_1
    # Store under the column that is read back below
    sim_dep_overlap.loc[k,'sim_dependencies'] = (wdrc_1_0 + wdrc_0_1)/2

syntactic_features['sim_dependencies'] = sim_dep_overlap['sim_dependencies']
syntactic_features.head()