In [16]:
import pandas as pd
import nltk
import ast
import sys
from pathlib import Path
from nltk.metrics import jaccard_distance
from nltk.util import ngrams
from typing import List, Set
from scipy.stats import pearsonr
import numpy as np
from nltk.chunk import RegexpParser
import copy

from nltk.corpus import wordnet_ic
from nltk.corpus import wordnet as wn
import pandas as pd
import stanza


stanza.download('en')
nlp_stanza = stanza.Pipeline('en', processors='tokenize,mwt,pos,lemma,depparse', tokenize_pretokenized=True)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('wordnet_ic')

brown_ic = wordnet_ic.ic('ic-brown.dat')

# Download required resource
nltk.download('averaged_perceptron_tagger_eng')

# Add the project directory to the Python path
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 32.4MB/s]                    
2024-12-09 19:05:38 INFO: Downloaded file to C:\Users\maric\stanza_resources\resources.json
2024-12-09 19:05:38 INFO: Downloading default packages for language: en (English) ...
2024-12-09 19:05:39 INFO: File exists: C:\Users\maric\stanza_resources\en\default.zip
2024-12-09 19:05:43 INFO: Finished downloading models and saved to C:\Users\maric\stanza_resources
2024-12-09 19:05:43 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 33.5MB/s]                    
2024-12-09 19:05:43 INFO: Downloaded file to C:\Users\maric\stanza_resources\resources.json
2024-12-09 19:05:44 INFO: Loading these

In [17]:
# Load the training dataset
train_token_df = pd.read_csv('../Preprocessing/STS_train.csv')

# Display the DataFrame
train_token_df.head()

Unnamed: 0,0,1,gs
0,"['But', 'other', 'sources', 'close', 'to', 'th...","['But', 'other', 'sources', 'close', 'to', 'th...",4.0
1,"['Micron', 'has', 'declared', 'its', 'first', ...","['Micron', ""'s"", 'numbers', 'also', 'marked', ...",3.75
2,"['The', 'fines', 'are', 'part', 'of', 'failed'...","['Perry', 'said', 'he', 'backs', 'the', 'Senat...",2.8
3,"['The', 'American', 'Anglican', 'Council', ','...","['The', 'American', 'Anglican', 'Council', ','...",3.4
4,"['The', 'tech-loaded', 'Nasdaq', 'composite', ...","['The', 'technology-laced', 'Nasdaq', 'Composi...",2.4


In [18]:
# Turn the 2 first columns from strings to actual lists of strings

train_df = pd.DataFrame(columns=['0','1','gs'], index=range(2234))
train_df.iloc[:, :2] = train_token_df.iloc[:, :2].map(ast.literal_eval)

train_df.head()

Unnamed: 0,0,1,gs
0,"[But, other, sources, close, to, the, sale, sa...","[But, other, sources, close, to, the, sale, sa...",
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, 's, numbers, also, marked, the, first...",
2,"[The, fines, are, part, of, failed, Republican...","[Perry, said, he, backs, the, Senate, 's, effo...",
3,"[The, American, Anglican, Council, ,, which, r...","[The, American, Anglican, Council, ,, which, r...",
4,"[The, tech-loaded, Nasdaq, composite, rose, 20...","[The, technology-laced, Nasdaq, Composite, Ind...",


In [19]:
# Create the TextPreprocessor
preprocessor = TextPreprocessor()

# Remove punctuation
train_df = preprocessor.remove_punctuation(train_df)
train_df = preprocessor.remove_empty_strings(train_df)

# POS-tagging the words
n=len(train_df)
train_df_POS = pd.DataFrame(columns=['0','1'])

for i in range(n):
    train_df_POS.loc[i,'0'] = nltk.pos_tag(train_df.loc[i,'0']) 
    train_df_POS.loc[i,'1'] = nltk.pos_tag(train_df.loc[i,'1']) 

train_df_POS.head()


Unnamed: 0,0,1
0,"[(But, CC), (other, JJ), (sources, NNS), (clos...","[(But, CC), (other, JJ), (sources, NNS), (clos..."
1,"[(Micron, NNP), (has, VBZ), (declared, VBN), (...","[(Micron, NNP), (s, NN), (numbers, NNS), (also..."
2,"[(The, DT), (fines, NNS), (are, VBP), (part, N...","[(Perry, NNP), (said, VBD), (he, PRP), (backs,..."
3,"[(The, DT), (American, JJ), (Anglican, NNP), (...","[(The, DT), (American, JJ), (Anglican, NNP), (..."
4,"[(The, DT), (tech-loaded, JJ), (Nasdaq, NNP), ...","[(The, DT), (technology-laced, JJ), (Nasdaq, N..."


In [20]:
# the function words (prepositions, conjunctions, articles) carry less semantics than content words 
# and thus removing them might eliminate the noise and provide a more accurate estimate of semantic similarity.

function_words_tag = {'IN', 'CC', 'DT', 'PDT', 'WDT'}

# Create a deep copy of the DataFrame
train_df_POS_bis = copy.deepcopy(train_df_POS)

# Iterate through the rows and modify columns '0' and '1'
for i in range(n):
    for tag in function_words_tag:
        # Extract, modify, and reassign the list in column '0'
        col_0 = train_df_POS_bis.at[i, '0']
        train_df_POS_bis.at[i, '0'] = [item for item in col_0 if item[1] != tag]

        # Extract, modify, and reassign the list in column '1'
        col_1 = train_df_POS_bis.at[i, '1']
        train_df_POS_bis.at[i, '1'] = [item for item in col_1 if item[1] != tag]

train_df_POS_bis.head()



Unnamed: 0,0,1
0,"[(other, JJ), (sources, NNS), (close, RB), (to...","[(other, JJ), (sources, NNS), (close, RB), (to..."
1,"[(Micron, NNP), (has, VBZ), (declared, VBN), (...","[(Micron, NNP), (s, NN), (numbers, NNS), (also..."
2,"[(fines, NNS), (are, VBP), (part, NN), (failed...","[(Perry, NNP), (said, VBD), (he, PRP), (backs,..."
3,"[(American, JJ), (Anglican, NNP), (Council, NN...","[(American, JJ), (Anglican, NNP), (Council, NN..."
4,"[(tech-loaded, JJ), (Nasdaq, NNP), (composite,...","[(technology-laced, JJ), (Nasdaq, NNP), (Compo..."


In [21]:
train_df_1 = pd.DataFrame(columns=['0','1','gs'])

for i in range(n):
    sentence=[]
    for j in range(len(train_df_POS_bis.loc[i,'0'])):
        sentence.append(train_df_POS_bis.loc[i,'0'][j][0])
    train_df_1.loc[i,'0']=sentence
    sentence=[]
    for k in range(len(train_df_POS_bis.loc[i,'1'])):
        sentence.append(train_df_POS_bis.loc[i,'1'][k][0])
    train_df_1.loc[i,'1']=sentence

train_df_1['gs'] = train_df['gs']
train_df_1.head(10)

Unnamed: 0,0,1,gs
0,"[other, sources, close, to, sale, said, Vivend...","[other, sources, close, to, sale, said, Vivend...",
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, s, numbers, also, marked, first, quar...",
2,"[fines, are, part, failed, Republican, efforts...","[Perry, said, he, backs, Senate, s, efforts, i...",
3,"[American, Anglican, Council, represents, Epis...","[American, Anglican, Council, represents, Epis...",
4,"[tech-loaded, Nasdaq, composite, rose, 20.96, ...","[technology-laced, Nasdaq, Composite, Index, I...",
5,"[Amgen, shares, gained, 93, cents, 1.45, perce...","[Shares, Allergan, were, up, 14, cents, 78.40,...",
6,"[U.S, prosecutors, have, arrested, more, 130, ...","[More, 130, people, have, been, arrested, 17, ...",
7,"[Chavez, said, investigators, feel, confident,...","[Albuquerque, Mayor, Martin, Chavez, said, inv...",
8,"[Authorities, said, scientist, properly, quara...","[scientist, also, quarantined, himself, home, ...",
9,"[support, will, come, free, software, upgrade,...","[upgrade, will, be, available, free, download,...",


In [22]:
def n_gram_overlap(tokens1: List[str], tokens2: List[str], n: int) -> float:
    """
    Computes the n-gram overlap between two tokenized sentences.

    Parameters:
        tokens1 (List[str]): Tokenized first sentence as a list of strings.
        tokens2 (List[str]): Tokenized second sentence as a list of strings.
        n (int): The size of n-grams.

    Returns:
        float: The n-gram overlap ratio.
    """
    def generate_ngrams(tokens: List[str], n: int) -> Set[str]:
        """
        Generates n-grams for a given list of tokens.

        """
        return set([' '.join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)])

    # Generate n-grams for both token lists
    ngrams_s1 = generate_ngrams(tokens1, n)
    ngrams_s2 = generate_ngrams(tokens2, n)

    # Compute the intersection 
    intersection = ngrams_s1.intersection(ngrams_s2)

    # Compute the n gram overlap when posible
    if len(intersection)==0:
        ngo=0
    else:
        ngo=2/((len(ngrams_s1)+len(ngrams_s2))/len(intersection))

    return float(ngo)


In [23]:
syntactic_features=pd.DataFrame(columns=['POS_tagging_unigrams','POS_tagging_bigrams','POS_tagging_trigrams'])

for i in range(n):
    # unigrams
    syntactic_features.loc[i,'POS_tagging_unigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],1)
    # bigrams
    syntactic_features.loc[i,'POS_tagging_bigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],2)
    # trigrams
    syntactic_features.loc[i,'POS_tagging_trigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],3)


# Convert all columns in a DataFrame to numeric, coercing errors into NaN.
syntactic_features['POS_tagging_unigrams'] = pd.to_numeric(syntactic_features['POS_tagging_unigrams'], errors='coerce') 
syntactic_features['POS_tagging_bigrams'] = pd.to_numeric(syntactic_features['POS_tagging_bigrams'], errors='coerce') 
syntactic_features['POS_tagging_trigrams'] = pd.to_numeric(syntactic_features['POS_tagging_trigrams'], errors='coerce') 

syntactic_features.head()
    

Unnamed: 0,POS_tagging_unigrams,POS_tagging_bigrams,POS_tagging_trigrams
0,0.702703,0.594595,0.514286
1,0.571429,0.421053,0.352941
2,0.5,0.25,0.090909
3,0.777778,0.764706,0.75
4,0.230769,0.0,0.0


In [24]:
# CLASSIFY WORDS IN SET OF 's','p' or 'o'

def classify_syntactic_roles(doc):
    roles = {'p': [], 's': [], 'o': []}  # Initialize lists for multiple sets of p, s, o   
    for sentence in doc.sentences:        
        # Initialize temporary sets for predicates, subjects, and objects
        temp_p = set()  # predicates
        subject_sets = []  # List of sets for subjects
        object_sets = []  # List of sets for objects
        
        prep_objs = {}  # To store prepositional objects linked to the same predicate
        
        # Process each word in the sentence
        for word in sentence.words:
            if word.deprel == 'root':  # The predicate (main verb)
                temp_p.add(word.text)
            elif word.deprel in ['nsubj', 'nsubjpass']:  # Subject
                # Check if it belongs to an existing set
                head_text = sentence.words[word.head - 1].text if word.head > 0 else None
                added = False
                for s_set in subject_sets:
                    if any(head_text == sentence.words[s.head - 1].text for s in sentence.words if s.text in s_set):
                        s_set.add(word.text)
                        added = True
                        break
                if not added:
                    subject_sets.append({word.text})  # Create a new set
            elif word.deprel in ['obj', 'dobj', 'obl', 'case']:  # Object
                object_sets.append({word.text})  # Create a set for this object
            elif word.deprel == 'conj':  # Conjunction linking words
                head = sentence.words[word.head - 1].text  # The word it is conjoined with
                # If head is a subject, add the conjunct word to the subject set
                for s_set in subject_sets:
                    if head in s_set:
                        s_set.add(word.text)
                        break
                # If head is an object, add the conjunct word to the object set
                for obj_set in object_sets:
                    if head in obj_set:
                        obj_set.add(word.text)
                        break
                # If head is a predicate, add the conjunct word to the predicate set
                if head in temp_p: # coordinated structure
                    temp_p.add(word.text)
            elif word.deprel == 'ccomp': # subordinate clausures
                head = sentence.words[word.head - 1].text  # The word it is conjoined with
                if head in temp_p:
                    temp_p.add(word.text)

        # EXTRA: Merge object sets based on the specified rules
        merged_object_sets = []
        while object_sets:
            current_set = object_sets.pop(0)
            merged = False
            for other_set in object_sets:
                # Rule 1: Check if any 'case' word's head is in another set
                for word in current_set:
                    for other_word in sentence.words:
                        if other_word.text == word and other_word.deprel == 'case':
                            head_word = sentence.words[other_word.head - 1] if other_word.head > 0 else None
                            if head_word and any(head_word.text in s for s in object_sets):
                                other_set.update(current_set)
                                merged = True
                                break
                    if merged:
                        break
                if merged:
                    break
                # Check if two words in different sets share the same head
                for word1 in current_set:
                    for word2 in other_set:
                        word1_head = next((w.head for w in sentence.words if w.text == word1), None)
                        word2_head = next((w.head for w in sentence.words if w.text == word2), None)
                        if word1_head and word2_head and word1_head == word2_head:
                            other_set.update(current_set)
                            merged = True
                            break
                if merged:
                    break
            if not merged:
                merged_object_sets.append(current_set)

        # Assign to the roles dictionary
        for predicate in temp_p:
            roles['p'].append({predicate})  # Create a set for each predicate
        
        for s_set in subject_sets:
            roles['s'].append(s_set)  # Group subjects as their respective sets
        
        for obj_set in merged_object_sets:
            roles['o'].append(obj_set)  # Group objects as their respective sets

            # Replace words in the sets with their lemmas
        for role in roles:
            for idx, word_set in enumerate(roles[role]):
                roles[role][idx] = {sentence.words[next(i for i, w in enumerate(sentence.words) if w.text == word)].lemma 
                                    for word in word_set}

        return roles


In [38]:
# Wordnet tags
d = {'NN': 'n', 'NNS': 'n',
       'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
       'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
       'RB': 'r', 'RBR': 'r', 'RBS': 'r'}


# function to obtain a synset from a word.
def extract_synset(w):
  pair=nltk.pos_tag([w])
  if pair[0][1] in d.keys(): # Check if has a wordnet tag
    word_synsets = wn.synsets(w,d[pair[0][1]])
    return word_synsets[0]
  
  else:
    print('The word ',w,' has no wordnet tag.')
    return False


def chunksim(c1,c2):
    # compute lin similarity first
    sim_score=0
    for l1 in c1:
        for l2 in c2:
            synset_l1=extract_synset(l1)
            synset_l2=extract_synset(l2)
            if synset_l1.pos()==synset_l2.pos():

                # Calculate Lin Similarity
                sim_score += synset_l1.lin_similarity(synset_l2, brown_ic)
    if sim_score==0:
        return 0
    else:
        ckn1=sim_score/len(c1)
        ckn2=sim_score/len(c2)

        return 2*ckn1*ckn2/(ckn1+ckn2) #harmonic mean 
    


In [40]:
# chunks

chunk_sim_df = pd.DataFrame(columns=['Chunk_Sim'], index=range(n))

for k in range(1):
    # Classify the words into p, s, o for each sentence 
    roles_0 = classify_syntactic_roles(nlp_stanza([train_df.loc[k,'0']]))
    roles_1 = classify_syntactic_roles(nlp_stanza([train_df.loc[k,'1']]))
    # compute the simchunk for each pair of sentence and save only the sim value 

In [41]:
roles_0['p'], roles_1['p']

([{'say'}, {'hope'}, {'keep'}], [{'say'}, {'keep'}])

In [42]:
predicates_0= roles_0['p']
predicates_1=roles_1['p']

In [None]:
len0=len(predicates_0)
len1=len(predicates_1)

linsim_ = np.zeros((len0,len1))

for i in range(len0):
    for j in range(len1):
        linsim_[i,j] = chunksim(predicates_0[i],predicates_1[j])

if len0 > len1:
    max_val=np.zeros(len1)
    # si el numero de filas es mayor que el numero de columnas, tomo el maximo de cada columna
    for j in range(len1):
        # Encuentra el valor máximo en la columna j
        max_val[j] = np.max(linsim_[:, :])
        print(np.argmax(linsim_[:, :])) # no devuele los dos indices solo uno mirar q esta haciendo esto
        # la columna donde he encontrado el primer máximo se vuelve cero
        #linsim_[:,np.argmax(linsim_[:, :])[1]]=0
    # lleno max_val con los valores maximos
    sim_p=np.mean(max_val)
    
else:
    #tomo el maximo de cada fila
    max_val=np.zeros(len0)
    # si el numero de filas es mayor que el numero de columnas, tomo el maximo de cada columna
    for i in range(len0):
        # Encuentra el valor máximo en la columna j
        max_val[i] = np.max(linsim_[:, :])

        # la fila donde he encontrado el primer máximo se vuelve cero
        linsim_[np.argmax(linsim_[:, :])[0],:]=0
        
    # lleno max_val con los valores maximos
    sim_p=np.mean(max_val)

max_val

0
0


array([1., 1.])