In [2]:
import pandas as pd
import nltk
import ast
import sys
from pathlib import Path
from nltk.metrics import jaccard_distance
from nltk.util import ngrams
from typing import List, Set
from scipy.stats import pearsonr
import numpy as np
from nltk.chunk import RegexpParser
import copy

from nltk.corpus import wordnet_ic
from nltk.corpus import wordnet as wn
import pandas as pd

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('wordnet_ic')

brown_ic = wordnet_ic.ic('ic-brown.dat')

# Download required resource
nltk.download('averaged_perceptron_tagger_eng')

# Add the project directory to the Python path
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maric\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\maric\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\maric\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet_ic is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\maric\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Load the training dataset
train_df = pd.read_csv('../Preprocessing/STS_train.csv')

# Display the DataFrame
train_df.head()

Unnamed: 0,0,1,gs
0,"['But', 'other', 'sources', 'close', 'to', 'th...","['But', 'other', 'sources', 'close', 'to', 'th...",4.0
1,"['Micron', 'has', 'declared', 'its', 'first', ...","['Micron', ""'s"", 'numbers', 'also', 'marked', ...",3.75
2,"['The', 'fines', 'are', 'part', 'of', 'failed'...","['Perry', 'said', 'he', 'backs', 'the', 'Senat...",2.8
3,"['The', 'American', 'Anglican', 'Council', ','...","['The', 'American', 'Anglican', 'Council', ','...",3.4
4,"['The', 'tech-loaded', 'Nasdaq', 'composite', ...","['The', 'technology-laced', 'Nasdaq', 'Composi...",2.4


In [4]:
# Turn the 2 first columns from strings to actual lists of strings
train_df.iloc[:, :2] = train_df.iloc[:, :2].map(ast.literal_eval)

train_df.head()

Unnamed: 0,0,1,gs
0,"[But, other, sources, close, to, the, sale, sa...","[But, other, sources, close, to, the, sale, sa...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, 's, numbers, also, marked, the, first...",3.75
2,"[The, fines, are, part, of, failed, Republican...","[Perry, said, he, backs, the, Senate, 's, effo...",2.8
3,"[The, American, Anglican, Council, ,, which, r...","[The, American, Anglican, Council, ,, which, r...",3.4
4,"[The, tech-loaded, Nasdaq, composite, rose, 20...","[The, technology-laced, Nasdaq, Composite, Ind...",2.4


In [5]:
# Create the TextPreprocessor
preprocessor = TextPreprocessor()

# Remove punctuation
train_df = preprocessor.remove_punctuation(train_df)
train_df = preprocessor.remove_empty_strings(train_df)

# POS-tagging the words

n=len(train_df)
train_df_POS = pd.DataFrame(columns=['0','1'])

for i in range(n):
    train_df_POS.loc[i,'0'] = nltk.pos_tag(train_df.loc[i,'0']) 
    train_df_POS.loc[i,'1'] = nltk.pos_tag(train_df.loc[i,'1']) 

train_df_POS.head()


Unnamed: 0,0,1
0,"[(But, CC), (other, JJ), (sources, NNS), (clos...","[(But, CC), (other, JJ), (sources, NNS), (clos..."
1,"[(Micron, NNP), (has, VBZ), (declared, VBN), (...","[(Micron, NNP), (s, NN), (numbers, NNS), (also..."
2,"[(The, DT), (fines, NNS), (are, VBP), (part, N...","[(Perry, NNP), (said, VBD), (he, PRP), (backs,..."
3,"[(The, DT), (American, JJ), (Anglican, NNP), (...","[(The, DT), (American, JJ), (Anglican, NNP), (..."
4,"[(The, DT), (tech-loaded, JJ), (Nasdaq, NNP), ...","[(The, DT), (technology-laced, JJ), (Nasdaq, N..."


In [6]:
# the function words (prepositions, conjunctions, articles) carry less semantics than content words 
# and thus removing them might eliminate the noise and provide a more accurate estimate of semantic similarity.

function_words_tag = {'IN', 'CC', 'DT', 'PDT', 'WDT'}

# Create a deep copy of the DataFrame
train_df_POS_bis = copy.deepcopy(train_df_POS)

# Iterate through the rows and modify columns '0' and '1'
for i in range(n):
    for tag in function_words_tag:
        # Extract, modify, and reassign the list in column '0'
        col_0 = train_df_POS_bis.at[i, '0']
        train_df_POS_bis.at[i, '0'] = [item for item in col_0 if item[1] != tag]

        # Extract, modify, and reassign the list in column '1'
        col_1 = train_df_POS_bis.at[i, '1']
        train_df_POS_bis.at[i, '1'] = [item for item in col_1 if item[1] != tag]

train_df_POS_bis.head()



Unnamed: 0,0,1
0,"[(other, JJ), (sources, NNS), (close, RB), (to...","[(other, JJ), (sources, NNS), (close, RB), (to..."
1,"[(Micron, NNP), (has, VBZ), (declared, VBN), (...","[(Micron, NNP), (s, NN), (numbers, NNS), (also..."
2,"[(fines, NNS), (are, VBP), (part, NN), (failed...","[(Perry, NNP), (said, VBD), (he, PRP), (backs,..."
3,"[(American, JJ), (Anglican, NNP), (Council, NN...","[(American, JJ), (Anglican, NNP), (Council, NN..."
4,"[(tech-loaded, JJ), (Nasdaq, NNP), (composite,...","[(technology-laced, JJ), (Nasdaq, NNP), (Compo..."


In [7]:
train_df_1 = pd.DataFrame(columns=['0','1','gs'])

for i in range(n):
    sentence=[]
    for j in range(len(train_df_POS_bis.loc[i,'0'])):
        sentence.append(train_df_POS_bis.loc[i,'0'][j][0])
    train_df_1.loc[i,'0']=sentence
    sentence=[]
    for k in range(len(train_df_POS_bis.loc[i,'1'])):
        sentence.append(train_df_POS_bis.loc[i,'1'][k][0])
    train_df_1.loc[i,'1']=sentence

train_df_1['gs'] = train_df['gs']
train_df_1.head(10)

Unnamed: 0,0,1,gs
0,"[other, sources, close, to, sale, said, Vivend...","[other, sources, close, to, sale, said, Vivend...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, s, numbers, also, marked, first, quar...",3.75
2,"[fines, are, part, failed, Republican, efforts...","[Perry, said, he, backs, Senate, s, efforts, i...",2.8
3,"[American, Anglican, Council, represents, Epis...","[American, Anglican, Council, represents, Epis...",3.4
4,"[tech-loaded, Nasdaq, composite, rose, 20.96, ...","[technology-laced, Nasdaq, Composite, Index, I...",2.4
5,"[Amgen, shares, gained, 93, cents, 1.45, perce...","[Shares, Allergan, were, up, 14, cents, 78.40,...",1.333
6,"[U.S, prosecutors, have, arrested, more, 130, ...","[More, 130, people, have, been, arrested, 17, ...",4.6
7,"[Chavez, said, investigators, feel, confident,...","[Albuquerque, Mayor, Martin, Chavez, said, inv...",3.8
8,"[Authorities, said, scientist, properly, quara...","[scientist, also, quarantined, himself, home, ...",4.2
9,"[support, will, come, free, software, upgrade,...","[upgrade, will, be, available, free, download,...",2.6


In [8]:
def n_gram_overlap(tokens1: List[str], tokens2: List[str], n: int) -> float:
    """
    Computes the n-gram overlap between two tokenized sentences.

    Parameters:
        tokens1 (List[str]): Tokenized first sentence as a list of strings.
        tokens2 (List[str]): Tokenized second sentence as a list of strings.
        n (int): The size of n-grams.

    Returns:
        float: The n-gram overlap ratio.
    """
    def generate_ngrams(tokens: List[str], n: int) -> Set[str]:
        """
        Generates n-grams for a given list of tokens.

        """
        return set([' '.join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)])

    # Generate n-grams for both token lists
    ngrams_s1 = generate_ngrams(tokens1, n)
    ngrams_s2 = generate_ngrams(tokens2, n)

    # Compute the intersection 
    intersection = ngrams_s1.intersection(ngrams_s2)

    # Compute the n gram overlap when posible
    if len(intersection)==0:
        ngo=0
    else:
        ngo=2/((len(ngrams_s1)+len(ngrams_s2))/len(intersection))

    return float(ngo)


In [9]:
syntactic_features=pd.DataFrame(columns=['POS_tagging_unigrams','POS_tagging_bigrams','POS_tagging_trigrams'])

for i in range(n):
    # unigrams
    syntactic_features.loc[i,'POS_tagging_unigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],1)
    # bigrams
    syntactic_features.loc[i,'POS_tagging_bigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],2)
    # trigrams
    syntactic_features.loc[i,'POS_tagging_trigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],3)


# Convert all columns in a DataFrame to numeric, coercing errors into NaN.
syntactic_features['POS_tagging_unigrams'] = pd.to_numeric(syntactic_features['POS_tagging_unigrams'], errors='coerce') 
syntactic_features['POS_tagging_bigrams'] = pd.to_numeric(syntactic_features['POS_tagging_bigrams'], errors='coerce') 
syntactic_features['POS_tagging_trigrams'] = pd.to_numeric(syntactic_features['POS_tagging_trigrams'], errors='coerce') 

syntactic_features.head()
    

Unnamed: 0,POS_tagging_unigrams,POS_tagging_bigrams,POS_tagging_trigrams
0,0.702703,0.594595,0.514286
1,0.571429,0.421053,0.352941
2,0.5,0.25,0.090909
3,0.777778,0.764706,0.75
4,0.230769,0.0,0.0


In [10]:
def get_sentence_chunks(pos_tagged):
    """
    Extract chunks from a tokenized sentence using NLTK.
    
    """
    grammar = r"""
    # Verb phrase components
    p: 
        {<VBD><VBG>}              # Past progressive (e.g., "was eating")
        {<VBZ><VBG>}                    # Progressive form (e.g., "is eating")
        {<VBZ><VBN>}                    # Passive form (e.g., "is eaten")
        {<VBZ><JJ>}                     # Copular construction (e.g., "is happy")
        {<VBN>}                   # Perfect construction (e.g., "has driven")
        {<VBD><VBN>}              # Past perfect (e.g., "had driven")
        {<MD>?<VB.*><RB>*}              # Modal + verb + optional adverb
        {<VB.*><RP>?}                   # Verb with optional particle

    # Subject (typically occurs before VP)
    # Noun phrase components
    s:
        {<DT>?<JJ.*>*<NN.*>+}           # Basic noun phrase
        {<PRP>}                         # Pronouns
        {<NNP>+}                        # Proper nouns
        }<p>{
        

    # Object (must follow VP)
    o:
        }<p>{  
        {<DT>?<JJ.*>*<NN.*>+}           # Basic noun phrase
        {<PRP>}                         # Pronouns
        {<NNP>+}                        # Proper nouns
        {<IN><s>}                      # Prepositional object
        {<TO><s>}                      # 'To' prepositional phrase
    
    """
    
    # Create a chunk parser with our grammar
    chunk_parser = RegexpParser(grammar)

    # Perform chunking
    chunked = chunk_parser.parse(pos_tagged)
    
    # Extract chunks into a more readable format
    chunks = []
    for subtree in chunked.subtrees(filter=lambda t: t.label() != 'S'):
        words = [word for word, tag in subtree.leaves()]
        chunks.append((subtree.label(), words))
        #chunks.append(words)
        
    return chunks

In [11]:
# Generate the sets of chunks of the first and the second sentence

train_df_chunks = pd.DataFrame(columns=['0','1'])

for i in range(n):
    train_df_chunks.loc[i,'0'] = get_sentence_chunks(train_df_POS.loc[i,'0'])
    train_df_chunks.loc[i,'1'] = get_sentence_chunks(train_df_POS.loc[i,'1'])

train_df_chunks.head()

Unnamed: 0,0,1
0,"[(s, [other, sources]), (o, [to, the, sale]), ...","[(s, [other, sources]), (o, [to, the, sale]), ..."
1,"[(s, [Micron]), (p, [has, declared]), (s, [fir...","[(s, [Micron, s, numbers]), (p, [marked]), (s,..."
2,"[(s, [The, fines]), (p, [are]), (s, [part]), (...","[(s, [Perry]), (p, [said]), (s, [he]), (p, [ba..."
3,"[(s, [The, American, Anglican, Council]), (p, ...","[(s, [The, American, Anglican, Council]), (p, ..."
4,"[(s, [The, tech-loaded, Nasdaq, composite]), (...","[(s, [The, technology-laced, Nasdaq, Composite..."


In [69]:
# Wordnet tags
d = {'NN': 'n', 'NNS': 'n',
       'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
       'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
       'RB': 'r', 'RBR': 'r', 'RBS': 'r'}


# function to obtain a synset from a word.
def extract_synset(w):
  pair=nltk.pos_tag([w])
  if pair[0][1] in d.keys(): # Check if has a wordnet tag
    word_synsets = wn.synsets(w,d[pair[0][1]])
    return word_synsets[0]
  
  else:
    print('The word ',w,' has no wordnet tag.')
    return False


In [70]:
def chunksim(c1,c2):
    # compute lin similarity first
    sim_score=0
    for l1 in c1:
        for l2 in c2:
            synset_l1=extract_synset(l1)
            synset_l2=extract_synset(l2)
            print(synset_l1,synset_l2)
            if synset_l1.pos()==synset_l2.pos():

                # Calculate Lin Similarity
                sim_score += synset_l1.lin_similarity(synset_l2, brown_ic)

    return sim_score

In [None]:
predicates_0=[]

for tupla in train_df_chunks.loc[0,'0']:
    if tupla[0]=='p':
        predicates_0.append(tupla[1])


predicates_1=[]

for tupla in train_df_chunks.loc[0,'1']:
    if tupla[0]=='p':
        predicates_1.append(tupla[1])

len0=len(predicates_0)
len1=len(predicates_1)

linsim_ = np.zeros((len0,len1))
for i in range(len0):
    for j in range(len1):
        linsim_[i,j] = chunksim(predicates_0[i],predicates_1[j])

if len0 > len1:
    max_val=np.zeros(len1)
    # si el numero de filas es mayor que el numero de columnas, tomo el maximo de cada columna
    for j in range():
        # Encuentra el valor máximo en la columna j
        max_val[j] = np.max(linsim_[:, j])


        # Encuentra la fila donde ocurre el valor máximo
        i = np.argmax(linsim_[:, j])
else:
    #tomo el maximo de cada fila
    max_val=np.zeros(len0)
    # si el numero de filas es mayor que el numero de columnas, tomo el maximo de cada columna
    for i in range():
        # Encuentra el valor máximo en la columna j
        max_val[i] = np.max(linsim_[i, :])


        # Encuentra la fila donde ocurre el valor máximo
        j = np.argmax(linsim_[i, :])

linsim_



Synset('state.v.01') Synset('state.v.01')
Synset('state.v.01') Synset('be.v.01')
Synset('state.v.01') Synset('keep.v.01')
Synset('be.v.01') Synset('state.v.01')
Synset('keep.v.01') Synset('state.v.01')
Synset('be.v.01') Synset('be.v.01')
Synset('be.v.01') Synset('keep.v.01')
Synset('keep.v.01') Synset('be.v.01')
Synset('keep.v.01') Synset('keep.v.01')
Synset('hope.v.01') Synset('state.v.01')
Synset('hope.v.01') Synset('be.v.01')
Synset('hope.v.01') Synset('keep.v.01')
Synset('see.v.01') Synset('state.v.01')
Synset('see.v.01') Synset('be.v.01')
Synset('see.v.01') Synset('keep.v.01')
Synset('team.n.01') Synset('state.v.01')
Synset('team.n.01') Synset('be.v.01')
Synset('team.n.01') Synset('keep.v.01')


array([[1., 0.],
       [0., 2.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])