In [130]:
import pandas as pd
import nltk
import ast
import sys
from pathlib import Path
from nltk.metrics import jaccard_distance
from nltk.util import ngrams
from typing import List, Set
from scipy.stats import pearsonr
import numpy as np

# dir_nltk_data = 'c:\\Users\\maric\\Documents\\MASTERS\\PrimerSemestre\\IHLT\\FinalProject\\IHLT\\Lib\\site-packages\\nltk\\tag'

# Download required resource
nltk.download('averaged_perceptron_tagger_eng')# , download_dir=dir_nltk_data)

# Add the project directory to the Python path
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\maric\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [114]:
# Load the training dataset
train_df = pd.read_csv('../Preprocessing/STS_train.csv')

# Display the DataFrame
train_df.head()

Unnamed: 0,0,1,gs
0,"['But', 'other', 'sources', 'close', 'to', 'th...","['But', 'other', 'sources', 'close', 'to', 'th...",4.0
1,"['Micron', 'has', 'declared', 'its', 'first', ...","['Micron', ""'s"", 'numbers', 'also', 'marked', ...",3.75
2,"['The', 'fines', 'are', 'part', 'of', 'failed'...","['Perry', 'said', 'he', 'backs', 'the', 'Senat...",2.8
3,"['The', 'American', 'Anglican', 'Council', ','...","['The', 'American', 'Anglican', 'Council', ','...",3.4
4,"['The', 'tech-loaded', 'Nasdaq', 'composite', ...","['The', 'technology-laced', 'Nasdaq', 'Composi...",2.4


In [115]:
# Turn the 2 first columns from strings to actual lists of strings
train_df.iloc[:, :2] = train_df.iloc[:, :2].map(ast.literal_eval)

train_df.head()

Unnamed: 0,0,1,gs
0,"[But, other, sources, close, to, the, sale, sa...","[But, other, sources, close, to, the, sale, sa...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, 's, numbers, also, marked, the, first...",3.75
2,"[The, fines, are, part, of, failed, Republican...","[Perry, said, he, backs, the, Senate, 's, effo...",2.8
3,"[The, American, Anglican, Council, ,, which, r...","[The, American, Anglican, Council, ,, which, r...",3.4
4,"[The, tech-loaded, Nasdaq, composite, rose, 20...","[The, technology-laced, Nasdaq, Composite, Ind...",2.4


In [116]:
# Create the TextPreprocessor
preprocessor = TextPreprocessor()

# Remove punctuation
train_df = preprocessor.remove_punctuation(train_df)
train_df = preprocessor.remove_empty_strings(train_df)

# POS-tagging the words

n=len(train_df)
train_df_POS = pd.DataFrame(columns=['0','1'])

function_words_tag = {'IN', 'CC', 'DT', 'PDT', 'WDT'}

for i in range(n):
    train_df_POS.loc[i,'0'] = nltk.pos_tag(train_df.loc[i,'0']) 
    train_df_POS.loc[i,'1'] = nltk.pos_tag(train_df.loc[i,'1']) 

train_df_POS.head()


Unnamed: 0,0,1
0,"[(But, CC), (other, JJ), (sources, NNS), (clos...","[(But, CC), (other, JJ), (sources, NNS), (clos..."
1,"[(Micron, NNP), (has, VBZ), (declared, VBN), (...","[(Micron, NNP), (s, NN), (numbers, NNS), (also..."
2,"[(The, DT), (fines, NNS), (are, VBP), (part, N...","[(Perry, NNP), (said, VBD), (he, PRP), (backs,..."
3,"[(The, DT), (American, JJ), (Anglican, NNP), (...","[(The, DT), (American, JJ), (Anglican, NNP), (..."
4,"[(The, DT), (tech-loaded, JJ), (Nasdaq, NNP), ...","[(The, DT), (technology-laced, JJ), (Nasdaq, N..."


In [117]:
# the function words (prepositions, conjunctions, articles) carry less semantics than content words and thus removing them might eliminate
# the noise and provide a more accurate estimate of semantic similarity.

function_words_tag = {'IN', 'CC', 'DT', 'PDT', 'WDT'}

for i in range(n):
    for tag in function_words_tag:
        j=0
        while j < (len(train_df_POS.loc[i,'0'])):
            if train_df_POS.loc[i,'0'][j][1]==tag:
                train_df_POS.loc[i,'0'].pop(j)
            j=j+1
        j=0
        while j < (len(train_df_POS.loc[i,'1'])):
            if train_df_POS.loc[i,'1'][j][1]==tag:
                train_df_POS.loc[i,'1'].pop(j)
            j=j+1
            
train_df_POS.head()

Unnamed: 0,0,1
0,"[(other, JJ), (sources, NNS), (close, RB), (to...","[(other, JJ), (sources, NNS), (close, RB), (to..."
1,"[(Micron, NNP), (has, VBZ), (declared, VBN), (...","[(Micron, NNP), (s, NN), (numbers, NNS), (also..."
2,"[(fines, NNS), (are, VBP), (part, NN), (failed...","[(Perry, NNP), (said, VBD), (he, PRP), (backs,..."
3,"[(American, JJ), (Anglican, NNP), (Council, NN...","[(American, JJ), (Anglican, NNP), (Council, NN..."
4,"[(tech-loaded, JJ), (Nasdaq, NNP), (composite,...","[(technology-laced, JJ), (Nasdaq, NNP), (Compo..."


In [118]:
train_df_1 = pd.DataFrame(columns=['0','1','gs'])

for i in range(n):
    sentence=[]
    for j in range(len(train_df_POS.loc[i,'0'])):
        sentence.append(train_df_POS.loc[i,'0'][j][0])
    train_df_1.loc[i,'0']=sentence
    sentence=[]
    for k in range(len(train_df_POS.loc[i,'1'])):
        sentence.append(train_df_POS.loc[i,'1'][k][0])
    train_df_1.loc[i,'1']=sentence

train_df_1['gs'] = train_df['gs']
train_df_1.head()

Unnamed: 0,0,1,gs
0,"[other, sources, close, to, sale, said, Vivend...","[other, sources, close, to, sale, said, Vivend...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, s, numbers, also, marked, first, quar...",3.75
2,"[fines, are, part, failed, Republican, efforts...","[Perry, said, he, backs, Senate, s, efforts, i...",2.8
3,"[American, Anglican, Council, represents, Epis...","[American, Anglican, Council, represents, Epis...",3.4
4,"[tech-loaded, Nasdaq, composite, rose, 20.96, ...","[technology-laced, Nasdaq, Composite, Index, I...",2.4


In [141]:
def n_gram_overlap(tokens1: List[str], tokens2: List[str], n: int) -> float:
    """
    Computes the n-gram overlap between two tokenized sentences.

    Parameters:
        tokens1 (List[str]): Tokenized first sentence as a list of strings.
        tokens2 (List[str]): Tokenized second sentence as a list of strings.
        n (int): The size of n-grams.

    Returns:
        float: The n-gram overlap ratio.
    """
    def generate_ngrams(tokens: List[str], n: int) -> Set[str]:
        """
        Generates n-grams for a given list of tokens.

        Parameters:
            tokens (List[str]): The input tokens.
            n (int): The size of n-grams.

        Returns:
            Set[str]: A set of n-grams.
        """
        return set([' '.join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)])

    # Generate n-grams for both token lists
    ngrams_s1 = generate_ngrams(tokens1, n)
    ngrams_s2 = generate_ngrams(tokens2, n)

    # Compute the intersection and union
    intersection = ngrams_s1.intersection(ngrams_s2)
    union = ngrams_s1.union(ngrams_s2)

    # Calculate the overlap ratio
    overlap_ratio = len(intersection) / len(union) if union else 0.0

    return float(overlap_ratio)


In [165]:
syntactic_features=pd.DataFrame(columns=['POS_tagging_unigrams','POS_tagging_bigrams','POS_tagging_trigrams'])

In [179]:
for i in range(n):
    # unigrams
    syntactic_features.loc[i,'POS_tagging_unigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],1)
    # bigrams
    syntactic_features.loc[i,'POS_tagging_bigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],2)
    # trigrams
    syntactic_features.loc[i,'POS_tagging_trigrams'] = n_gram_overlap(train_df_1.loc[i,'0'],train_df_1.loc[i,'1'],3)

# Convert all columns in a DataFrame to numeric, coercing errors into NaN.
syntactic_features.apply(pd.to_numeric, errors='coerce')

syntactic_features.head()
    

Unnamed: 0,POS_tagging_unigrams,POS_tagging_bigrams,POS_tagging_trigrams
0,0.541667,0.423077,0.346154
1,0.4,0.266667,0.214286
2,0.333333,0.142857,0.047619
3,0.608696,0.590909,0.571429
4,0.130435,0.0,0.0


In [181]:
pearsonr(syntactic_features['POS_tagging_unigrams'], train_df['gs'])[0]

np.float64(0.4858577099663492)