In [None]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.metrics import jaccard_distance
from nltk.util import ngrams
from nltk.wsd import lesk
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
from itertools import chain
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Add the project directory to the Python path
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

# Ensure necessary resources are downloaded
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet_ic', quiet=True)
nltk.download('stopwords', quiet=True)

# Load the Information Content (IC) corpus
ic = wordnet_ic.ic('ic-brown.dat')

In [2]:
# Create the TextPreprocessor
preprocessor = TextPreprocessor()

# Load the training dataset
train_df = preprocessor.load_dataset('../Preprocessing/STS_train.csv')

# Normalize the text
normal_train_df = preprocessor.remove_punctuation(train_df)
normal_train_df = preprocessor.convert_to_lowercase(normal_train_df)
normal_train_df = preprocessor.remove_empty_strings(normal_train_df)

# Create 2 separate DataFrames, one without stopwords and the other also lemmatized
sw_train_df = preprocessor.remove_stopwords(normal_train_df)
lemmas_train_df = preprocessor.lemmatize(sw_train_df)

In [3]:
# Group the 3 DataFrames into a Dictionary, with their "names"
train_dfs = {'normal': normal_train_df, 'sw': sw_train_df, 'lemmas': lemmas_train_df}

# Create the features DataFrame
train_features_df = pd.DataFrame()

In [4]:
# Greedy String Tiling (GST)

# Apply Greedy String Tiling to find maximal matching substrings (tiles) between two tokenized sentences.
def greedy_string_tiling(sentence1_tokens, sentence2_tokens, min_match_length=2):
    # Convert tokenized sentences to full sentences
    sentence1 = " ".join(sentence1_tokens)
    sentence2 = " ".join(sentence2_tokens)

    matched_indices1 = set()
    matched_indices2 = set()
    tile_lengths = []

    while True:
        max_tile = None
        max_length = 0

        # Find the longest match not covered by existing tiles
        for i in range(len(sentence1)):
            for j in range(len(sentence2)):
                length = 0
                while (
                    i + length < len(sentence1) and
                    j + length < len(sentence2) and
                    sentence1[i + length] == sentence2[j + length] and
                    (i + length not in matched_indices1) and
                    (j + length not in matched_indices2)
                ):
                    length += 1

                if length >= min_match_length and length > max_length:
                    max_tile = (i, j, length)
                    max_length = length

        # If no valid tile is found, stop
        if not max_tile:
            break

        # Mark the matched indices as covered
        start1, start2, length = max_tile
        for k in range(length):
            matched_indices1.add(start1 + k)
            matched_indices2.add(start2 + k)

        tile_lengths.append(max_tile[2])

    # Aggregate all tile lengths by summing and normalizing by sentence length
    return np.sum(tile_lengths) / max(len(sentence1), len(sentence2))

gst_min_lengths = [5, 10]

def compute_greedy_string_tiling(dfs, features_df):
    for min_match_length in gst_min_lengths:
        for name, df in dfs.items():
            with ThreadPoolExecutor() as executor:
                results = list(tqdm(
                    executor.map(
                        lambda sentence_pair: greedy_string_tiling(sentence_pair['0'], sentence_pair['1'], min_match_length),
                        (sentence_pair for _, sentence_pair in df.iterrows())
                    ),
                    total=len(df),
                    desc=f"Computing Greedy String Tiling {name}, {min_match_length}"
                ))
            features_df[f'{name}_gst_{min_match_length}'] = results

In [5]:
compute_greedy_string_tiling(train_dfs, train_features_df)

Computing Greedy String Tiling normal, 5: 100%|██████████| 2234/2234 [01:25<00:00, 26.24it/s] 
Computing Greedy String Tiling sw, 5: 100%|██████████| 2234/2234 [00:27<00:00, 80.74it/s]
Computing Greedy String Tiling lemmas, 5: 100%|██████████| 2234/2234 [00:27<00:00, 81.54it/s]
Computing Greedy String Tiling normal, 10: 100%|██████████| 2234/2234 [00:53<00:00, 41.46it/s] 
Computing Greedy String Tiling sw, 10: 100%|██████████| 2234/2234 [00:18<00:00, 122.12it/s]
Computing Greedy String Tiling lemmas, 10: 100%|██████████| 2234/2234 [00:15<00:00, 140.21it/s]


In [9]:
# Character n-grams Cosine Similarity

# Generate character n-grams for a tokenized sentence.
def char_ngrams(sent_tokens, n):
    sentence = ' '.join(sent_tokens)  # Join tokens into a single string
    return [''.join(sentence[i:i+n]) for i in range(len(sentence) - n + 1)]

# Compute the cosine similarity between two tokenized sentences based on character n-grams.
def character_ngram_similarity(sent1_tokens, sent2_tokens, n=3):
    # Generate character n-grams for both sentences
    sent1_ngrams = char_ngrams(sent1_tokens, n)
    sent2_ngrams = char_ngrams(sent2_tokens, n)

    # Combine n-grams into a single string per sentence
    sent1_ngram_str = ' '.join(sent1_ngrams)
    sent2_ngram_str = ' '.join(sent2_ngrams)

    # Initialize TfidfVectorizer
    vectorizer = TfidfVectorizer()

    # Convert the n-gram strings into TF-IDF representations
    tfidf_matrix = vectorizer.fit_transform([sent1_ngram_str, sent2_ngram_str])

    # Compute cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    return similarity[0][0]

ngram_lengths = [2, 3, 4]

def compute_character_ngram_similarity(dfs, features_df):
    for n in ngram_lengths:
        for name, df in dfs.items():
            with ThreadPoolExecutor() as executor:
                results = list(tqdm(
                    executor.map(
                        lambda sentence_pair: character_ngram_similarity(sentence_pair['0'], sentence_pair['1'], n),
                        (sentence_pair for _, sentence_pair in df.iterrows())
                    ),
                    total=len(df),
                    desc=f"Computing Character n-gram Similarity {name}, {n}"
                ))
            features_df[f'{name}_gst_{n}'] = results

In [10]:
compute_character_ngram_similarity(train_dfs, train_features_df)

Computing Character n-gram Similarity normal, 2: 100%|██████████| 2234/2234 [00:04<00:00, 542.75it/s]
Computing Character n-gram Similarity sw, 2: 100%|██████████| 2234/2234 [00:03<00:00, 610.22it/s]
Computing Character n-gram Similarity lemmas, 2: 100%|██████████| 2234/2234 [00:03<00:00, 688.30it/s]
Computing Character n-gram Similarity normal, 3: 100%|██████████| 2234/2234 [00:04<00:00, 470.02it/s]
Computing Character n-gram Similarity sw, 3: 100%|██████████| 2234/2234 [00:04<00:00, 452.72it/s]
Computing Character n-gram Similarity lemmas, 3: 100%|██████████| 2234/2234 [00:04<00:00, 542.68it/s]
Computing Character n-gram Similarity normal, 4: 100%|██████████| 2234/2234 [00:04<00:00, 500.17it/s] 
Computing Character n-gram Similarity sw, 4: 100%|██████████| 2234/2234 [00:03<00:00, 579.86it/s]
Computing Character n-gram Similarity lemmas, 4: 100%|██████████| 2234/2234 [00:03<00:00, 574.02it/s]


In [15]:
# Save into a CSV with the Normalized Gold Standard
train_features_df['gs'] = train_df['gs'] / 5.0

train_features_df.to_csv('train/stringFeatures_train.csv', index=False)

In [13]:
# Apply the same pipeline to the testing Dataset

# Load the testing dataset
test_df = preprocessor.load_dataset('../Preprocessing/STS_test.csv')

# Normalize the text
normal_test_df = preprocessor.remove_punctuation(test_df)
normal_test_df = preprocessor.convert_to_lowercase(normal_test_df)
normal_test_df = preprocessor.remove_empty_strings(normal_test_df)

# Create 2 separate DataFrames, one without stopwords and the other also lemmatized
sw_test_df = preprocessor.remove_stopwords(normal_test_df)
lemmas_test_df = preprocessor.lemmatize(sw_test_df)

# Group the 3 DataFrames into a Dictionary, with their "names"
test_dfs = {'normal': normal_test_df, 'sw': sw_test_df, 'lemmas': lemmas_test_df}

# Create the features DataFrame
test_features_df = pd.DataFrame()

# Greedy String Tiling
compute_greedy_string_tiling(test_dfs, test_features_df)

# Character n-gram Cosine Similarity
compute_character_ngram_similarity(test_dfs, test_features_df)

Computing Greedy String Tiling normal, 5: 100%|██████████| 3108/3108 [00:16<00:00, 188.07it/s]
Computing Greedy String Tiling sw, 5: 100%|██████████| 3108/3108 [00:06<00:00, 466.16it/s]
Computing Greedy String Tiling lemmas, 5: 100%|██████████| 3108/3108 [00:05<00:00, 617.52it/s]
Computing Greedy String Tiling normal, 10: 100%|██████████| 3108/3108 [00:04<00:00, 720.93it/s]
Computing Greedy String Tiling sw, 10: 100%|██████████| 3108/3108 [00:02<00:00, 1052.39it/s]
Computing Greedy String Tiling lemmas, 10: 100%|██████████| 3108/3108 [00:03<00:00, 1026.20it/s]
Computing Character n-gram Similarity normal, 2: 100%|██████████| 3108/3108 [00:04<00:00, 624.94it/s]
Computing Character n-gram Similarity sw, 2: 100%|██████████| 3108/3108 [00:05<00:00, 568.13it/s]
Computing Character n-gram Similarity lemmas, 2: 100%|██████████| 3108/3108 [00:05<00:00, 586.87it/s]
Computing Character n-gram Similarity normal, 3: 100%|██████████| 3108/3108 [00:06<00:00, 506.98it/s]
Computing Character n-gram Si

In [16]:
# Save into a CSV with the Normalized Gold Standard
test_features_df['gs'] = test_df['gs'] / 5.0

test_features_df.to_csv('test/stringFeatures_test.csv', index=False)

# Pearson Correlations

In [11]:
from scipy.stats import pearsonr

correlations = {}
for column in train_features_df.columns:
    corr, _ = pearsonr(train_features_df[column], train_df['gs'])
    correlations[column] = corr

# Convert the dictionary to a DataFrame for tabular representation
correlation_table = pd.DataFrame(list(correlations.items()), columns=['Variable', 'Correlation'])

print(correlation_table)

         Variable  Correlation
0    normal_gst_5     0.571769
1        sw_gst_5     0.666634
2    lemmas_gst_5     0.666148
3   normal_gst_10     0.526147
4       sw_gst_10     0.560675
5   lemmas_gst_10     0.560554
6    normal_gst_2     0.721553
7        sw_gst_2     0.687611
8    lemmas_gst_2     0.690158
9    normal_gst_3     0.651404
10       sw_gst_3     0.643363
11   lemmas_gst_3     0.647888
12   normal_gst_4     0.577053
13       sw_gst_4     0.635147
14   lemmas_gst_4     0.642041
