In [23]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Add the project directory to the Python path
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

# Ensure necessary resources are downloaded
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet_ic', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [24]:
# Create the TextPreprocessor
preprocessor = TextPreprocessor()

# Load the training dataset
train_df = preprocessor.load_dataset('../Preprocessing/STS_train.csv')

# Normalize the text
normal_train_df = preprocessor.remove_punctuation(train_df)
normal_train_df = preprocessor.convert_to_lowercase(normal_train_df)
normal_train_df = preprocessor.remove_empty_strings(normal_train_df)

# Create 2 separate DataFrames, one without stopwords and the other also lemmatized
sw_train_df = preprocessor.remove_stopwords(normal_train_df)
lemmas_train_df = preprocessor.lemmatize(sw_train_df)

In [25]:
# Group the 3 DataFrames into a Dictionary, with their "names"
train_dfs = {'normal': normal_train_df, 'sw': sw_train_df, 'lemmas': lemmas_train_df}

# Create the features DataFrame
train_features_df = pd.DataFrame()

In [26]:
# Greedy String Tiling (GST)

# Apply Greedy String Tiling to find maximal matching substrings (tiles) between two tokenized sentences.
def greedy_string_tiling(sentence1_tokens, sentence2_tokens, min_match_length=2):
    # Convert tokenized sentences to full sentences
    sentence1 = " ".join(sentence1_tokens)
    sentence2 = " ".join(sentence2_tokens)

    matched_indices1 = set()
    matched_indices2 = set()
    tile_lengths = []

    while True:
        max_tile = None
        max_length = 0

        # Find the longest match not covered by existing tiles
        for i in range(len(sentence1)):
            for j in range(len(sentence2)):
                length = 0
                while (
                    i + length < len(sentence1) and
                    j + length < len(sentence2) and
                    sentence1[i + length] == sentence2[j + length] and
                    (i + length not in matched_indices1) and
                    (j + length not in matched_indices2)
                ):
                    length += 1

                if length >= min_match_length and length > max_length:
                    max_tile = (i, j, length)
                    max_length = length

        # If no valid tile is found, stop
        if not max_tile:
            break

        # Mark the matched indices as covered
        start1, start2, length = max_tile
        for k in range(length):
            matched_indices1.add(start1 + k)
            matched_indices2.add(start2 + k)

        tile_lengths.append(max_tile[2])

    # Aggregate all tile lengths by summing and normalizing by sentence length
    return np.sum(tile_lengths) / max(len(sentence1), len(sentence2))

gst_min_lengths = [5, 10]

def compute_greedy_string_tiling(dfs, features_df):
    for min_match_length in gst_min_lengths:
        for name, df in dfs.items():
            with ThreadPoolExecutor() as executor:
                results = list(tqdm(
                    executor.map(
                        lambda sentence_pair: greedy_string_tiling(sentence_pair['0'], sentence_pair['1'], min_match_length),
                        (sentence_pair for _, sentence_pair in df.iterrows())
                    ),
                    total=len(df),
                    desc=f"Computing Greedy String Tiling {name}, {min_match_length}"
                ))
            features_df[f'{name}_gst_{min_match_length}'] = results

In [27]:
compute_greedy_string_tiling(train_dfs, train_features_df)

Computing Greedy String Tiling normal, 5: 100%|██████████| 2234/2234 [01:25<00:00, 26.05it/s] 
Computing Greedy String Tiling sw, 5: 100%|██████████| 2234/2234 [00:21<00:00, 103.00it/s]
Computing Greedy String Tiling lemmas, 5: 100%|██████████| 2234/2234 [00:25<00:00, 87.20it/s] 
Computing Greedy String Tiling normal, 10: 100%|██████████| 2234/2234 [00:49<00:00, 44.73it/s] 
Computing Greedy String Tiling sw, 10: 100%|██████████| 2234/2234 [00:36<00:00, 60.74it/s] 
Computing Greedy String Tiling lemmas, 10: 100%|██████████| 2234/2234 [00:16<00:00, 136.67it/s]


In [28]:
# Character n-grams Cosine Similarity

# Generate character n-grams for a tokenized sentence.
def char_ngrams(sent_tokens, n):
    sentence = ' '.join(sent_tokens)  # Join tokens into a single string
    return [''.join(sentence[i:i+n]) for i in range(len(sentence) - n + 1)]

# Compute the cosine similarity between two tokenized sentences based on character n-grams.
def character_ngram_similarity(sent1_tokens, sent2_tokens, n=3):
    # Generate character n-grams for both sentences
    sent1_ngrams = char_ngrams(sent1_tokens, n)
    sent2_ngrams = char_ngrams(sent2_tokens, n)

    # Combine n-grams into a single string per sentence
    sent1_ngram_str = ' '.join(sent1_ngrams)
    sent2_ngram_str = ' '.join(sent2_ngrams)

    # Initialize TfidfVectorizer
    vectorizer = TfidfVectorizer()

    # Convert the n-gram strings into TF-IDF representations
    tfidf_matrix = vectorizer.fit_transform([sent1_ngram_str, sent2_ngram_str])

    # Compute cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    return similarity[0][0]

ngram_lengths = [2, 3, 4, 5]

def compute_character_ngram_similarity(dfs, features_df):
    for n in ngram_lengths:
        for name, df in dfs.items():
            with ThreadPoolExecutor() as executor:
                results = list(tqdm(
                    executor.map(
                        lambda sentence_pair: character_ngram_similarity(sentence_pair['0'], sentence_pair['1'], n),
                        (sentence_pair for _, sentence_pair in df.iterrows())
                    ),
                    total=len(df),
                    desc=f"Computing Character n-gram Similarity {name}, {n}"
                ))
            features_df[f'{name}_char_{n}gram'] = results

In [29]:
compute_character_ngram_similarity(train_dfs, train_features_df)

Computing Character n-gram Similarity normal, 2: 100%|██████████| 2234/2234 [00:03<00:00, 652.44it/s] 
Computing Character n-gram Similarity sw, 2: 100%|██████████| 2234/2234 [00:03<00:00, 571.43it/s]
Computing Character n-gram Similarity lemmas, 2: 100%|██████████| 2234/2234 [00:02<00:00, 838.05it/s]
Computing Character n-gram Similarity normal, 3: 100%|██████████| 2234/2234 [00:03<00:00, 570.53it/s] 
Computing Character n-gram Similarity sw, 3: 100%|██████████| 2234/2234 [00:04<00:00, 552.44it/s] 
Computing Character n-gram Similarity lemmas, 3: 100%|██████████| 2234/2234 [00:03<00:00, 625.02it/s]
Computing Character n-gram Similarity normal, 4: 100%|██████████| 2234/2234 [00:04<00:00, 513.35it/s]
Computing Character n-gram Similarity sw, 4: 100%|██████████| 2234/2234 [00:04<00:00, 553.82it/s] 
Computing Character n-gram Similarity lemmas, 4: 100%|██████████| 2234/2234 [00:03<00:00, 609.62it/s] 
Computing Character n-gram Similarity normal, 5: 100%|██████████| 2234/2234 [00:04<00:00,

In [30]:
# Save into a CSV with the Normalized Gold Standard
train_features_df['gs'] = train_df['gs'] / 5.0

train_features_df.to_csv('train/stringFeatures_train.csv', index=False)

In [31]:
# Apply the same pipeline to the testing Dataset

# Load the testing dataset
test_df = preprocessor.load_dataset('../Preprocessing/STS_test.csv')

# Normalize the text
normal_test_df = preprocessor.remove_punctuation(test_df)
normal_test_df = preprocessor.convert_to_lowercase(normal_test_df)
normal_test_df = preprocessor.remove_empty_strings(normal_test_df)

# Create 2 separate DataFrames, one without stopwords and the other also lemmatized
sw_test_df = preprocessor.remove_stopwords(normal_test_df)
lemmas_test_df = preprocessor.lemmatize(sw_test_df)

# Group the 3 DataFrames into a Dictionary, with their "names"
test_dfs = {'normal': normal_test_df, 'sw': sw_test_df, 'lemmas': lemmas_test_df}

# Create the features DataFrame
test_features_df = pd.DataFrame()

# Greedy String Tiling
compute_greedy_string_tiling(test_dfs, test_features_df)

# Character n-gram Cosine Similarity
compute_character_ngram_similarity(test_dfs, test_features_df)

Computing Greedy String Tiling normal, 5: 100%|██████████| 3108/3108 [00:13<00:00, 238.43it/s]
Computing Greedy String Tiling sw, 5: 100%|██████████| 3108/3108 [00:06<00:00, 476.63it/s]
Computing Greedy String Tiling lemmas, 5: 100%|██████████| 3108/3108 [00:08<00:00, 362.67it/s]
Computing Greedy String Tiling normal, 10: 100%|██████████| 3108/3108 [00:28<00:00, 107.50it/s]
Computing Greedy String Tiling sw, 10: 100%|██████████| 3108/3108 [00:01<00:00, 1689.81it/s]
Computing Greedy String Tiling lemmas, 10: 100%|██████████| 3108/3108 [00:01<00:00, 2396.25it/s]
Computing Character n-gram Similarity normal, 2: 100%|██████████| 3108/3108 [00:04<00:00, 621.89it/s]
Computing Character n-gram Similarity sw, 2: 100%|██████████| 3108/3108 [00:02<00:00, 1337.53it/s] 
Computing Character n-gram Similarity lemmas, 2: 100%|██████████| 3108/3108 [00:03<00:00, 874.14it/s] 
Computing Character n-gram Similarity normal, 3: 100%|██████████| 3108/3108 [00:04<00:00, 674.87it/s]
Computing Character n-gram

In [32]:
# Save into a CSV with the Normalized Gold Standard
test_features_df['gs'] = test_df['gs'] / 5.0

test_features_df.to_csv('test/stringFeatures_test.csv', index=False)

# Pearson Correlations

In [33]:
from scipy.stats import pearsonr

correlations = {}
for column in train_features_df.columns:
    corr, _ = pearsonr(train_features_df[column], train_df['gs'])
    correlations[column] = corr

# Convert the dictionary to a DataFrame for tabular representation
correlation_table = pd.DataFrame(list(correlations.items()), columns=['Variable', 'Correlation'])

print(correlation_table)

             Variable  Correlation
0        normal_gst_5     0.571769
1            sw_gst_5     0.666634
2        lemmas_gst_5     0.666148
3       normal_gst_10     0.526147
4           sw_gst_10     0.560675
5       lemmas_gst_10     0.560554
6   normal_char_2gram     0.721553
7       sw_char_2gram     0.687611
8   lemmas_char_2gram     0.690158
9   normal_char_3gram     0.651404
10      sw_char_3gram     0.643363
11  lemmas_char_3gram     0.647888
12  normal_char_4gram     0.577053
13      sw_char_4gram     0.635147
14  lemmas_char_4gram     0.642041
15  normal_char_5gram     0.503058
16      sw_char_5gram     0.625407
17  lemmas_char_5gram     0.634286
18                 gs     1.000000


In [34]:
from scipy.stats import pearsonr

correlations = {}
for column in test_features_df.columns:
    corr, _ = pearsonr(test_features_df[column], test_df['gs'])
    correlations[column] = corr

# Convert the dictionary to a DataFrame for tabular representation
correlation_table = pd.DataFrame(list(correlations.items()), columns=['Variable', 'Correlation'])

print(correlation_table)

             Variable  Correlation
0        normal_gst_5     0.557095
1            sw_gst_5     0.627919
2        lemmas_gst_5     0.623206
3       normal_gst_10     0.460836
4           sw_gst_10     0.460177
5       lemmas_gst_10     0.460322
6   normal_char_2gram     0.618557
7       sw_char_2gram     0.605632
8   lemmas_char_2gram     0.601264
9   normal_char_3gram     0.561061
10      sw_char_3gram     0.590396
11  lemmas_char_3gram     0.586472
12  normal_char_4gram     0.490803
13      sw_char_4gram     0.580157
14  lemmas_char_4gram     0.578780
15  normal_char_5gram     0.423879
16      sw_char_5gram     0.565421
17  lemmas_char_5gram     0.565477
18                 gs     1.000000
