In [93]:
import pandas as pd
import nltk
import ast
import sys
from pathlib import Path
from nltk.metrics import jaccard_distance
from nltk.util import ngrams

# Add the project directory to the Python path
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

In [71]:
# Load the training dataset from the sibling Preprocessing folder.
# As the preview below shows, columns '0' and '1' hold the two sentences of each
# pair and 'gs' holds a numeric value; at this point each sentence is still the
# *string* representation of a token list (e.g. "['But', 'other', ...]").
train_df = pd.read_csv('../Preprocessing/STS_train.csv')

# Display the first rows of the DataFrame (last expression of the cell is rendered)
train_df.head()

Unnamed: 0,0,1,gs
0,"['But', 'other', 'sources', 'close', 'to', 'th...","['But', 'other', 'sources', 'close', 'to', 'th...",4.0
1,"['Micron', 'has', 'declared', 'its', 'first', ...","['Micron', ""'s"", 'numbers', 'also', 'marked', ...",3.75
2,"['The', 'fines', 'are', 'part', 'of', 'failed'...","['Perry', 'said', 'he', 'backs', 'the', 'Senat...",2.8
3,"['The', 'American', 'Anglican', 'Council', ','...","['The', 'American', 'Anglican', 'Council', ','...",3.4
4,"['The', 'tech-loaded', 'Nasdaq', 'composite', ...","['The', 'technology-laced', 'Nasdaq', 'Composi...",2.4


In [72]:
# Turn the 2 first columns from strings to actual lists of strings
def _parse_token_list(cell):
    """Parse a stringified Python list, leaving already-parsed values untouched.

    ``ast.literal_eval`` only accepts strings (or AST nodes) and raises
    ``ValueError`` when handed a list. Because the assignment below updates
    ``train_df`` in place, re-running this cell would otherwise fail on the
    lists produced by the previous run. Passing non-string values through
    keeps the cell idempotent.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# Positional selection: only the first two columns (the sentence pair) are parsed
train_df.iloc[:, :2] = train_df.iloc[:, :2].map(_parse_token_list)

train_df.head()

Unnamed: 0,0,1,gs
0,"[But, other, sources, close, to, the, sale, sa...","[But, other, sources, close, to, the, sale, sa...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, 's, numbers, also, marked, the, first...",3.75
2,"[The, fines, are, part, of, failed, Republican...","[Perry, said, he, backs, the, Senate, 's, effo...",2.8
3,"[The, American, Anglican, Council, ,, which, r...","[The, American, Anglican, Council, ,, which, r...",3.4
4,"[The, tech-loaded, Nasdaq, composite, rose, 20...","[The, technology-laced, Nasdaq, Composite, Ind...",2.4


In [73]:
# Create the TextPreprocessor (project helper imported from Preprocessing.preprocessingUtils)
preprocessor = TextPreprocessor()

# Normalize the text. Each step is fed the result of the previous one, so the
# order matters: punctuation removal -> lower-casing -> dropping empty strings.
# (Behaviour is as suggested by the method names; their implementation is not shown here.)
normal_train_df = preprocessor.remove_punctuation(train_df)
normal_train_df = preprocessor.convert_to_lowercase(normal_train_df)
normal_train_df = preprocessor.remove_empty_strings(normal_train_df)

# Create 2 separate DataFrames, one without stopwords and the other also lemmatized
# (lemmatization is applied on top of the stopword-free frame, not the normalized one).
# NOTE(review): in the feature tables further down, every displayed `sw_*` value is
# identical to its `lemmas_*` counterpart. Confirm that `lemmatize` returns a new
# frame rather than modifying `sw_train_df` (or its token lists) in place.
sw_train_df = preprocessor.remove_stopwords(normal_train_df)
lemmas_train_df = preprocessor.lemmatize(sw_train_df)

lemmas_train_df.head()

Unnamed: 0,0,1,gs
0,"[source, close, sale, said, vivendi, keeping, ...","[source, close, sale, said, vivendi, keeping, ...",4.0
1,"[micron, declared, first, quarterly, profit, t...","[micron, number, also, marked, first, quarterl...",3.75
2,"[fine, part, failed, republican, effort, force...","[perry, said, back, senate, effort, including,...",2.8
3,"[american, anglican, council, represents, epis...","[american, anglican, council, represents, epis...",3.4
4,"[tech-loaded, nasdaq, composite, rose, 20.96, ...","[technology-laced, nasdaq, composite, index, i...",2.4


In [None]:
# Bundle the three preprocessing variants under short labels; the labels are
# used as column-name prefixes when the lexical features are built.
train_dfs = dict(
    normal=normal_train_df,
    sw=sw_train_df,
    lemmas=lemmas_train_df,
)

In [110]:
# Create the lexical features DataFrame
lexical_features_df = pd.DataFrame()


def _jaccard_similarity(tokens_a, tokens_b):
    """Jaccard similarity (intersection size / union size) of two token sequences.

    Both inputs are reduced to sets, so repeated tokens count once.
    Two empty inputs are treated as identical and yield 1.0: NLTK's
    ``jaccard_distance`` divides by the size of the union and would raise
    ``ZeroDivisionError`` for them. This matches the convention the n-gram
    features in this notebook use for pairs that are empty on both sides.
    """
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not set_a and not set_b:
        return 1.0
    return 1 - jaccard_distance(set_a, set_b)


# The first 3 features are the jaccard similarity between the sentence pairs,
# one column per preprocessing variant.
for name, df in train_dfs.items():
    # Zipping the two sentence columns avoids building a Series per row as `iterrows` does
    lexical_features_df[f'{name}_jaccard'] = [
        _jaccard_similarity(first, second) for first, second in zip(df['0'], df['1'])
    ]

lexical_features_df.head()

Unnamed: 0,normal_jaccard,sw_jaccard,lemmas_jaccard
0,0.533333,0.473684,0.473684
1,0.388889,0.5,0.5
2,0.333333,0.357143,0.357143
3,0.607143,0.611111,0.611111
4,0.192308,0.15,0.15


In [111]:
# Apart from the Jaccard distance, we also measure the containment measure (Broder, 1997)
def containment_measure(set_a, set_b):
    """Containment of the smaller set in the other one.

    Computed as ``len(set_a & set_b) / min(len(set_a), len(set_b))``.

    Args:
        set_a: First set of items (e.g. tokens or n-gram tuples).
        set_b: Second set of items.

    Returns:
        A float in [0, 1]. Two empty sets are considered identical (1.0) and
        an empty set compared with a non-empty one has no overlap (0.0). This
        is the same convention the n-gram features in this notebook use.
    """
    # Guard the degenerate cases first: the formula divides by the size of the
    # smaller set and would raise ZeroDivisionError if either set were empty.
    if not set_a and not set_b:
        return 1.0
    if not set_a or not set_b:
        return 0.0

    # Calculate the intersection of both sets
    intersection = set_a.intersection(set_b)

    # Return the containment measure
    return len(intersection) / min(len(set_a), len(set_b))

# One containment column per preprocessing variant, computed on the token sets
# of the two sentences of every pair (row order is preserved).
for name, df in train_dfs.items():
    lexical_features_df[f'{name}_containment'] = [
        containment_measure(set(first), set(second))
        for first, second in zip(df['0'], df['1'])
    ]

lexical_features_df.head()

Unnamed: 0,normal_jaccard,sw_jaccard,lemmas_jaccard,normal_containment,sw_containment,lemmas_containment
0,0.533333,0.473684,0.473684,0.761905,0.75,0.75
1,0.388889,0.5,0.5,0.7,0.857143,0.857143
2,0.333333,0.357143,0.357143,0.5,0.555556,0.555556
3,0.607143,0.611111,0.611111,0.944444,0.916667,0.916667
4,0.192308,0.15,0.15,0.357143,0.272727,0.272727


In [None]:

# That was the case for individual words (1-grams), let us now also calculate the same measures for the general n-grams
def jaccard_similarity_ngram(sentence1, sentence2, n):
    """Jaccard similarity between the n-gram sets of two token sequences.

    Args:
        sentence1: First sequence of tokens.
        sentence2: Second sequence of tokens.
        n: Size of the n-grams.

    Returns:
        1 when neither sentence yields any n-gram, 0 when exactly one of them
        yields none, otherwise ``1 - jaccard_distance`` of the two n-gram sets.
    """
    # Build the set of n-grams of each sentence
    grams_a, grams_b = (set(ngrams(tokens, n)) for tokens in (sentence1, sentence2))

    # Regular case: both sentences are long enough to produce n-grams
    if grams_a and grams_b:
        return 1 - jaccard_distance(grams_a, grams_b)

    # At least one sentence is too short: identical (1) only if both are
    return 0 if grams_a or grams_b else 1


def containment_measure_ngram(sentence1, sentence2, n):
    """Containment measure between the n-gram sets of two token sequences.

    Args:
        sentence1: First sequence of tokens.
        sentence2: Second sequence of tokens.
        n: Size of the n-grams.

    Returns:
        1 when neither sentence yields any n-gram, 0 when exactly one of them
        yields none, otherwise ``containment_measure`` of the two n-gram sets.
    """
    # Build the set of n-grams of each sentence
    grams_a = set(ngrams(sentence1, n))
    grams_b = set(ngrams(sentence2, n))

    # Sentences shorter than n tokens produce no n-grams at all
    a_empty, b_empty = not grams_a, not grams_b
    if a_empty and b_empty:
        return 1  # Both too short: consider them identical
    if a_empty or b_empty:
        return 0  # Only one too short: no overlap

    # Calculate the containment measure on the n-gram sets
    return containment_measure(grams_a, grams_b)

# For every preprocessing variant and every n-gram size from 2 to 4, add the
# n-gram Jaccard column followed by the n-gram containment column.
for name, df in train_dfs.items():
    for n in range(2, 5):
        lexical_features_df[f'{name}_jaccard_{n}gram'] = [
            jaccard_similarity_ngram(first, second, n)
            for first, second in zip(df['0'], df['1'])
        ]
        lexical_features_df[f'{name}_containment_{n}gram'] = [
            containment_measure_ngram(first, second, n)
            for first, second in zip(df['0'], df['1'])
        ]

lexical_features_df.head()

Unnamed: 0,normal_jaccard,sw_jaccard,lemmas_jaccard,normal_containment,sw_containment,lemmas_containment,normal_jaccard_2gram,normal_containment_2gram,normal_jaccard_3gram,normal_containment_3gram,...,sw_jaccard_3gram,sw_containment_3gram,sw_jaccard_4gram,sw_containment_4gram,lemmas_jaccard_2gram,lemmas_containment_2gram,lemmas_jaccard_3gram,lemmas_containment_3gram,lemmas_jaccard_4gram,lemmas_containment_4gram
0,0.533333,0.473684,0.473684,0.761905,0.75,0.75,0.4,0.636364,0.342857,0.571429,...,0.411765,0.7,0.375,0.666667,0.444444,0.727273,0.411765,0.7,0.375,0.666667
1,0.388889,0.5,0.5,0.7,0.857143,0.857143,0.142857,0.333333,0.047619,0.125,...,0.272727,0.6,0.2,0.5,0.333333,0.666667,0.272727,0.6,0.2,0.5
2,0.333333,0.357143,0.357143,0.5,0.555556,0.555556,0.192308,0.333333,0.074074,0.142857,...,0.0,0.0,0.0,0.0,0.0625,0.125,0.0,0.0,0.0,0.0
3,0.607143,0.611111,0.611111,0.944444,0.916667,0.916667,0.592593,0.941176,0.576923,0.9375,...,0.5625,0.9,0.533333,0.888889,0.588235,0.909091,0.5625,0.9,0.533333,0.888889
4,0.192308,0.15,0.15,0.357143,0.272727,0.272727,0.035714,0.076923,0.0,0.0,...,0.0,0.0,0.0,0.0,0.05,0.1,0.0,0.0,0.0,0.0
