In [4]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.metrics import jaccard_distance
from nltk.util import ngrams
from nltk.wsd import lesk
from collections import Counter
import math
from itertools import chain, product
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Add the project directory to the Python path
project_dir = Path.cwd().parent
sys.path.append(str(project_dir))

from Preprocessing.preprocessingUtils import TextPreprocessor

# Ensure necessary resources are downloaded
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet_ic', quiet=True)
nltk.download('stopwords', quiet=True)

# Load the Information Content (IC) corpus
ic = wordnet_ic.ic('ic-brown.dat')

In [5]:
# Create the TextPreprocessor
preprocessor = TextPreprocessor()

# Load the training dataset
train_df = preprocessor.load_dataset('../Preprocessing/STS_train.csv')

# Display the DataFrame
train_df.head()

Unnamed: 0,0,1,gs
0,"[But, other, sources, close, to, the, sale, sa...","[But, other, sources, close, to, the, sale, sa...",4.0
1,"[Micron, has, declared, its, first, quarterly,...","[Micron, 's, numbers, also, marked, the, first...",3.75
2,"[The, fines, are, part, of, failed, Republican...","[Perry, said, he, backs, the, Senate, 's, effo...",2.8
3,"[The, American, Anglican, Council, ,, which, r...","[The, American, Anglican, Council, ,, which, r...",3.4
4,"[The, tech-loaded, Nasdaq, composite, rose, 20...","[The, technology-laced, Nasdaq, Composite, Ind...",2.4


In [6]:
# Normalize the text
normal_train_df = preprocessor.remove_punctuation(train_df)
normal_train_df = preprocessor.convert_to_lowercase(normal_train_df)
normal_train_df = preprocessor.remove_empty_strings(normal_train_df)

# Create 2 separate DataFrames, one without stopwords and the other also lemmatized
sw_train_df = preprocessor.remove_stopwords(normal_train_df)
lemmas_train_df = preprocessor.lemmatize(sw_train_df)

lemmas_train_df.head()

Unnamed: 0,0,1,gs
0,"[source, close, sale, said, vivendi, keeping, ...","[source, close, sale, said, vivendi, keeping, ...",4.0
1,"[micron, declared, first, quarterly, profit, t...","[micron, number, also, marked, first, quarterl...",3.75
2,"[fine, part, failed, republican, effort, force...","[perry, said, back, senate, effort, including,...",2.8
3,"[american, anglican, council, represents, epis...","[american, anglican, council, represents, epis...",3.4
4,"[tech-loaded, nasdaq, composite, rose, 20.96, ...","[technology-laced, nasdaq, composite, index, i...",2.4


In [7]:
# Word Sense Disambiguation

# Substitute words in a sentence based on Lesk WSD.
def substitute_words(processed_tokens, original_tokens):
    substituted_sentence = []
    for word in processed_tokens:
        # Apply Lesk algorithm using the original sentence context
        synset = lesk(original_tokens, word)
        if synset:
            # Replace with the first synonym that isn't the original word
            substitutes = [lemma.name() for lemma in synset.lemmas() if lemma.name() != word]
            substituted_sentence.append(substitutes[0] if substitutes else word)
        else:
            substituted_sentence.append(word)  # No substitution if no synset found
    return substituted_sentence

# Apply Lesk WSD to the sentences in 'df', substituting words with the most probable match in the synset.
# The original sentences are provided in 'context'.
def lesk_wsd(df, context):
    wsd_df = pd.DataFrame()

    # Apply the substitution to each sentence
    wsd_df['0'] = [substitute_words(df['0'][i], context['0'][i]) for i in range(len(df))]
    wsd_df['1'] = [substitute_words(df['1'][i], context['1'][i]) for i in range(len(df))]
    wsd_df['gs'] = df['gs']
    
    return wsd_df


wsd_train_df = lesk_wsd(lemmas_train_df, normal_train_df)
wsd_train_df.head()

Unnamed: 0,0,1,gs
0,"[reservoir, close, sales_agreement, state, viv...","[reservoir, close, sales_agreement, pronounce,...",4.0
1,"[micrometer, stated, first_base, every_quarter...","[micrometer, phone_number, besides, set, first...",3.75
2,"[fine, function, fail, republican, elbow_greas...","[perry, order, backward, United_States_Senate,...",2.8
3,"[American, Anglican, council, represent, Episc...","[American, Anglican, council, represent, Episc...",3.4
4,"[tech-loaded, National_Association_of_Securiti...","[technology-laced, National_Association_of_Sec...",2.4


In [8]:
# Group the 4 DataFrames into a Dictionary, with their "names"
train_dfs = {'normal': normal_train_df, 'sw': sw_train_df, 'lemmas': lemmas_train_df, 'wsd': wsd_train_df}

In [9]:
# Create the features DataFrame
train_features_df = pd.DataFrame()

In [10]:
# Jaccard similarity
# The first 4 features are the jaccard similarity between the sentence pairs.
for name, df in train_dfs.items():
    train_features_df[f'{name}_jaccard'] = [1 - jaccard_distance(set(sentence_pair['0']), set(sentence_pair['1'])) for _, sentence_pair in df.iterrows()]

train_features_df.head()

Unnamed: 0,normal_jaccard,sw_jaccard,lemmas_jaccard,wsd_jaccard
0,0.533333,0.473684,0.473684,0.4
1,0.388889,0.5,0.5,0.384615
2,0.333333,0.357143,0.357143,0.266667
3,0.607143,0.611111,0.611111,0.318182
4,0.192308,0.15,0.15,0.15


In [11]:
# Containment Measure
# Apart from the Jaccard distance, we also measure the containment measure (Broder, 1997)
def containment_measure(set_a, set_b):
    # Calculate the intersection of both sets
    intersection = set_a.intersection(set_b)
    
    # Return the containment measure
    return len(intersection) / min(len(set_a), len(set_b))

for name, df in train_dfs.items():
    train_features_df[f'{name}_containment'] = [containment_measure(set(sentence_pair['0']), set(sentence_pair['1'])) for _, sentence_pair in df.iterrows()]

train_features_df.head()

Unnamed: 0,normal_jaccard,sw_jaccard,lemmas_jaccard,wsd_jaccard,normal_containment,sw_containment,lemmas_containment,wsd_containment
0,0.533333,0.473684,0.473684,0.4,0.761905,0.75,0.75,0.666667
1,0.388889,0.5,0.5,0.384615,0.7,0.857143,0.857143,0.714286
2,0.333333,0.357143,0.357143,0.266667,0.5,0.555556,0.555556,0.444444
3,0.607143,0.611111,0.611111,0.318182,0.944444,0.916667,0.916667,0.583333
4,0.192308,0.15,0.15,0.15,0.357143,0.272727,0.272727,0.272727


In [12]:

# Word n-grams
# That was the case for individual words (1-grams), let us now also calculate the same measures for the general n-grams
def jaccard_similarity_ngram(sentence1, sentence2, n):
    # Generate n-grams for both sentences
    ngrams1 = set(ngrams(sentence1, n))
    ngrams2 = set(ngrams(sentence2, n))

    # Handle the case when one or both sentences are too short to have any n-grams
    if not ngrams1 and not ngrams2:
        return 1  # Consider them identical if both are too short
    elif not ngrams1 or not ngrams2:
        return 0  # No overlap if one is too short
    
    # Calculate Jaccard similarity
    return 1 - jaccard_distance(ngrams1, ngrams2)


def containment_measure_ngram(sentence1, sentence2, n):
    # Generate n-grams for both sentences
    ngrams1 = set(ngrams(sentence1, n))
    ngrams2 = set(ngrams(sentence2, n))
    
    # Handle the case when one or both sentences are too short to have any n-grams
    if not ngrams1 and not ngrams2:
        return 1  # Consider them identical if both are too short
    elif not ngrams1 or not ngrams2:
        return 0  # No overlap if one is too short

    # Calculate Jaccard similarity
    return containment_measure(ngrams1, ngrams2)

for name, df in train_dfs.items():
    for n in range(2, 4):
        train_features_df[f'{name}_jaccard_{n}gram'] = [jaccard_similarity_ngram(sentence_pair['0'], sentence_pair['1'], n) for _, sentence_pair in df.iterrows()]
        train_features_df[f'{name}_containment_{n}gram'] = [containment_measure_ngram(sentence_pair['0'], sentence_pair['1'], n) for _, sentence_pair in df.iterrows()]

In [13]:
# Pairwise Word Similarity

# Compute IDF weights for a DataFrame.
def compute_idf_weights(df):
    tokens = list(chain.from_iterable(df[['0', '1']].values.flatten()))
    token_counts = Counter(tokens)
    total_docs = len(tokens)
    return {word: math.log(total_docs / (count + 1)) for word, count in token_counts.items()}


idf_weights = compute_idf_weights(normal_train_df)

# Compute word similarity using WordNet.
def word_similarity(word1, word2, similarity_measure):
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if not synsets1 or not synsets2:
        return 0  # Return 0 if either word has no synsets

    max_similarity = 0
    for syn1 in synsets1:
        for syn2 in synsets2:
            try:
                sim = similarity_measure(syn1, syn2)
                max_similarity = max(max_similarity, sim)
            except:
                pass
            
    if max_similarity > 20:
        return 20 # Truncate big values to avoid numerical overflow
    return max_similarity

# Calculate pairwise word similarity of a sentence with respect to other
# We use the aggregation strategy by Mihalcea et al. (2006)
def directional_similarity(src_tokens, target_tokens, similarity_measure, idf_weights):
    weighted_similarities = []
    for w in src_tokens:
        w_similarities = [word_similarity(w, target, similarity_measure) for target in target_tokens]
        weighted_w_similarity = max(w_similarities) * idf_weights.get(w, 0)
        weighted_similarities.append(weighted_w_similarity)
    numerator = sum(weighted_similarities)
    denominator = sum(idf_weights.get(w, 0) for w in src_tokens)
    return numerator / denominator if denominator > 0 else 0

# Compute sentence similarity of 2 sentences, averaging their directional sentence similarities
def sentence_similarity(t1_tokens, t2_tokens, similarity_measure, idf_weights):
    sim_t1_to_t2 = directional_similarity(t1_tokens, t2_tokens, similarity_measure, idf_weights)
    sim_t2_to_t1 = directional_similarity(t2_tokens, t1_tokens, similarity_measure, idf_weights)
    
    return 0.5 * (sim_t1_to_t2 + sim_t2_to_t1)

# Define a pipeline to compute Pairwise Word Similarity using parallelization
def compute_pairwise_word_similarities(name, df, features_df):
    # Resnik similarity
    with ThreadPoolExecutor() as executor:
        resnik_results = list(tqdm(
            executor.map(
                lambda sentence_pair: sentence_similarity(sentence_pair['0'], sentence_pair['1'], lambda syn1, syn2: syn1.res_similarity(syn2, ic), idf_weights),
                (sentence_pair for _, sentence_pair in df.iterrows())
            ),
            total=len(df),
            desc=f"Computing {name} Resnik similarity"
        ))
    features_df[f'{name}_resnik_similarity'] = resnik_results

    # Normalize Resnik similarity
    min_resnik_sim = min(resnik_results)
    max_resnik_sim = max(resnik_results)
    features_df[f'{name}_resnik_similarity'] = [(res - min_resnik_sim) / (max_resnik_sim - min_resnik_sim) for res in resnik_results]

    # Lin similarity
    with ThreadPoolExecutor() as executor:
        lin_results = list(tqdm(
            executor.map(
                lambda sentence_pair: sentence_similarity(sentence_pair['0'], sentence_pair['1'], lambda syn1, syn2: syn1.lin_similarity(syn2, ic), idf_weights),
                (sentence_pair for _, sentence_pair in df.iterrows())
            ),
            total=len(df),
            desc=f"Computing {name} Lin similarity"
        ))
    features_df[f'{name}_lin_similarity'] = lin_results

    # Normalize Lin similarity
    min_lin_sim = min(lin_results)
    max_lin_sim = max(lin_results)
    features_df[f'{name}_lin_similarity'] = [(lin - min_lin_sim) / (max_lin_sim - min_lin_sim) for lin in lin_results]


In [14]:
# for name, df in {'lemmas': lemmas_train_df, 'wsd': wsd_train_df}.items():
#     compute_pairwise_word_similarities(name, df, train_features_df)

In [15]:
# WordNet Augmented Word Overlap

# Compute the maximum path similarity between two words using WordNet.
def path_similarity(w1, w2):
    synsets1 = wn.synsets(w1)
    synsets2 = wn.synsets(w2)
    
    if not synsets1 or not synsets2:
        return 0.0
    
    max_sim = 0.0
    for syn1 in synsets1:
        for syn2 in synsets2:
            try:
                sim = syn1.path_similarity(syn2, simulate_root = False)
                if sim and sim > max_sim:
                    max_sim = sim
            except:
                continue
    return max_sim

# Compute the score of a word against a sentence.
def score(w, S):
    if w in S:
        return 1
    return max(path_similarity(w, w_prime) for w_prime in S)

# Compute the WordNet-augmented coverage for two sentences S1 and S2.
def P_WN(S1, S2):
    S2_set = set(S2)
    return sum(score(w, S2_set) for w in S1) / len(S2)

# Compute the harmonic mean of P_WN(S1, S2) and P_WN(S2, S1).
def wordnet_augmented_word_overlap(S1, S2):
    P1 = P_WN(S1, S2)
    P2 = P_WN(S2, S1)
    
    if P1 + P2 == 0:
        return 0
    
    return 2 * P1 * P2 / (P1 + P2)

def compute_wordnet_augmented_word_overlap(dfs, features_df):
    for name, df in dfs.items():
        with ThreadPoolExecutor() as executor:
            results = list(tqdm(
                executor.map(
                    lambda sentence_pair: wordnet_augmented_word_overlap(sentence_pair['0'], sentence_pair['1']),
                    (sentence_pair for _, sentence_pair in df.iterrows())
                ),
                total=len(df),
                desc=f"Computing WordNet-Augmented Word Overlap {name}"
            ))
        features_df[f'{name}_wn_aug_overlap'] = results

In [16]:
# compute_wordnet_augmented_word_overlap({'lemmas': lemmas_train_df, 'wsd': wsd_train_df}, train_features_df)

In [44]:
# Weighted Word Overlap

# Extract Information Content of a synset from WordNet IC
def information_content(synset):
    pos = synset._pos
    if pos != 'n' and pos != 'v':
            return 0

    icpos = ic[pos]

    counts = icpos[synset._offset]
    if counts == 0:
        return 1e7
    else:
        return -math.log(counts / icpos[0])

# Calculate the IC of a word through its most likely synset.
# Returns 0 if the word is not found in WordNet.
def calculate_ic(word):
    synsets = wn.synsets(word)
    if not synsets:
        return 0  # Word not in WordNet
    synset = synsets[0]  # Use the first synset for simplicity
    return information_content(synset)

# Compute the weighted word overlap between two sentences.
def weighted_word_overlap(s1, s2):
    # Compute IC for words in both sentences
    ic_words1 = {word: calculate_ic(word) for word in s1}
    ic_words2 = {word: calculate_ic(word) for word in s2}
    
    # Compute WWC(S1, S2)
    common_words = set(s1).intersection(s2)
    numerator1 = sum(ic_words1[word] for word in common_words)
    denominator1 = sum(ic_words2[word] for word in s2)
    wwc_s1_s2 = numerator1 / denominator1 if denominator1 > 0 else 0
    
    # Compute WWC(S2, S1)
    numerator2 = sum(ic_words2[word] for word in common_words)
    denominator2 = sum(ic_words1[word] for word in s1)
    wwc_s2_s1 = numerator2 / denominator2 if denominator2 > 0 else 0
    
    # Harmonic mean of WWC(S1, S2) and WWC(S2, S1)
    if wwc_s1_s2 + wwc_s2_s1 == 0:
        return 0
    return (2 * wwc_s1_s2 * wwc_s2_s1) / (wwc_s1_s2 + wwc_s2_s1)

for name, df in {'lemmas': lemmas_train_df, 'wsd': wsd_train_df}.items():
    train_features_df[f'{name}_weighted_overlap'] = [weighted_word_overlap(sentence_pair['0'], sentence_pair['1']) for _, sentence_pair in df.iterrows()]

In [39]:

# Calculate Lin similarity between two lemmas using WordNet.
# Returns the maximum similarity among all possible synset pairs.
def lin_similarity(lemma1, lemma2):
    synsets1 = wn.synsets(lemma1)
    synsets2 = wn.synsets(lemma2)
    
    if not synsets1 or not synsets2:
        return 0.0
    
    max_sim = 0.0
    for syn1 in synsets1:
        for syn2 in synsets2:
            try:
                sim = syn1.lin_similarity(syn2, ic)
                if sim and sim > max_sim:
                    max_sim = sim
            except:
                continue
    return max_sim

# Perform greedy alignment of lemmas between two sentences.
# Returns set of aligned lemma pairs.
def greedy_lemma_alignment(sent1, sent2):
    # Create similarity matrix
    similarities = np.zeros((len(sent1), len(sent2)))
    for i, lemma1 in enumerate(sent1):
        for j, lemma2 in enumerate(sent2):
            similarities[i, j] = lin_similarity(lemma1, lemma2)
    
    # Greedily align lemmas
    aligned_pairs = set()
    used_indices1 = set()
    used_indices2 = set()
    
    while len(used_indices1) < len(sent1) and len(used_indices2) < len(sent2):
        # Find highest remaining similarity
        max_sim = -1
        max_i = -1
        max_j = -1
        
        for i in range(len(sent1)):
            if i in used_indices1:
                continue
            for j in range(len(sent2)):
                if j in used_indices2:
                    continue
                if similarities[i, j] > max_sim:
                    max_sim = similarities[i, j]
                    max_i = i
                    max_j = j
        
        if max_sim <= 0:
            break
            
        aligned_pairs.add((sent1[max_i], sent2[max_j]))
        used_indices1.add(max_i)
        used_indices2.add(max_j)
    
    return aligned_pairs

# Compute Greedy Lemma Aligning Overlap score between two sentences.
def greedy_lemma_aligning_overlap(sent1, sent2):
    if not sent1 or not sent2:
        return 0.0
        
    # Get aligned pairs
    aligned_pairs = greedy_lemma_alignment(sent1, sent2)
    
    # Compute similarity score for each pair
    total_sim = 0.0
    for lemma1, lemma2 in aligned_pairs:
        # Get information content for each lemma
        ic1 = max(information_content(syn) for syn in wn.synsets(lemma1)) if wn.synsets(lemma1) else 0
        ic2 = max(information_content(syn) for syn in wn.synsets(lemma2)) if wn.synsets(lemma2) else 0
        
        # Compute semantic similarity
        ssim = lin_similarity(lemma1, lemma2)
        
        # Weigh the similarity by the max IC
        pair_sim = max(ic1, ic2) * ssim
        total_sim += pair_sim
    
    # Normalize by length of longer sentence
    normalization = max(len(sent1), len(sent2))
    if normalization == 0:
        return 0.0
        
    return total_sim / normalization

def compute_greedy_lemma_aligning_overlap(dfs, features_df):
    for name, df in dfs.items():
        with ThreadPoolExecutor() as executor:
            results = list(tqdm(
                executor.map(
                    lambda sentence_pair: greedy_lemma_aligning_overlap(sentence_pair['0'], sentence_pair['1']),
                    (sentence_pair for _, sentence_pair in df.iterrows())
                ),
                total=len(df),
                desc=f"Computing Greedy Lemma Aligning Overlap {name}"
            ))
        features_df[f'{name}_glao'] = results

In [40]:
compute_greedy_lemma_aligning_overlap({'lemmas': lemmas_train_df, 'wsd': wsd_train_df}, train_features_df)

Computing Greedy Lemma Aligning Overlap lemmas: 100%|██████████| 2234/2234 [00:48<00:00, 45.94it/s] 
Computing Greedy Lemma Aligning Overlap wsd: 100%|██████████| 2234/2234 [00:37<00:00, 59.38it/s]


In [None]:
train_features_df['gs'] = train_df['gs'] / 5.0

train_features_df.to_csv('train/lexicalFeatures_train.csv', index=False)

In [None]:
# Apply the same pipeline to the testing Dataset

# Load the testing dataset
test_df = preprocessor.load_dataset('../Preprocessing/STS_test.csv')

# Normalize the text
normal_test_df = preprocessor.remove_punctuation(test_df)
normal_test_df = preprocessor.convert_to_lowercase(normal_test_df)
normal_test_df = preprocessor.remove_empty_strings(normal_test_df)

# Create 2 separate DataFrames, one without stopwords and the other also lemmatized
sw_test_df = preprocessor.remove_stopwords(normal_test_df)
lemmas_test_df = preprocessor.lemmatize(sw_test_df)

# Word Sense Disambiguation
wsd_test_df = lesk_wsd(lemmas_test_df, normal_test_df)

# Group the 4 DataFrames into a Dictionary, with their "names"
test_dfs = {'normal': normal_test_df, 'sw': sw_test_df, 'lemmas': lemmas_test_df, 'wsd': wsd_test_df}

# Create the features DataFrame
test_features_df = pd.DataFrame()

# Jaccard similarity and containment measure
for name, df in test_dfs.items():
    test_features_df[f'{name}_jaccard'] = [1 - jaccard_distance(set(sentence_pair['0']), set(sentence_pair['1'])) for _, sentence_pair in df.iterrows()]
    test_features_df[f'{name}_containment'] = [containment_measure(set(sentence_pair['0']), set(sentence_pair['1'])) for _, sentence_pair in df.iterrows()]

# Word n-grams
for name, df in test_dfs.items():
    for n in range(2, 4):
        test_features_df[f'{name}_jaccard_{n}gram'] = [jaccard_similarity_ngram(sentence_pair['0'], sentence_pair['1'], n) for _, sentence_pair in df.iterrows()]
        test_features_df[f'{name}_containment_{n}gram'] = [containment_measure_ngram(sentence_pair['0'], sentence_pair['1'], n) for _, sentence_pair in df.iterrows()]

# Pairwise Word Similarity
for name, df in {'lemmas': lemmas_test_df, 'wsd': wsd_test_df}.items():
    compute_pairwise_word_similarities(name, df, test_features_df)

# WordNet Augmented Word Overlap
compute_wordnet_augmented_word_overlap({'lemmas': lemmas_test_df, 'wsd': wsd_test_df}, test_features_df)

# Weighted Word Overlap
for name, df in {'lemmas': lemmas_test_df, 'wsd': wsd_test_df}.items():
    test_features_df[f'{name}_weighted_overlap'] = [weighted_word_overlap(sentence_pair['0'], sentence_pair['1']) for _, sentence_pair in df.iterrows()]

# Greedy Lemma Aligning Overlap
compute_greedy_lemma_aligning_overlap({'lemmas': lemmas_train_df, 'wsd': wsd_train_df}, train_features_df)

KeyboardInterrupt: 

In [None]:
test_features_df['gs'] = test_df['gs'] / 5.0

test_features_df.to_csv('test/lexicalFeatures_test.csv', index=False)

# Personal Notes
Maybe delete before turn in

In [None]:
from scipy.stats import pearsonr

correlations = {}
for column in train_features_df.columns:
    corr, _ = pearsonr(train_features_df[column], train_df['gs'])
    correlations[column] = corr

# Convert the dictionary to a DataFrame for tabular representation
correlation_table = pd.DataFrame(list(correlations.items()), columns=['Variable', 'Correlation'])

print(correlation_table)

                    Variable  Correlation
0             normal_jaccard     0.476515
1                 sw_jaccard     0.600506
2             lemmas_jaccard     0.610119
3                wsd_jaccard     0.515652
4         normal_containment     0.481160
5             sw_containment     0.616511
6         lemmas_containment     0.626920
7            wsd_containment     0.541817
8       normal_jaccard_2gram     0.334975
9   normal_containment_2gram     0.354762
10      normal_jaccard_3gram     0.279903
11  normal_containment_3gram     0.286566
12          sw_jaccard_2gram     0.421352
13      sw_containment_2gram     0.443682
14          sw_jaccard_3gram     0.155031
15      sw_containment_3gram     0.219161
16      lemmas_jaccard_2gram     0.425503
17  lemmas_containment_2gram     0.448294
18      lemmas_jaccard_3gram     0.158728
19  lemmas_containment_3gram     0.223452
20         wsd_jaccard_2gram     0.359828
21     wsd_containment_2gram     0.384835
22         wsd_jaccard_3gram     0

  corr, _ = pearsonr(train_features_df[column], train_df['gs'])


Consider removing 4grams, as they have negative correlation. Maybe that could be useful too, as "negative examples"?

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K
from scipy.stats import pearsonr

df = pd.read_csv('train/lexicalFeatures_train.csv')
test_df = pd.read_csv('test/lexicalFeatures_test.csv')

# Assuming 'df' is the training DataFrame, 'gs' is the target column,
# and 'test_df' is the separate testing DataFrame
X = df.drop(columns=['gs']).values
y = df['gs'].values

X_test = test_df.drop(columns=['gs']).values
y_test = test_df['gs'].values

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Define the model
def create_model(input_dim):
    model = Sequential()
    model.add(Input((input_dim,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1))  # For regression, single output node
    model.compile(optimizer=Adam(), loss='mean_squared_error')
    return model

# n-fold cross-validation
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
best_model = None
best_pearson = -np.inf  # Initialize with a very low value

for train_idx, val_idx in kf.split(X_scaled):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Create the model for each fold
    model = create_model(X_train.shape[1])
    
    # Train the model
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    
    # Predict on the validation set
    y_pred = model.predict(X_val)
    
    # Calculate the Pearson correlation
    corr, _ = pearsonr(y_val, y_pred.flatten())
    
    # Save the model if it achieves the best Pearson correlation
    if corr > best_pearson:
        best_pearson = corr
        best_model = model

# Test the best model on the separate testing data
y_test_pred = best_model.predict(X_test_scaled)
test_corr, _ = pearsonr(y_test, y_test_pred.flatten())
print(f'Best Pearson Correlation on Validation Set: {best_pearson}')
print(f'Pearson Correlation on Testing Set: {test_corr}')

# Optionally, clear the session to free memory
K.clear_session()


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m98/98[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 471us/step
Best Pearson Correlation on Validation Set: 0.821299965620997
Pearson Correlation on Testing Set: 0.5821285400465993

