In [1]:
from gensim.downloader import load as gensim_load

# Load pre-trained embeddings
fasttext = gensim_load('fasttext-wiki-news-subwords-300')  # FastText



In [2]:
import warnings
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVR
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

In [3]:
# Ensure required NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# ====== Dataset Loading and Splitting ======
def load_custom_dataset(filename):
    data = []
    with open(filename, "r") as file:
        for line in file:
            question, response, answer, label = line.strip().split('\t')
            label = float(label) / 5.0  # Normalize to [0, 1]
            data.append((response, answer, label))
    return data

In [5]:
def split_dataset(data, valid_percentage, test_percentage):
    length = len(data)
    random.shuffle(data)
    train = data[:int(length * (1 - valid_percentage - test_percentage))]
    valid = data[int(length * (1 - valid_percentage - test_percentage)):int(length * (1 - test_percentage))]
    test = data[int(length * (1 - test_percentage)):]
    return train, valid, test

In [6]:
# ====== Text Preprocessing ======
def preprocess_text(text):
    # Normalize the text by replacing curly apostrophes with straight ones
    text = text.replace("‘", "'").replace("’", "'").lower()  # Case folding and normalization

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove non-alphabetic tokens and stopwords
    stop_words = set(nltk.corpus.stopwords.words("english"))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(tokens)

In [7]:
def preprocess_data(data):
    sentence1 = [preprocess_text(item[0]) for item in data]
    sentence2 = [preprocess_text(item[1]) for item in data]
    labels = [item[2] for item in data]
    return sentence1, sentence2, labels

In [8]:
import pandas as pd

# Function to save raw and preprocessed data
def save_raw_and_preprocessed(raw_data, preprocessed_data1, preprocessed_data2, labels, filename):
    # Convert to DataFrame
    df = pd.DataFrame({
        "Raw Sentence 1": [item[0] for item in raw_data],
        "Raw Sentence 2": [item[1] for item in raw_data],
        "Preprocessed Sentence 1": preprocessed_data1,
        "Preprocessed Sentence 2": preprocessed_data2,
        "Label": labels
    })
    df.to_csv(filename, index=False)
    print(f"Saved dataset to {filename}")

In [9]:
# ====== Load Dataset ======
# Load train, validation, and test datasets from CSV files
train_file = "/content/train_data_ENG-W2V.csv"
valid_file = "/content/valid_data_ENG-W2V.csv"
test_file = "/content/test_data_ENG-W2V.csv"

# Read datasets
train_data = pd.read_csv(train_file).values
valid_data = pd.read_csv(valid_file).values
test_data = pd.read_csv(test_file).values

# ====== Preprocessing ======
x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_test1, x_test2, y_test = preprocess_data(test_data)

# Output shapes for verification
print(f"Train data: {len(x_train1)} pairs, {len(y_train)} labels")
print(f"Validation data: {len(x_valid1)} pairs, {len(y_valid)} labels")
print(f"Test data: {len(x_test1)} pairs, {len(y_test)} labels")

Train data: 2916 pairs, 2916 labels
Validation data: 365 pairs, 365 labels
Test data: 365 pairs, 365 labels


In [10]:
# Check a few random samples from the preprocessed training data
print("Preprocessed x_train1 Samples:")
for i in random.sample(range(len(x_train1)), 5):  # Randomly select 5 indices
    print(f"Original Sentence 1: {train_data[i][0]}")
    print(f"Preprocessed Sentence 1: {x_train1[i]}")
    print()

print("Preprocessed x_train2 Samples:")
for i in random.sample(range(len(x_train2)), 5):  # Randomly select 5 indices
    print(f"Original Sentence 2: {train_data[i][1]}")
    print(f"Preprocessed Sentence 2: {x_train2[i]}")
    print()

Preprocessed x_train1 Samples:
Original Sentence 1: An alias - LRB - synonym - RRB - for the name of the object that its operand points to in memory It is the dereferencing operator
Preprocessed Sentence 1: alias lrb synonym rrb name object operand points memory dereferencing operator

Original Sentence 1: A function prototype tells the compiler the function name return type and the number and type of parameters without revealing the implementations contained in the function definition
Preprocessed Sentence 1: function prototype tells compiler function name return type number type parameters without revealing implementations contained function definition

Original Sentence 1: the type char has a null - LRB - n - RRB - element at the very end
Preprocessed Sentence 1: type char null lrb n rrb element end

Original Sentence 1: they take up twice as much memory for each node
Preprocessed Sentence 1: take twice much memory node

Original Sentence 1: It selects the minimum from an array and 

In [11]:
import numpy as np

def avgCos(sentences1, sentences2, embedding):
    similarities = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [token for token in sent1.split() if token in embedding.key_to_index]
        tokens2 = [token for token in sent2.split() if token in embedding.key_to_index]

        if not tokens1 or not tokens2:
            similarities.append(0)
            continue

        vec1 = np.mean([embedding.get_vector(token) for token in tokens1], axis=0)
        vec2 = np.mean([embedding.get_vector(token) for token in tokens2], axis=0)

        similarities.append(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))

    return np.array(similarities).reshape(-1, 1)


In [12]:
def word_distance(sentences1, sentences2, embedding):
    sims = []
    for sent1, sent2 in zip(sentences1, sentences2):
        sent1_tokens = [token for token in sent1.split() if token in embedding.key_to_index]
        sent2_tokens = [token for token in sent2.split() if token in embedding.key_to_index]

        if not sent1_tokens or not sent2_tokens:
            sims.append(0)
        else:
            sims.append(-embedding.wmdistance(sent1_tokens, sent2_tokens))  # Lower is better

    return np.array(sims).reshape(-1, 1)


In [13]:
from sklearn.decomposition import TruncatedSVD

def remove_first_principal_component(X):
    svd = TruncatedSVD(n_components=1, random_state=42)
    svd.fit(X)
    pc = svd.components_
    return X - X.dot(pc.T) * pc

def sif_cos(sentences1, sentences2, embedding, freqs, a=0.001):
    total_freq = sum(freqs.values())
    embeddings = []

    for sent1, sent2 in zip(sentences1, sentences2):
        sent1_tokens = [token for token in sent1.split() if token in embedding.key_to_index]
        sent2_tokens = [token for token in sent2.split() if token in embedding.key_to_index]

        if not sent1_tokens or not sent2_tokens:
            embeddings.extend([np.zeros(embedding.vector_size), np.zeros(embedding.vector_size)])
            continue

        weights1 = [a / (a + freqs.get(token, 1e-5) / total_freq) for token in sent1_tokens]
        weights2 = [a / (a + freqs.get(token, 1e-5) / total_freq) for token in sent2_tokens]

        embedding1 = np.average([embedding.get_vector(token) for token in sent1_tokens], axis=0, weights=weights1)
        embedding2 = np.average([embedding.get_vector(token) for token in sent2_tokens], axis=0, weights=weights2)

        embeddings.extend([embedding1, embedding2])

    embeddings = np.array(embeddings)
    embeddings = remove_first_principal_component(embeddings)

    sims = [
        (np.dot(embeddings[i], embeddings[i + 1]) /
         (np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i + 1]))
         if np.linalg.norm(embeddings[i]) > 0 and np.linalg.norm(embeddings[i + 1]) > 0 else 0)
        for i in range(0, len(embeddings), 2)
    ]

    return np.array(sims).reshape(-1, 1)


In [14]:
# ====== Regression Model ======
class RegressionModel:
    def __init__(self, model_type="linear"):
        if model_type == "linear":
            self.model = self.LinearRegressionCustom()
        elif model_type == "svr":
            self.model = SVR(kernel="linear")
        elif model_type == "rfr":
            self.model = self.RandomForestCustom()
        else:
            raise ValueError("Unsupported model type.")

    class LinearRegressionCustom:
        def __init__(self):
            self.weights = None

        def fit(self, X, y):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y
        def fit(self, X, y):
            # Convert y to NumPy and ensure matching rows
            y = np.array(y)
            if X.shape[0] != y.shape[0]:
                raise ValueError(f"Shape mismatch: X has {X.shape[0]} rows but y has {y.shape[0]} rows.")

            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y

        def predict(self, X):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            return X @ self.weights

    class RandomForestCustom:
        def __init__(self, n_estimators=100, max_depth=None):
            self.n_estimators = n_estimators
            self.max_depth = max_depth
            self.trees = []

        def fit(self, X, y):
            from sklearn.tree import DecisionTreeRegressor

            # Ensure y is a NumPy array
            y = np.array(y)

            n_samples = X.shape[0]

            for _ in range(self.n_estimators):
                # Ensure indices are integers for proper indexing
                indices = np.random.choice(range(n_samples), size=n_samples, replace=True)
                X_sample = X[indices]
                y_sample = y[indices]
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X_sample, y_sample)
                self.trees.append(tree)

        def predict(self, X):
            # Aggregate predictions from all trees
            predictions = np.array([tree.predict(X) for tree in self.trees])
            return np.mean(predictions, axis=0)

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]
        return sum(squared_errors) / len(squared_errors)

    @staticmethod
    def mean_absolute_error(y_true, y_pred):
        absolute_errors = [abs(true - pred) for true, pred in zip(y_true, y_pred)]
        return sum(absolute_errors) / len(absolute_errors)

    @staticmethod
    def pearsonr(x, y):
        mean_x = sum(x) / len(x)
        mean_y = sum(y) / len(y)
        numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
        denominator = ((sum((xi - mean_x) ** 2 for xi in x) * sum((yi - mean_y) ** 2 for yi in y)) ** 0.5)
        return (numerator / denominator if denominator != 0 else 0.0, None)

    def train(self, x_train, y_train):
        self.model.fit(x_train, y_train)

    def evaluate(self, x, y):
        predictions = self.model.predict(x)
        mse = self.mean_squared_error(y, predictions)
        mae = self.mean_absolute_error(y, predictions)
        pearson_corr, _ = self.pearsonr(y, predictions)
        return mse, mae, pearson_corr

    def predict(self, x):
        return self.model.predict(x)

In [15]:
def feature_extraction(train_set1, train_set2, val_set1, val_set2, test_set, embed_model, frequency_map, method):
    if method == "averageCosine":
        train_similarities = average_cosine_similarity(train_set1, train_set2, embed_model)
        val_similarities = average_cosine_similarity(val_set1, val_set2, embed_model)
        test_similarities = average_cosine_similarity(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "wordDis":
        train_similarities = word_distance(train_set1, train_set2, embed_model)
        val_similarities = word_distance(val_set1, val_set2, embed_model)
        test_similarities = word_distance(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "sifCos":
        train_similarities = sif_cos(train_set1, train_set2, embed_model, frequency_map)
        val_similarities = sif_cos(val_set1, val_set2, embed_model, frequency_map)
        test_similarities = sif_cos(test_set["sentence1"], test_set["sentence2"], embed_model, frequency_map)
    else:
        raise ValueError(f"Feature extraction method '{method}' is not supported.")

    return np.array(train_similarities), np.array(val_similarities), np.array(test_similarities)

In [16]:
# ====== Testing Predictions ======
def print_test_predictions(model, x_test_features, x_test1, x_test2, y_test):
    predictions = model.predict(x_test_features)
    true_scores = np.array(y_test) * 5.0
    predicted_scores = predictions * 5.0
    pearson_corr, _ = RegressionModel.pearsonr(true_scores, predicted_scores)
    results = pd.DataFrame({
        "Sentence 1": x_test1,
        "Sentence 2": x_test2,
        "True Similarity Score": true_scores,
        "Predicted Similarity Score": predicted_scores
    })
    print(results.head(10))
    print(f"Pearson Correlation: {pearson_corr:.4f}")
    results.to_csv("train_predictions.csv", index=False)

In [17]:
pip install cvxopt



In [18]:
pip install POT

Collecting POT
  Downloading POT-0.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading POT-0.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.5


In [19]:
import pandas as pd

# ====== Frequency Computation for SIF ======
from collections import Counter
all_sentences = x_train1 + x_train2 + x_valid1 + x_valid2 + x_test1 + x_test2
# Compute word frequencies for SIF
all_tokens = [token for sentence in all_sentences for token in sentence.split()]
freqs = Counter(all_tokens)

In [20]:
fasttext["king"]


array([-1.2063e-01,  5.1695e-03, -1.2447e-02, -7.8528e-03, -2.3738e-02,
       -8.2595e-02,  4.5790e-02, -1.5382e-01,  6.4550e-02,  1.2893e-01,
        2.7643e-02,  1.5958e-02,  7.7559e-02,  6.0516e-02,  1.2737e-01,
        8.4766e-02,  6.3890e-02, -1.7687e-01,  4.3017e-02, -1.8031e-02,
       -3.3041e-02,  2.1930e-02, -1.1328e-02,  6.6453e-02,  1.5826e-01,
       -2.3008e-02, -4.3616e-03, -2.2379e-02,  4.4891e-02,  3.0103e-03,
       -1.5565e-02, -7.6785e-02, -9.2186e-02,  5.7907e-02, -2.7658e-02,
        5.4500e-03,  1.8975e-02,  4.2939e-02,  3.4704e-03,  4.0449e-02,
       -4.0245e-03, -1.1594e-01, -5.8337e-03,  3.2509e-02, -8.6535e-02,
        7.2000e-02, -2.2299e-02,  1.3079e-02, -3.9515e-02,  6.8996e-02,
        9.2300e-02, -7.5371e-02,  5.9412e-03, -3.4945e-02, -3.3417e-02,
       -9.9982e-02,  1.6438e-02,  6.3739e-02, -6.2391e-02,  7.8285e-04,
       -2.9210e-02, -9.6416e-02,  7.2910e-02,  4.5905e-02, -8.3387e-02,
        7.1969e-02,  4.0932e-02, -5.6454e-03,  1.3709e-01, -1.17

In [21]:
from collections import Counter

# Compute word frequencies
all_sentences = x_train1 + x_train2 + x_valid1 + x_valid2 + x_test1 + x_test2
all_tokens = [token for sentence in all_sentences for token in sentence.split()]
freqs = Counter(all_tokens)


In [22]:
# Extract Features
x_train_features_avg = avgCos(x_train1, x_train2, fasttext)
x_valid_features_avg = avgCos(x_valid1, x_valid2, fasttext)
x_test_features_avg = avgCos(x_test1, x_test2, fasttext)

x_train_features_wmd = word_distance(x_train1, x_train2, fasttext)
x_valid_features_wmd = word_distance(x_valid1, x_valid2, fasttext)
x_test_features_wmd = word_distance(x_test1, x_test2, fasttext)

x_train_features_sif = sif_cos(x_train1, x_train2, fasttext, freqs)
x_valid_features_sif = sif_cos(x_valid1, x_valid2, fasttext, freqs)
x_test_features_sif = sif_cos(x_test1, x_test2, fasttext, freqs)


In [23]:
import warnings
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
from gensim.downloader import load as gensim_load
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from collections import Counter
from sklearn.decomposition import TruncatedSVD

In [24]:
# Feature Extraction Methods
feature_methods = {
    "avgCos": {"train": x_train_features_avg, "valid": x_valid_features_avg, "test": x_test_features_avg},
    "wordDis": {"train": x_train_features_wmd, "valid": x_valid_features_wmd, "test": x_test_features_wmd},
    "sifCos": {"train": x_train_features_sif, "valid": x_valid_features_sif, "test": x_test_features_sif}
}

# Train and Evaluate Models
models = {"linear": RegressionModel("linear"), "svr": RegressionModel("svr"), "rfr": RegressionModel("rfr")}
results = []

for feature_name, feature_data in feature_methods.items():
    for model_name, model in models.items():
        model.train(feature_data["train"], y_train)
        val_mse, val_mae, val_pearson = model.evaluate(feature_data["valid"], y_valid)
        test_mse, test_mae, test_pearson = model.evaluate(feature_data["test"], y_test)

        print(f"Validation MSE ({feature_name}-{model_name}): {val_mse:.4f}, MAE: {val_mae:.4f}, Pearson: {val_pearson:.4f}")
        print(f"Test MSE ({feature_name}-{model_name}): {test_mse:.4f}, MAE: {test_mae:.4f}, Pearson: {test_pearson:.4f}")

        predictions = model.predict(feature_data["test"])
        results.append({
            "feature": feature_name,
            "model": model_name,
            "test_predictions": predictions,
            "x_test1": x_test1,
            "x_test2": x_test2,
            "y_test": y_test
        })

# Save all results as CSV files
for i, result in enumerate(results, start=1):
    raw_test1 = [item[0] for item in test_data]
    raw_test2 = [item[1] for item in test_data]
    test_df = pd.DataFrame({
        "Original Sentence 1": raw_test1,
        "Original Sentence 2": raw_test2,
        "Preprocessed Sentence 1": result["x_test1"],
        "Preprocessed Sentence 2": result["x_test2"],
        "True Similarity Score": [y * 5 for y in result["y_test"]],
        "Predicted Similarity Score": [y * 5 for y in result["test_predictions"]]
    })
    test_filename = f"test_result_{i}_{result['feature']}_{result['model']}.csv"
    test_df.to_csv(test_filename, index=False)
    print(f"Saved test result: {test_filename}")


Validation MSE (avgCos-linear): 0.0442, MAE: 0.1717, Pearson: 0.3674
Test MSE (avgCos-linear): 0.0448, MAE: 0.1716, Pearson: 0.4414
Validation MSE (avgCos-svr): 0.0440, MAE: 0.1636, Pearson: 0.3674
Test MSE (avgCos-svr): 0.0463, MAE: 0.1679, Pearson: 0.4414
Validation MSE (avgCos-rfr): 0.0523, MAE: 0.1680, Pearson: 0.3860
Test MSE (avgCos-rfr): 0.0560, MAE: 0.1767, Pearson: 0.3928
Validation MSE (wordDis-linear): 0.0435, MAE: 0.1677, Pearson: 0.3871
Test MSE (wordDis-linear): 0.0481, MAE: 0.1715, Pearson: 0.3676
Validation MSE (wordDis-svr): 0.0431, MAE: 0.1612, Pearson: 0.3871
Test MSE (wordDis-svr): 0.0498, MAE: 0.1704, Pearson: 0.3676
Validation MSE (wordDis-rfr): 0.0805, MAE: 0.2536, Pearson: 0.4177
Test MSE (wordDis-rfr): 0.0744, MAE: 0.2387, Pearson: 0.3718
Validation MSE (sifCos-linear): 0.0479, MAE: 0.1745, Pearson: 0.2694
Test MSE (sifCos-linear): 0.0487, MAE: 0.1693, Pearson: 0.3476
Validation MSE (sifCos-svr): 0.0473, MAE: 0.1665, Pearson: 0.2694
Test MSE (sifCos-svr): 0.050