In [2]:
import warnings
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVR
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

In [3]:
# Ensure required NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# ====== Dataset Loading and Splitting ======
def load_custom_dataset(filename):
    data = []
    with open(filename, "r") as file:
        for line in file:
            question, response, answer, label = line.strip().split('\t')
            label = float(label) / 5.0  # Normalize to [0, 1]
            data.append((response, answer, label))
    return data

In [None]:
def split_dataset(data, valid_percentage, test_percentage):
    length = len(data)
    random.shuffle(data)
    train = data[:int(length * (1 - valid_percentage - test_percentage))]
    valid = data[int(length * (1 - valid_percentage - test_percentage)):int(length * (1 - test_percentage))]
    test = data[int(length * (1 - test_percentage)):]
    return train, valid, test

In [5]:
# ====== Text Preprocessing ======
def preprocess_text(text):
    # Normalize the text by replacing curly apostrophes with straight ones
    text = text.replace("‘", "'").replace("’", "'").lower()  # Case folding and normalization
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove non-alphabetic tokens and stopwords
    stop_words = set(nltk.corpus.stopwords.words("english"))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(tokens)

In [6]:
def preprocess_data(data):
    sentence1 = [preprocess_text(item[0]) for item in data]
    sentence2 = [preprocess_text(item[1]) for item in data]
    labels = [item[2] for item in data]
    return sentence1, sentence2, labels

In [None]:
import pandas as pd

# Function to save raw and preprocessed data
def save_raw_and_preprocessed(raw_data, preprocessed_data1, preprocessed_data2, labels, filename):
    # Convert to DataFrame
    df = pd.DataFrame({
        "Raw Sentence 1": [item[0] for item in raw_data],
        "Raw Sentence 2": [item[1] for item in raw_data],
        "Preprocessed Sentence 1": preprocessed_data1,
        "Preprocessed Sentence 2": preprocessed_data2,
        "Label": labels
    })
    df.to_csv(filename, index=False)
    print(f"Saved dataset to {filename}")

In [None]:
import random
# ====== Main Workflow ======
# Load and preprocess dataset
raw_data = load_custom_dataset("/kaggle/input/dataset/expand.txt")
train_data, valid_data, test_data = split_dataset(raw_data, valid_percentage=0.1, test_percentage=0.1)

In [7]:
# ====== Load Dataset ======
# Load train, validation, and test datasets from CSV files
train_file = "/kaggle/input/w2v-dataset/train_data_ENG-W2V.csv"
valid_file = "/kaggle/input/w2v-dataset/valid_data_ENG-W2V.csv"
test_file = "/kaggle/input/w2v-dataset/test_data_ENG-W2V.csv"

# Read datasets
train_data = pd.read_csv(train_file).values
valid_data = pd.read_csv(valid_file).values
test_data = pd.read_csv(test_file).values

# ====== Preprocessing ======
x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_test1, x_test2, y_test = preprocess_data(test_data)

# Output shapes for verification
print(f"Train data: {len(x_train1)} pairs, {len(y_train)} labels")
print(f"Validation data: {len(x_valid1)} pairs, {len(y_valid)} labels")
print(f"Test data: {len(x_test1)} pairs, {len(y_test)} labels")


Train data: 2916 pairs, 2916 labels
Validation data: 365 pairs, 365 labels
Test data: 365 pairs, 365 labels


In [None]:
# Save raw and preprocessed datasets
save_raw_and_preprocessed(train_data, x_train1, x_train2, y_train, "train_data_with_preprocessing.csv")
save_raw_and_preprocessed(valid_data, x_valid1, x_valid2, y_valid, "valid_data_with_preprocessing.csv")
save_raw_and_preprocessed(test_data, x_test1, x_test2, y_test, "test_data_with_preprocessing.csv")

In [8]:
# Check a few random samples from the preprocessed training data
print("Preprocessed x_train1 Samples:")
for i in random.sample(range(len(x_train1)), 5):  # Randomly select 5 indices
    print(f"Original Sentence 1: {train_data[i][0]}")
    print(f"Preprocessed Sentence 1: {x_train1[i]}")
    print()

print("Preprocessed x_train2 Samples:")
for i in random.sample(range(len(x_train2)), 5):  # Randomly select 5 indices
    print(f"Original Sentence 2: {train_data[i][1]}")
    print(f"Preprocessed Sentence 2: {x_train2[i]}")
    print()

Preprocessed x_train1 Samples:
Original Sentence 1: A data structure in c plus plus which is a collection of data that is kept in order First in first out
Preprocessed Sentence 1: data structure c plus plus collection data kept order first first

Original Sentence 1: Data members - LRB - attributes - RRB - and member functions
Preprocessed Sentence 1: data members lrb attributes rrb member functions

Original Sentence 1: enqueue which adds data to the queue and dequeue which deletes data from the queue
Preprocessed Sentence 1: enqueue adds data queue dequeue deletes data queue

Original Sentence 1: You implement a list in which the head pointer points to the element most recently pushed onto the list and the pop function changes the head pointer to point to the next to last element in the list and removes the element head pointer previously pointed to
Preprocessed Sentence 1: implement list head pointer points element recently pushed onto list pop function changes head pointer point ne

In [9]:
def build_domain_specific_word_embedding(sentences, method="w2v", epochs=30):
    tokenized_sentences = [sentence.split() for sentence in sentences]
    
    if method == "w2v":
        save_path = "new-param-domain_w2v.model"
        model = Word2Vec(
            vector_size=200,
            window=4,
            min_count=1,
            workers=4,
            sg=1,  # CBOW (0), set to 1 for Skip-Gram
            sample=6e-5,
            alpha=0.03,
            min_alpha=0.0007,
            negative=15
        )
    elif method == "fast":
        save_path = "new-param-domain_fasttext.model"  # Different filename for FastText
        model = FastText(
            vector_size=300,
            window=3,
            min_count=1,
            workers=4
        )
    else:
        raise ValueError("Unsupported embedding method.")

    print(f"Building vocabulary with {len(tokenized_sentences)} sentences")
    model.build_vocab(tokenized_sentences)

    print(f"Training the model for {epochs} epochs")
    model.train(
        tokenized_sentences,
        total_examples=model.corpus_count,
        epochs=epochs
    )

    model.init_sims(replace=True)
    model.save(save_path)
    print(f"Model saved at {save_path}")
    
    return model

def load_pretrained_word_embedding(load_path="new-param-domain_w2v.model"):
    return Word2Vec.load(load_path)

In [10]:
# Define your custom cosine similarity function
def cosine_similarity_custom(vec1, vec2):
    """
    Custom implementation of cosine similarity.
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:  # Handle zero-vector case
        return 0.0
    
    return dot_product / (norm_vec1 * norm_vec2)

In [22]:
def average_cosine_similarity(kalimat1, kalimat2, embedding, method="w2v"):
    """
    Generate sentence features using Word2Vec, FastText.
    """
    if method in ["w2v", "fast"]:
        similarities = []
        for text1, text2 in zip(kalimat1, kalimat2):
            tokens1 = [word for word in text1.split() if word in embedding.wv]
            tokens2 = [word for word in text2.split() if word in embedding.wv]
            
            if not tokens1 or not tokens2:
                similarities.append(0)
                continue
            
            vec1 = np.mean([embedding.wv[word] for word in tokens1], axis=0).reshape(1, -1)
            vec2 = np.mean([embedding.wv[word] for word in tokens2], axis=0).reshape(1, -1)
            
            similarities.append(cosine_similarity_custom(vec1.flatten(), vec2.flatten()))
        return np.array(similarities).reshape(-1, 1)

    else:
        raise ValueError("Unsupported method.")

In [12]:
def word_distance(text_group1, text_group2, embed_model):
    similarity_scores = []
    for group1, group2 in zip(text_group1, text_group2):
        group1_tokens = [word for word in group1.split() if word in embed_model.wv]
        group2_tokens = [word for word in group2.split() if word in embed_model.wv]

        if not group1_tokens or not group2_tokens:
            similarity_scores.append(0)
        else:
            similarity_scores.append(-embed_model.wv.wmdistance(group1_tokens, group2_tokens))

    return np.array(similarity_scores).reshape(-1, 1)

In [14]:
# for SIF
from sklearn.decomposition import TruncatedSVD
    
def eliminate_first_component(matrix):
    svd_model = TruncatedSVD(n_components=1, random_state=42)
    svd_model.fit(matrix)
    principal_component = svd_model.components_
    return matrix - matrix.dot(principal_component.T) * principal_component
    
def sif_cos(text_group1, text_group2, embed_model, frequency_map, smoothing_factor=0.001):
    freq_sum = sum(frequency_map.values())
    all_embeddings = []

    for group1, group2 in zip(text_group1, text_group2):
        tokens1 = [word for word in group1.split() if word in embed_model.wv]
        tokens2 = [word for word in group2.split() if word in embed_model.wv]

        if not tokens1 or not tokens2:
            all_embeddings.extend([np.zeros(embed_model.vector_size), np.zeros(embed_model.vector_size)])
            continue

        weights1 = [smoothing_factor / (smoothing_factor + frequency_map.get(word, 1e-5) / freq_sum) for word in tokens1]
        weights2 = [smoothing_factor / (smoothing_factor + frequency_map.get(word, 1e-5) / freq_sum) for word in tokens2]

        embedding1 = np.average([embed_model.wv[word] for word in tokens1], axis=0, weights=weights1)
        embedding2 = np.average([embed_model.wv[word] for word in tokens2], axis=0, weights=weights2)

        all_embeddings.extend([embedding1, embedding2])

    all_embeddings = np.array(all_embeddings)
    all_embeddings = eliminate_first_component(all_embeddings)

    similarities = [
        (
            np.dot(all_embeddings[i], all_embeddings[i + 1]) /
            (np.linalg.norm(all_embeddings[i]) * np.linalg.norm(all_embeddings[i + 1]))
            if np.linalg.norm(all_embeddings[i]) > 0 and np.linalg.norm(all_embeddings[i + 1]) > 0 else 0
        )
        for i in range(0, len(all_embeddings), 2)
    ]

    return np.array(similarities).reshape(-1, 1)

In [15]:

# ====== Regression Model ======
class RegressionModel:
    def __init__(self, model_type="linear"):
        if model_type == "linear":
            self.model = self.LinearRegressionCustom()
        elif model_type == "svr":
            self.model = SVR(kernel="linear")
        elif model_type == "rfr":
            self.model = self.RandomForestCustom()
        else:
            raise ValueError("Unsupported model type.")

    class LinearRegressionCustom:
        def __init__(self):
            self.weights = None

        def fit(self, X, y):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y
        def fit(self, X, y):
            # Convert y to NumPy and ensure matching rows
            y = np.array(y)
            if X.shape[0] != y.shape[0]:
                raise ValueError(f"Shape mismatch: X has {X.shape[0]} rows but y has {y.shape[0]} rows.")
            
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y

        def predict(self, X):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            return X @ self.weights

    class RandomForestCustom:
        def __init__(self, n_estimators=100, max_depth=None):
            self.n_estimators = n_estimators
            self.max_depth = max_depth
            self.trees = []
    
        def fit(self, X, y):
            from sklearn.tree import DecisionTreeRegressor
    
            # Ensure y is a NumPy array
            y = np.array(y)
    
            n_samples = X.shape[0]
    
            for _ in range(self.n_estimators):
                # Ensure indices are integers for proper indexing
                indices = np.random.choice(range(n_samples), size=n_samples, replace=True)
                X_sample = X[indices]
                y_sample = y[indices]
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X_sample, y_sample)
                self.trees.append(tree)
    
        def predict(self, X):
            # Aggregate predictions from all trees
            predictions = np.array([tree.predict(X) for tree in self.trees])
            return np.mean(predictions, axis=0)

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]
        return sum(squared_errors) / len(squared_errors)

    @staticmethod
    def mean_absolute_error(y_true, y_pred):
        absolute_errors = [abs(true - pred) for true, pred in zip(y_true, y_pred)]
        return sum(absolute_errors) / len(absolute_errors)

    @staticmethod
    def pearsonr(x, y):
        mean_x = sum(x) / len(x)
        mean_y = sum(y) / len(y)
        numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
        denominator = ((sum((xi - mean_x) ** 2 for xi in x) * sum((yi - mean_y) ** 2 for yi in y)) ** 0.5)
        return (numerator / denominator if denominator != 0 else 0.0, None)

    def train(self, x_train, y_train):
        self.model.fit(x_train, y_train)

    def evaluate(self, x, y):
        predictions = self.model.predict(x)
        mse = self.mean_squared_error(y, predictions)
        mae = self.mean_absolute_error(y, predictions)
        pearson_corr, _ = self.pearsonr(y, predictions)
        return mse, mae, pearson_corr

    def predict(self, x):
        return self.model.predict(x)

In [16]:
def feature_extraction(train_set1, train_set2, val_set1, val_set2, test_set, embed_model, frequency_map, method):
    if method == "averageCosine":
        train_similarities = average_cosine_similarity(train_set1, train_set2, embed_model)
        val_similarities = average_cosine_similarity(val_set1, val_set2, embed_model)
        test_similarities = average_cosine_similarity(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "wordDis":
        train_similarities = word_distance(train_set1, train_set2, embed_model)
        val_similarities = word_distance(val_set1, val_set2, embed_model)
        test_similarities = word_distance(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "sifCos":
        train_similarities = sif_cos(train_set1, train_set2, embed_model, frequency_map)
        val_similarities = sif_cos(val_set1, val_set2, embed_model, frequency_map)
        test_similarities = sif_cos(test_set["sentence1"], test_set["sentence2"], embed_model, frequency_map)
    else:
        raise ValueError(f"Feature extraction method '{method}' is not supported.")

    return np.array(train_similarities), np.array(val_similarities), np.array(test_similarities)

In [17]:
# ====== Testing Predictions ======
def print_test_predictions(model, x_test_features, x_test1, x_test2, y_test):
    predictions = model.predict(x_test_features)
    true_scores = np.array(y_test) * 5.0
    predicted_scores = predictions * 5.0
    pearson_corr, _ = RegressionModel.pearsonr(true_scores, predicted_scores)
    results = pd.DataFrame({
        "Sentence 1": x_test1,
        "Sentence 2": x_test2,
        "True Similarity Score": true_scores,
        "Predicted Similarity Score": predicted_scores
    })
    print(results.head(10))
    print(f"Pearson Correlation: {pearson_corr:.4f}")
    results.to_csv("train_predictions.csv", index=False)

In [18]:
pip install cvxopt


Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install POT


Collecting POT
  Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (865 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.6/865.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.5
Note: you may need to restart the kernel to use updated packages.


In [25]:
import pandas as pd

# ====== Frequency Computation for SIF ======
from collections import Counter
all_sentences = x_train1 + x_train2 + x_valid1 + x_valid2 + x_test1 + x_test2
# Compute word frequencies for SIF
all_tokens = [token for sentence in all_sentences for token in sentence.split()]
freqs = Counter(all_tokens)

# ====== Feature Extraction and Model Evaluation ======
methods = ["averageCosine", "wordDis", "sifCos"]
embedding_methods = {"w2v": "Word2Vec", "fast": "FastText"}  # Map method names to embeddings

# Store results for all methods and models
results = []

# Loop over embedding methods and feature extraction methods
for embedding_type in ["w2v", "fast"]:
    print(f"Building embedding model: {embedding_methods[embedding_type]}")
    embedding = build_domain_specific_word_embedding(all_sentences, method=embedding_type)

    for method in methods:
        current_method = method
        print(f"Using feature extraction method: {current_method}")
        
        x_train_features, x_valid_features, x_test_features = feature_extraction(
            x_train1, x_train2, x_valid1, x_valid2, {"sentence1": x_test1, "sentence2": x_test2}, embedding, freqs, current_method
        )

        for reg_model in ["linear", "svr", "rfr"]:
            model = RegressionModel(model_type=reg_model)
            model.train(x_train_features, y_train)

            # Evaluate on validation and test sets
            val_mse, val_mae, val_pearson = model.evaluate(x_valid_features, y_valid)
            test_mse, test_mae, test_pearson = model.evaluate(x_test_features, y_test)

            print(f"Validation Performance ({embedding_type}, {current_method}, {reg_model}):")
            print(f"MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, Pearson Correlation: {val_pearson:.4f}")

            print(f"Test Performance ({embedding_type}, {current_method}, {reg_model}):")
            print(f"MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, Pearson Correlation: {test_pearson:.4f}")

            # Store results
            results.append({
                "embedding_type": embedding_type,
                "method": current_method,
                "model": reg_model,
                "pearson": test_pearson,
                "test_predictions": model.predict(x_test_features),
                "test_features": x_test_features,
                "x_test1": x_test1,
                "x_test2": x_test2,
                "y_test": y_test
            })

# Save all results as CSV files (for both validation and test)
for i, result in enumerate(results, start=1):
    # Retrieve original raw sentences for validation and test
    raw_valid1 = [item[0] for item in valid_data]  # Original raw Sentence 1 for validation
    raw_valid2 = [item[1] for item in valid_data]  # Original raw Sentence 2 for validation
    raw_test1 = [item[0] for item in test_data]  # Original raw Sentence 1 for test
    raw_test2 = [item[1] for item in test_data]  # Original raw Sentence 2 for test
    val_predictions = model.predict(x_valid_features)  # Prediksi untuk data validasi
    # Create DataFrame for validation predictions
    val_df = pd.DataFrame({
        "Original Sentence 1": raw_valid1,  # Append raw sentence 1
        "Original Sentence 2": raw_valid2,  # Append raw sentence 2
        "Preprocessed Sentence 1": x_valid1,  # Preprocessed validation sentence 1
        "Preprocessed Sentence 2": x_valid2,  # Preprocessed validation sentence 2
        "True Similarity Score": [y * 5 for y in y_valid],  # Rescale validation true scores
        "Predicted Similarity Score": [y * 5 for y in val_predictions]  # Validation predictions rescaled
    })

    # Save validation result CSV
    val_filename = f"val_result_{i}_{result['embedding_type']}_{result['method']}_{result['model']}.csv"
    val_df.to_csv(val_filename, index=False)
    print(f"Saved validation result: {val_filename}")

    # Create DataFrame for test predictions
    test_df = pd.DataFrame({
        "Original Sentence 1": raw_test1,  # Append raw sentence 1
        "Original Sentence 2": raw_test2,  # Append raw sentence 2
        "Preprocessed Sentence 1": result["x_test1"],  # Preprocessed test sentence 1
        "Preprocessed Sentence 2": result["x_test2"],  # Preprocessed test sentence 2
        "True Similarity Score": [y * 5 for y in result["y_test"]],  # Rescale to [0, 5]
        "Predicted Similarity Score": [y * 5 for y in result["test_predictions"]]  # Predicted scores rescaled
    })

    # Save test result CSV
    test_filename = f"test_result_{i}_{result['embedding_type']}_{result['method']}_{result['model']}.csv"
    test_df.to_csv(test_filename, index=False)
    print(f"Saved test result: {test_filename}")


Building embedding model: Word2Vec
Building vocabulary with 7292 sentences
Training the model for 30 epochs


  model.init_sims(replace=True)


Model saved at new-param-domain_w2v.model
Using feature extraction method: averageCosine
Validation Performance (w2v, averageCosine, linear):
MSE: 0.0420, MAE: 0.1631, Pearson Correlation: 0.4196
Test Performance (w2v, averageCosine, linear):
MSE: 0.0404, MAE: 0.1640, Pearson Correlation: 0.5256
Validation Performance (w2v, averageCosine, svr):
MSE: 0.0417, MAE: 0.1566, Pearson Correlation: 0.4196
Test Performance (w2v, averageCosine, svr):
MSE: 0.0408, MAE: 0.1592, Pearson Correlation: 0.5256
Validation Performance (w2v, averageCosine, rfr):
MSE: 0.0513, MAE: 0.1696, Pearson Correlation: 0.4024
Test Performance (w2v, averageCosine, rfr):
MSE: 0.0545, MAE: 0.1751, Pearson Correlation: 0.4006
Using feature extraction method: wordDis
Validation Performance (w2v, wordDis, linear):
MSE: 0.0438, MAE: 0.1651, Pearson Correlation: 0.3796
Test Performance (w2v, wordDis, linear):
MSE: 0.0445, MAE: 0.1641, Pearson Correlation: 0.4436
Validation Performance (w2v, wordDis, svr):
MSE: 0.0436, MAE: 

  model.init_sims(replace=True)


Model saved at new-param-domain_fasttext.model
Using feature extraction method: averageCosine
Validation Performance (fast, averageCosine, linear):
MSE: 0.0432, MAE: 0.1626, Pearson Correlation: 0.4022
Test Performance (fast, averageCosine, linear):
MSE: 0.0406, MAE: 0.1601, Pearson Correlation: 0.5187
Validation Performance (fast, averageCosine, svr):
MSE: 0.0424, MAE: 0.1585, Pearson Correlation: 0.4022
Test Performance (fast, averageCosine, svr):
MSE: 0.0416, MAE: 0.1587, Pearson Correlation: 0.5187
Validation Performance (fast, averageCosine, rfr):
MSE: 0.0519, MAE: 0.1684, Pearson Correlation: 0.4064
Test Performance (fast, averageCosine, rfr):
MSE: 0.0536, MAE: 0.1746, Pearson Correlation: 0.4162
Using feature extraction method: wordDis
Validation Performance (fast, wordDis, linear):
MSE: 0.0434, MAE: 0.1665, Pearson Correlation: 0.3901
Test Performance (fast, wordDis, linear):
MSE: 0.0458, MAE: 0.1672, Pearson Correlation: 0.4165
Validation Performance (fast, wordDis, svr):
MSE: