In [1]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import gensim
import nltk
import numpy as np
import pandas as pd
import random
from gensim.models import Word2Vec, KeyedVectors, FastText
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.decomposition import TruncatedSVD

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import re
from nltk.tokenize import word_tokenize

def preprocess_text_indo(text):
    # Normalize text (convert to lowercase)
    text = text.lower()

    # Remove new lines and extra spaces
    text = re.sub(r"\s+", " ", text.strip())

    # Remove punctuation using regex
    text = re.sub(r"[^\w\s]", "", text)  # Removes all punctuation but keeps words and spaces

    # Tokenize text
    tokens = word_tokenize(text)

    return " ".join(tokens)

In [5]:
# ====== Dataset Loading and Splitting ======
def load_indo_dataset(filename):
    """Load dataset and convert it into a list of (answer, response, label)."""
    df = pd.read_csv(filename)
    data = [(row['answer'], row['response'], row['label'] / 5.0) for _, row in df.iterrows()]
    return data

In [6]:
def split_dataset(data, valid_percentage=0.1, test_percentage=0.1):
    """Split the dataset into training, validation, and test sets."""
    length = len(data)
    random.shuffle(data)
    train = data[:int(length * (1 - valid_percentage - test_percentage))]
    valid = data[int(length * (1 - valid_percentage - test_percentage)):int(length * (1 - test_percentage))]
    test = data[int(length * (1 - test_percentage)):]
    return train, valid, test


In [7]:
def preprocess_data(data):
    """Preprocess the data and return cleaned sentences and labels."""
    sentence1 = [preprocess_text_indo(item[0]) for item in data]
    sentence2 = [preprocess_text_indo(item[1]) for item in data]
    labels = [item[2] for item in data]
    return sentence1, sentence2, labels

In [8]:
def build_domain_specific_word_embedding(sentences, method="w2v", epochs=30):
    tokenized_sentences = [sentence.split() for sentence in sentences]
    
    if method == "w2v":
        save_path = "id-domain_w2v.model"
        model = Word2Vec(
            vector_size=200,
            window=4,
            min_count=1,
            workers=4,
            sg=1,  # CBOW (0), set to 1 for Skip-Gram
            sample=6e-5,
            alpha=0.03,
            min_alpha=0.0007,
            negative=15
        )
    elif method == "fast":
        save_path = "id-domain_fasttext.model"  # Different filename for FastText
        model = FastText(
            vector_size=300,
            window=3,
            min_count=1,
            workers=4
        )
    else:
        raise ValueError("Unsupported embedding method.")

    print(f"Building vocabulary with {len(tokenized_sentences)} sentences")
    model.build_vocab(tokenized_sentences)

    print(f"Training the model for {epochs} epochs")
    model.train(
        tokenized_sentences,
        total_examples=model.corpus_count,
        epochs=epochs
    )

    model.init_sims(replace=True)
    model.save(save_path)
    print(f"Model saved at {save_path}")
    
    return model

def load_pretrained_word_embedding(load_path="new-param-domain_w2v.model"):
    return Word2Vec.load(load_path)

In [9]:
# Define your custom cosine similarity function
def cosine_similarity_custom(vec1, vec2):
    """
    Custom implementation of cosine similarity.
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:  # Handle zero-vector case
        return 0.0
    
    return dot_product / (norm_vec1 * norm_vec2)

In [10]:
def average_cosine_similarity(kalimat1, kalimat2, embedding, method="w2v"):
    """
    Generate sentence features using Word2Vec, FastText.
    """
    if method in ["w2v", "fast"]:
        similarities = []
        for text1, text2 in zip(kalimat1, kalimat2):
            tokens1 = [word for word in text1.split() if word in embedding.wv]
            tokens2 = [word for word in text2.split() if word in embedding.wv]
            
            if not tokens1 or not tokens2:
                similarities.append(0)
                continue
            
            vec1 = np.mean([embedding.wv[word] for word in tokens1], axis=0).reshape(1, -1)
            vec2 = np.mean([embedding.wv[word] for word in tokens2], axis=0).reshape(1, -1)
            
            similarities.append(cosine_similarity_custom(vec1.flatten(), vec2.flatten()))
        return np.array(similarities).reshape(-1, 1)

    else:
        raise ValueError("Unsupported method.")

In [11]:
def word_distance(text_group1, text_group2, embed_model):
    similarity_scores = []
    for group1, group2 in zip(text_group1, text_group2):
        group1_tokens = [word for word in group1.split() if word in embed_model.wv]
        group2_tokens = [word for word in group2.split() if word in embed_model.wv]

        if not group1_tokens or not group2_tokens:
            similarity_scores.append(0)
        else:
            similarity_scores.append(-embed_model.wv.wmdistance(group1_tokens, group2_tokens))

    return np.array(similarity_scores).reshape(-1, 1)

In [12]:
# for SIF
from sklearn.decomposition import TruncatedSVD
    
def eliminate_first_component(matrix):
    svd_model = TruncatedSVD(n_components=1, random_state=42)
    svd_model.fit(matrix)
    principal_component = svd_model.components_
    return matrix - matrix.dot(principal_component.T) * principal_component
    
def sif_cos(text_group1, text_group2, embed_model, frequency_map, smoothing_factor=0.001):
    freq_sum = sum(frequency_map.values())
    all_embeddings = []

    for group1, group2 in zip(text_group1, text_group2):
        tokens1 = [word for word in group1.split() if word in embed_model.wv]
        tokens2 = [word for word in group2.split() if word in embed_model.wv]

        if not tokens1 or not tokens2:
            all_embeddings.extend([np.zeros(embed_model.vector_size), np.zeros(embed_model.vector_size)])
            continue

        weights1 = [smoothing_factor / (smoothing_factor + frequency_map.get(word, 1e-5) / freq_sum) for word in tokens1]
        weights2 = [smoothing_factor / (smoothing_factor + frequency_map.get(word, 1e-5) / freq_sum) for word in tokens2]

        embedding1 = np.average([embed_model.wv[word] for word in tokens1], axis=0, weights=weights1)
        embedding2 = np.average([embed_model.wv[word] for word in tokens2], axis=0, weights=weights2)

        all_embeddings.extend([embedding1, embedding2])

    all_embeddings = np.array(all_embeddings)
    all_embeddings = eliminate_first_component(all_embeddings)

    similarities = [
        (
            np.dot(all_embeddings[i], all_embeddings[i + 1]) /
            (np.linalg.norm(all_embeddings[i]) * np.linalg.norm(all_embeddings[i + 1]))
            if np.linalg.norm(all_embeddings[i]) > 0 and np.linalg.norm(all_embeddings[i + 1]) > 0 else 0
        )
        for i in range(0, len(all_embeddings), 2)
    ]

    return np.array(similarities).reshape(-1, 1)

In [13]:
def feature_extraction(train_set1, train_set2, val_set1, val_set2, test_set, embed_model, frequency_map, method):
    if method == "averageCosine":
        train_similarities = average_cosine_similarity(train_set1, train_set2, embed_model)
        val_similarities = average_cosine_similarity(val_set1, val_set2, embed_model)
        test_similarities = average_cosine_similarity(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "wordDis":
        train_similarities = word_distance(train_set1, train_set2, embed_model)
        val_similarities = word_distance(val_set1, val_set2, embed_model)
        test_similarities = word_distance(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "sifCos":
        train_similarities = sif_cos(train_set1, train_set2, embed_model, frequency_map)
        val_similarities = sif_cos(val_set1, val_set2, embed_model, frequency_map)
        test_similarities = sif_cos(test_set["sentence1"], test_set["sentence2"], embed_model, frequency_map)
    else:
        raise ValueError(f"Feature extraction method '{method}' is not supported.")

    return np.array(train_similarities), np.array(val_similarities), np.array(test_similarities)

In [14]:

# ====== Regression Model ======
class RegressionModel:
    def __init__(self, model_type="linear"):
        if model_type == "linear":
            self.model = self.LinearRegressionCustom()
        elif model_type == "svr":
            self.model = SVR(kernel="linear")
        elif model_type == "rfr":
            self.model = self.RandomForestCustom()
        else:
            raise ValueError("Unsupported model type.")

    class LinearRegressionCustom:
        def __init__(self):
            self.weights = None

        def fit(self, X, y):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y
        def fit(self, X, y):
            # Convert y to NumPy and ensure matching rows
            y = np.array(y)
            if X.shape[0] != y.shape[0]:
                raise ValueError(f"Shape mismatch: X has {X.shape[0]} rows but y has {y.shape[0]} rows.")
            
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y

        def predict(self, X):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            return X @ self.weights

    class RandomForestCustom:
        def __init__(self, n_estimators=100, max_depth=None):
            self.n_estimators = n_estimators
            self.max_depth = max_depth
            self.trees = []
    
        def fit(self, X, y):
            from sklearn.tree import DecisionTreeRegressor
    
            # Ensure y is a NumPy array
            y = np.array(y)
    
            n_samples = X.shape[0]
    
            for _ in range(self.n_estimators):
                # Ensure indices are integers for proper indexing
                indices = np.random.choice(range(n_samples), size=n_samples, replace=True)
                X_sample = X[indices]
                y_sample = y[indices]
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X_sample, y_sample)
                self.trees.append(tree)
    
        def predict(self, X):
            # Aggregate predictions from all trees
            predictions = np.array([tree.predict(X) for tree in self.trees])
            return np.mean(predictions, axis=0)

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]
        return sum(squared_errors) / len(squared_errors)

    @staticmethod
    def mean_absolute_error(y_true, y_pred):
        absolute_errors = [abs(true - pred) for true, pred in zip(y_true, y_pred)]
        return sum(absolute_errors) / len(absolute_errors)

    @staticmethod
    def pearsonr(x, y):
        mean_x = sum(x) / len(x)
        mean_y = sum(y) / len(y)
        numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
        denominator = ((sum((xi - mean_x) ** 2 for xi in x) * sum((yi - mean_y) ** 2 for yi in y)) ** 0.5)
        return (numerator / denominator if denominator != 0 else 0.0, None)

    def train(self, x_train, y_train):
        self.model.fit(x_train, y_train)

    def evaluate(self, x, y):
        predictions = self.model.predict(x)
        mse = self.mean_squared_error(y, predictions)
        mae = self.mean_absolute_error(y, predictions)
        pearson_corr, _ = self.pearsonr(y, predictions)
        return mse, mae, pearson_corr

    def predict(self, x):
        return self.model.predict(x)

In [15]:
pip install cvxopt

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install POT


Collecting POT
  Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (865 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.6/865.6 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.5
Note: you may need to restart the kernel to use updated packages.


In [17]:
import random
# ====== Main Workflow ======
# Load and preprocess dataset
# Load Indonesian dataset
raw_data = load_indo_dataset("/kaggle/input/indo-datanew/indodataset.csv")
train_data, valid_data, test_data = split_dataset(raw_data)
x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_test1, x_test2, y_test = preprocess_data(test_data)

In [18]:
# ====== Load Dataset ======
# Load train, validation, and test datasets from CSV files
train_file = "/kaggle/input/data-w2v/train_data-w2v.csv"
valid_file = "/kaggle/input/data-w2v/valid_data-w2v.csv"
test_file = "/kaggle/input/data-w2v/test_data-w2v.csv"

# Read datasets
train_data = pd.read_csv(train_file).values
valid_data = pd.read_csv(valid_file).values
test_data = pd.read_csv(test_file).values

# ====== Preprocessing ======
x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_test1, x_test2, y_test = preprocess_data(test_data)

# Output shapes for verification
print(f"Train data: {len(x_train1)} pairs, {len(y_train)} labels")
print(f"Validation data: {len(x_valid1)} pairs, {len(y_valid)} labels")
print(f"Test data: {len(x_test1)} pairs, {len(y_test)} labels")


Train data: 1476 pairs, 1476 labels
Validation data: 184 pairs, 184 labels
Test data: 185 pairs, 185 labels


In [19]:
import pandas as pd

# ====== Frequency Computation for SIF ======
from collections import Counter
all_sentences = x_train1 + x_train2 + x_valid1 + x_valid2 + x_test1 + x_test2
# Compute word frequencies for SIF
all_tokens = [token for sentence in all_sentences for token in sentence.split()]
freqs = Counter(all_tokens)

# ====== Feature Extraction and Model Evaluation ======
methods = ["averageCosine", "wordDis", "sifCos"]
embedding_methods = {"w2v": "Word2Vec", "fast": "FastText"}  # Map method names to embeddings

# Store results for all methods and models
results = []

# Loop over embedding methods and feature extraction methods
for embedding_type in ["w2v", "fast"]:
    print(f"Building embedding model: {embedding_methods[embedding_type]}")
    embedding = build_domain_specific_word_embedding(all_sentences, method=embedding_type)

    for method in methods:
        current_method = method
        print(f"Using feature extraction method: {current_method}")
        
        x_train_features, x_valid_features, x_test_features = feature_extraction(
            x_train1, x_train2, x_valid1, x_valid2, {"sentence1": x_test1, "sentence2": x_test2}, embedding, freqs, current_method
        )

        for reg_model in ["linear", "svr", "rfr"]:
            model = RegressionModel(model_type=reg_model)
            model.train(x_train_features, y_train)

            # Evaluate on validation and test sets
            val_mse, val_mae, val_pearson = model.evaluate(x_valid_features, y_valid)
            test_mse, test_mae, test_pearson = model.evaluate(x_test_features, y_test)

            print(f"Validation Performance ({embedding_type}, {current_method}, {reg_model}):")
            print(f"MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, Pearson Correlation: {val_pearson:.4f}")

            print(f"Test Performance ({embedding_type}, {current_method}, {reg_model}):")
            print(f"MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, Pearson Correlation: {test_pearson:.4f}")

            # Store results
            results.append({
                "embedding_type": embedding_type,
                "method": current_method,
                "model": reg_model,
                "pearson": test_pearson,
                "test_predictions": model.predict(x_test_features),
                "test_features": x_test_features,
                "x_test1": x_test1,
                "x_test2": x_test2,
                "y_test": y_test
            })

# Save all results as CSV files (for both validation and test)
for i, result in enumerate(results, start=1):
    # Retrieve original raw sentences for validation and test
    raw_valid1 = [item[0] for item in valid_data]  # Original raw Sentence 1 for validation
    raw_valid2 = [item[1] for item in valid_data]  # Original raw Sentence 2 for validation
    raw_test1 = [item[0] for item in test_data]  # Original raw Sentence 1 for test
    raw_test2 = [item[1] for item in test_data]  # Original raw Sentence 2 for test
    val_predictions = model.predict(x_valid_features)  # Prediksi untuk data validasi
    # Create DataFrame for validation predictions
    val_df = pd.DataFrame({
        "Original Sentence 1": raw_valid1,  # Append raw sentence 1
        "Original Sentence 2": raw_valid2,  # Append raw sentence 2
        "Preprocessed Sentence 1": x_valid1,  # Preprocessed validation sentence 1
        "Preprocessed Sentence 2": x_valid2,  # Preprocessed validation sentence 2
        "True Similarity Score": [y * 5 for y in y_valid],  # Rescale validation true scores
        "Predicted Similarity Score": [y * 5 for y in val_predictions]  # Validation predictions rescaled
    })

    # Save validation result CSV
    val_filename = f"val_result_{i}_{result['embedding_type']}_{result['method']}_{result['model']}.csv"
    val_df.to_csv(val_filename, index=False)
    print(f"Saved validation result: {val_filename}")

    # Create DataFrame for test predictions
    test_df = pd.DataFrame({
        "Original Sentence 1": raw_test1,  # Append raw sentence 1
        "Original Sentence 2": raw_test2,  # Append raw sentence 2
        "Preprocessed Sentence 1": result["x_test1"],  # Preprocessed test sentence 1
        "Preprocessed Sentence 2": result["x_test2"],  # Preprocessed test sentence 2
        "True Similarity Score": [y * 5 for y in result["y_test"]],  # Rescale to [0, 5]
        "Predicted Similarity Score": [y * 5 for y in result["test_predictions"]]  # Predicted scores rescaled
    })

    # Save test result CSV
    test_filename = f"test_result_{i}_{result['embedding_type']}_{result['method']}_{result['model']}.csv"
    test_df.to_csv(test_filename, index=False)
    print(f"Saved test result: {test_filename}")


Building embedding model: Word2Vec
Building vocabulary with 3690 sentences
Training the model for 30 epochs


  model.init_sims(replace=True)


Model saved at id-domain_w2v.model
Using feature extraction method: averageCosine
Validation Performance (w2v, averageCosine, linear):
MSE: 0.0905, MAE: 0.2585, Pearson Correlation: 0.6282
Test Performance (w2v, averageCosine, linear):
MSE: 0.0743, MAE: 0.2386, Pearson Correlation: 0.7307
Validation Performance (w2v, averageCosine, svr):
MSE: 0.0977, MAE: 0.2491, Pearson Correlation: 0.6282
Test Performance (w2v, averageCosine, svr):
MSE: 0.0694, MAE: 0.2186, Pearson Correlation: 0.7307
Validation Performance (w2v, averageCosine, rfr):
MSE: 0.0941, MAE: 0.2212, Pearson Correlation: 0.6442
Test Performance (w2v, averageCosine, rfr):
MSE: 0.0705, MAE: 0.1807, Pearson Correlation: 0.7216
Using feature extraction method: wordDis
Validation Performance (w2v, wordDis, linear):
MSE: 0.0625, MAE: 0.1945, Pearson Correlation: 0.7637
Test Performance (w2v, wordDis, linear):
MSE: 0.0494, MAE: 0.1709, Pearson Correlation: 0.8136
Validation Performance (w2v, wordDis, svr):
MSE: 0.0625, MAE: 0.1897,

  model.init_sims(replace=True)


Model saved at id-domain_fasttext.model
Using feature extraction method: averageCosine
Validation Performance (fast, averageCosine, linear):
MSE: 0.0663, MAE: 0.2171, Pearson Correlation: 0.7528
Test Performance (fast, averageCosine, linear):
MSE: 0.0586, MAE: 0.2000, Pearson Correlation: 0.7737
Validation Performance (fast, averageCosine, svr):
MSE: 0.0674, MAE: 0.2083, Pearson Correlation: 0.7528
Test Performance (fast, averageCosine, svr):
MSE: 0.0600, MAE: 0.1935, Pearson Correlation: 0.7737
Validation Performance (fast, averageCosine, rfr):
MSE: 0.0800, MAE: 0.1984, Pearson Correlation: 0.7063
Test Performance (fast, averageCosine, rfr):
MSE: 0.0621, MAE: 0.1625, Pearson Correlation: 0.7638
Using feature extraction method: wordDis
Validation Performance (fast, wordDis, linear):
MSE: 0.0542, MAE: 0.1777, Pearson Correlation: 0.7995
Test Performance (fast, wordDis, linear):
MSE: 0.0428, MAE: 0.1589, Pearson Correlation: 0.8371
Validation Performance (fast, wordDis, svr):
MSE: 0.0541

In [20]:
class KeyedVectorsWrapper:
    """Wrapper for KeyedVectors to provide a .wv attribute."""
    def __init__(self, keyed_vectors):
        self.wv = keyed_vectors


In [21]:
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
# ====== Function to Load and Preprocess Unseen Dataset ======
def load_unseen_indo_dataset(filename):
    """Load and preprocess new unseen Indonesian dataset."""
    df = pd.read_csv(filename)
    data = [(row['answer'], row['response']) for _, row in df.iterrows()]  # No labels in unseen data
    return data

def preprocess_unseen_data(data):
    """Preprocess unseen data for predictions."""
    sentence1 = [preprocess_text_indo(item[0]) for item in data]  # Preprocess first sentence
    sentence2 = [preprocess_text_indo(item[1]) for item in data]  # Preprocess second sentence
    return sentence1, sentence2

# ====== Load Saved Embedding Model ======
def load_saved_embedding_model(model_path, method):
    """Load saved Word2Vec or FastText model and handle KeyedVectors compatibility."""
    if method == "w2v":
        print(f"Loading saved Word2Vec model from {model_path}...")
        embedding = Word2Vec.load(model_path)
    elif method == "fast":
        print(f"Loading saved FastText model from {model_path}...")
        embedding = KeyedVectors.load_word2vec_format(model_path, binary=True)
    else:
        raise ValueError("Unsupported embedding method.")
    print(f"Model loaded successfully!")
    return embedding

# ====== Compute Evaluation Metrics ======
def compute_evaluation_metrics(y_true, y_pred):
    """Compute MSE, MAE, and Pearson Correlation."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    pearson_corr, _ = pearsonr(y_true, y_pred) if len(y_true) > 1 else (0, None)
    return mse, mae, pearson_corr
    
# ====== Corrected Predict Unseen Data Function ======
def predict_unseen_data(model, embedding, unseen_file_path, output_file):
    """
    Predict similarity scores on unseen dataset, save results, and compute evaluation metrics.
    
    Parameters:
        model: Trained regression model.
        embedding: Trained embedding model (Word2Vec or FastText).
        unseen_file_path: Path to the unseen dataset CSV.
        output_file: Path to save the results.
    """
    # Load unseen dataset
    unseen_data = pd.read_csv(unseen_file_path)
    
    # Extract true labels
    if "label" in unseen_data.columns:
        y_true = unseen_data["label"].values / 5.0  # Normalize true labels to 0-1
    else:
        y_true = None

    # Preprocess unseen dataset
    unseen_sentence1 = [preprocess_text_indo(item) for item in unseen_data["answer"]]
    unseen_sentence2 = [preprocess_text_indo(item) for item in unseen_data["response"]]
    
    # Extract features using the embedding
    unseen_features = average_cosine_similarity(unseen_sentence1, unseen_sentence2, embedding)  # Pass embedding directly
    
    # Make predictions
    predictions = model.predict(unseen_features)
    
    # Save predictions to CSV
    df = pd.DataFrame({
        "Original Sentence 1": unseen_data["answer"],
        "Original Sentence 2": unseen_data["response"],
        "Predicted Similarity Score": predictions * 5  # Rescale the scores to 0-5
    })

    # Include true labels if available
    if y_true is not None:
        df["True Similarity Score"] = unseen_data["label"]  # Keep original scale (0-5)
    
    df.to_csv(output_file, index=False)
    print(f"Saved predictions for unseen dataset to {output_file}")
    
    # If true scores are provided, compute evaluation metrics
    if y_true is not None:
        y_true_rescaled = [y * 5 for y in y_true]  # Rescale true scores to 0-5
        predictions_rescaled = predictions * 5  # Rescale predictions to 0-5
        mse, mae, pearson_corr = compute_evaluation_metrics(y_true_rescaled, predictions_rescaled)
        print(f"Evaluation Metrics:\n"
              f"Mean Squared Error (MSE): {mse:.4f}\n"
              f"Mean Absolute Error (MAE): {mae:.4f}\n"
              f"Pearson Correlation: {pearson_corr:.4f}")
        return mse, mae, pearson_corr
    else:
        print("True scores not provided. Skipping evaluation metrics.")
        return None, None, None

# ====== Main Workflow for Testing on Unseen Data ======
unseen_file_path = "/kaggle/input/testi-data/test-BuIng.csv"
output_file = "w2v-unseen_test_predictions_with_true_labels.csv"

# Load the saved embedding model
embedding_model_path = "/kaggle/working/id-domain_w2v.model"  # Use the correct path
embedding = load_saved_embedding_model(embedding_model_path, method="w2v")  # Or "fast" for FastText

# Predict and evaluate on the unseen dataset
print("Predicting on unseen dataset and computing evaluation metrics...")
mse, mae, pearson_corr = predict_unseen_data(model, embedding, unseen_file_path, output_file)


Loading saved Word2Vec model from /kaggle/working/id-domain_w2v.model...
Model loaded successfully!
Predicting on unseen dataset and computing evaluation metrics...
Saved predictions for unseen dataset to w2v-unseen_test_predictions_with_true_labels.csv
Evaluation Metrics:
Mean Squared Error (MSE): 2.6178
Mean Absolute Error (MAE): 1.3462
Pearson Correlation: -0.2065


In [22]:
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
# ====== Function to Load and Preprocess Unseen Dataset ======
def load_unseen_indo_dataset(filename):
    """Load and preprocess new unseen Indonesian dataset."""
    df = pd.read_csv(filename)
    data = [(row['answer'], row['response']) for _, row in df.iterrows()]  # No labels in unseen data
    return data

def preprocess_unseen_data(data):
    """Preprocess unseen data for predictions."""
    sentence1 = [preprocess_text_indo(item[0]) for item in data]  # Preprocess first sentence
    sentence2 = [preprocess_text_indo(item[1]) for item in data]  # Preprocess second sentence
    return sentence1, sentence2

# ====== Load Saved Embedding Model ======
def load_saved_embedding_model(model_path, method):
    """Load saved Word2Vec or FastText model and handle KeyedVectors compatibility."""
    if method == "w2v":
        print(f"Loading saved Word2Vec model from {model_path}...")
        embedding = Word2Vec.load(model_path)
    elif method == "fast":
        print(f"Loading saved FastText model from {model_path}...")
        embedding = KeyedVectors.load_word2vec_format(model_path, binary=True)
    else:
        raise ValueError("Unsupported embedding method.")
    print(f"Model loaded successfully!")
    return embedding

# ====== Compute Evaluation Metrics ======
def compute_evaluation_metrics(y_true, y_pred):
    """Compute MSE, MAE, and Pearson Correlation."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    pearson_corr, _ = pearsonr(y_true, y_pred) if len(y_true) > 1 else (0, None)
    return mse, mae, pearson_corr
    
# ====== Corrected Predict Unseen Data Function ======
def predict_unseen_data(model, embedding, unseen_file_path, output_file):
    """
    Predict similarity scores on unseen dataset, save results, and compute evaluation metrics.
    
    Parameters:
        model: Trained regression model.
        embedding: Trained embedding model (Word2Vec or FastText).
        unseen_file_path: Path to the unseen dataset CSV.
        output_file: Path to save the results.
    """
    # Load unseen dataset
    unseen_data = pd.read_csv(unseen_file_path)
    
    # Extract true labels
    if "label" in unseen_data.columns:
        y_true = unseen_data["label"].values / 5.0  # Normalize true labels to 0-1
    else:
        y_true = None

    # Preprocess unseen dataset
    unseen_sentence1 = [preprocess_text_indo(item) for item in unseen_data["answer"]]
    unseen_sentence2 = [preprocess_text_indo(item) for item in unseen_data["response"]]
    
    # Extract features using the embedding
    unseen_features = average_cosine_similarity(unseen_sentence1, unseen_sentence2, embedding)  # Pass embedding directly
    
    # Make predictions
    predictions = model.predict(unseen_features)
    
    # Save predictions to CSV
    df = pd.DataFrame({
        "Original Sentence 1": unseen_data["answer"],
        "Original Sentence 2": unseen_data["response"],
        "Predicted Similarity Score": predictions * 5  # Rescale the scores to 0-5
    })

    # Include true labels if available
    if y_true is not None:
        df["True Similarity Score"] = unseen_data["label"]  # Keep original scale (0-5)
    
    df.to_csv(output_file, index=False)
    print(f"Saved predictions for unseen dataset to {output_file}")
    
    # If true scores are provided, compute evaluation metrics
    if y_true is not None:
        y_true_rescaled = [y * 5 for y in y_true]  # Rescale true scores to 0-5
        predictions_rescaled = predictions * 5  # Rescale predictions to 0-5
        mse, mae, pearson_corr = compute_evaluation_metrics(y_true_rescaled, predictions_rescaled)
        print(f"Evaluation Metrics:\n"
              f"Mean Squared Error (MSE): {mse:.4f}\n"
              f"Mean Absolute Error (MAE): {mae:.4f}\n"
              f"Pearson Correlation: {pearson_corr:.4f}")
        return mse, mae, pearson_corr
    else:
        print("True scores not provided. Skipping evaluation metrics.")
        return None, None, None

# ====== Main Workflow for Testing on Unseen Data ======
unseen_file_path = "/kaggle/input/testi-data/test-BuIng.csv"
output_file = "fasttext-unseen_test_predictions_with_true_labels.csv"

# Load the saved embedding model
embedding_model_path = "/kaggle/working/id-domain_fasttext.model"  # Use the correct path
embedding = load_saved_embedding_model(embedding_model_path, method="w2v")  # Or "fast" for FastText

# Predict and evaluate on the unseen dataset
print("Predicting on unseen dataset and computing evaluation metrics...")
mse, mae, pearson_corr = predict_unseen_data(model, embedding, unseen_file_path, output_file)


Loading saved Word2Vec model from /kaggle/working/id-domain_fasttext.model...
Model loaded successfully!
Predicting on unseen dataset and computing evaluation metrics...
Saved predictions for unseen dataset to fasttext-unseen_test_predictions_with_true_labels.csv
Evaluation Metrics:
Mean Squared Error (MSE): 4.4608
Mean Absolute Error (MAE): 1.7485
Pearson Correlation: 0.1149


In [23]:
import pandas as pd

# Function to save raw and preprocessed data
def save_raw_and_preprocessed(raw_data, preprocessed_data1, preprocessed_data2, labels, filename):
    # Convert to DataFrame
    df = pd.DataFrame({
        "Raw Sentence 1": [item[0] for item in raw_data],
        "Raw Sentence 2": [item[1] for item in raw_data],
        "Preprocessed Sentence 1": preprocessed_data1,
        "Preprocessed Sentence 2": preprocessed_data2,
        "Label": labels
    })
    df.to_csv(filename, index=False)
    print(f"Saved dataset to {filename}")

In [24]:
# Save raw and preprocessed datasets
save_raw_and_preprocessed(train_data, x_train1, x_train2, y_train, "train_data_with_preprocessing.csv")
save_raw_and_preprocessed(valid_data, x_valid1, x_valid2, y_valid, "valid_data_with_preprocessing.csv")
save_raw_and_preprocessed(test_data, x_test1, x_test2, y_test, "test_data_with_preprocessing.csv")

Saved dataset to train_data_with_preprocessing.csv
Saved dataset to valid_data_with_preprocessing.csv
Saved dataset to test_data_with_preprocessing.csv
