In [None]:
import os
import requests

# URL untuk Wikipedia Bahasa Indonesia
url = "https://dumps.wikimedia.org/idwiki/latest/idwiki-latest-pages-articles.xml.bz2"

# Nama file output
output_file = "idwiki-latest-pages-articles.xml.bz2"

# Periksa apakah file sudah ada
if not os.path.exists(output_file):
    print("Mengunduh file Wikipedia Bahasa Indonesia...")
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    chunk_size = 1024 * 1024  # 1 MB per chunk

    with open(output_file, "wb") as file:
        for data in response.iter_content(chunk_size=chunk_size):
            file.write(data)
    
    print("Unduhan selesai. File disimpan sebagai:", output_file)
else:
    print("File sudah ada:", output_file)

In [None]:
!pip install gensim

import io
import time
from datetime import timedelta
import gensim
import os

if __name__ == '__main__':
    start_time = time.time()

    # Path input file Wikipedia XML
    input_file = 'idwiki-latest-pages-articles.xml.bz2'
    assert os.path.exists(input_file), "File 'idwiki-latest-pages-articles.xml.bz2' tidak ditemukan!"

    print('Streaming wiki...')
    id_wiki = gensim.corpora.WikiCorpus(
        input_file, dictionary={}, lower=True
    )
    
    # Path output file teks
    output_file = 'idwiki_new_lower.txt'
    article_count = 0

    with io.open(output_file, 'w', encoding='utf-8') as wiki_txt:
        for text in id_wiki.get_texts():
            # Menulis artikel ke file teks
            wiki_txt.write(" ".join(text) + '\n')
            article_count += 1

            # Progress log setiap 10.000 artikel
            if article_count % 10000 == 0:
                print('{} articles processed'.format(article_count))
        
        print('Total: {} articles processed.'.format(article_count))

    finish_time = time.time()
    print('Elapsed time: {}'.format(timedelta(seconds=finish_time - start_time)))


In [None]:
import time
import os
import multiprocessing
from datetime import timedelta
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    start_time = time.time()
    print('Training Word2Vec Model...')

    # Path input file teks hasil proses sebelumnya
    input_file = '/kaggle/working/idwiki_new_lower.txt'
    assert os.path.exists(input_file), "File 'idwiki_new_lower.txt' tidak ditemukan! Pastikan proses sebelumnya berhasil."

    # Path output model
    output_dir = 'model'
    output_file = os.path.join(output_dir, 'idwiki_word2vec_200_new_lower.model')

    # Membuat direktori output jika belum ada
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Melatih model Word2Vec
    sentences = LineSentence(input_file)
    id_w2v = Word2Vec(sentences, vector_size=200, workers=multiprocessing.cpu_count() - 1)

    # Menyimpan model
    id_w2v.save(output_file)

    finish_time = time.time()
    print(f'Finished. Elapsed time: {timedelta(seconds=finish_time - start_time)}')

    # Informasi lokasi file model
    print(f"Model saved at: {output_file}")


In [1]:
from gensim.models import Word2Vec

# Load pre-trained Word2Vec model
word2vec_model = Word2Vec.load('/kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model')

In [2]:
import nltk
import string
from nltk.tokenize import word_tokenize

# Sample Indonesian text
sample_text = "Ini adalah contoh kalimat dalam bahasa Indonesia."

# Tokenize the text
nltk.download('punkt')  # Download tokenizer data if not already downloaded
tokenized_text = word_tokenize(sample_text.lower())  # Convert to lowercase for consistency

# Filter out punctuation
tokenized_text = [word for word in tokenized_text if word not in string.punctuation]

print("Tokenized words (no punctuation):", tokenized_text)

# Check for each word in the Word2Vec model
for word in tokenized_text:
    if word in word2vec_model.wv:  # Check if the word exists in the model vocabulary
        print(f"Word: {word}, Vector: {word2vec_model.wv[word][:5]}...")  # Show first 5 dimensions
    else:
        print(f"Word '{word}' is not in the vocabulary.")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Tokenized words (no punctuation): ['ini', 'adalah', 'contoh', 'kalimat', 'dalam', 'bahasa', 'indonesia']
Word: ini, Vector: [1.4045461 1.3396655 1.0604098 1.5734333 3.5066075]...
Word: adalah, Vector: [-3.1721826 -1.6112142 -4.920957  -0.5853852  0.6956718]...
Word: contoh, Vector: [ 1.2200239  -0.57646936 -0.94530183  0.19300076  2.1011887 ]...
Word: kalimat, Vector: [ 0.89005333  1.5616243   3.1428714  -0.84587735  0.4537459 ]...
Word: dalam, Vector: [-0.06166538  0.69772893 -1.8301325  -0.46174216 -0.14013124]...
Word: bahasa, Vector: [-0.5654576   0.58922666  2.964573    1.6541932   3.4733458 ]...
Word: indonesia, Vector: [-0.8432819   0.03616891  3.1263888  -3.3953204  -0.34944582]...


In [3]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import warnings
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import nltk

In [5]:
# Ensure required NLTK data is downloaded
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# ====== Dataset Loading and Splitting ======
def load_indo_dataset(filename):
    df = pd.read_csv(filename)
    data = [
        (row['answer'], row['response'], row['label'] / 5.0)  # Normalize label
        for _, row in df.iterrows()
    ]
    return data

In [7]:
def split_dataset(data, valid_percentage=0.2, test_percentage=0.2):
    random.shuffle(data)
    train_size = int(len(data) * (1 - valid_percentage - test_percentage))
    valid_size = int(len(data) * valid_percentage)
    train = data[:train_size]
    valid = data[train_size:train_size + valid_size]
    test = data[train_size + valid_size:]
    return train, valid, test


In [8]:
def preprocess_text_indo(text):
    # Normalize text
    text = text.lower()

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords only (no stemming)
    stopword_factory = StopWordRemoverFactory()
    stopword_remover = stopword_factory.create_stop_word_remover()
    
    tokens = [stopword_remover.remove(word) for word in tokens if word.isalpha()]
    return " ".join(tokens)

In [9]:
def preprocess_data(data):
    sentence1 = [preprocess_text_indo(item[0]) for item in data]
    sentence2 = [preprocess_text_indo(item[1]) for item in data]
    labels = [item[2] for item in data]
    return sentence1, sentence2, labels

In [10]:
# ====== Load Pre-Trained Word2Vec Model ======
def load_pretrained_word_embedding(path):
    print(f"Loading pre-trained Word2Vec model from: {path}")
    return Word2Vec.load(path)

In [11]:
# Define your custom cosine similarity function
def cosine_similarity_custom(vec1, vec2):
    """
    Custom implementation of cosine similarity.
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:  # Handle zero-vector case
        return 0.0
    
    return dot_product / (norm_vec1 * norm_vec2)

In [12]:
def average_cosine_similarity(kalimat1, kalimat2, embedding, method="w2v"):
    """
    Generate sentence features using Word2Vec, FastText.
    """
    if method in ["w2v", "fast"]:
        similarities = []
        for text1, text2 in zip(kalimat1, kalimat2):
            tokens1 = [word for word in text1.split() if word in embedding.wv]
            tokens2 = [word for word in text2.split() if word in embedding.wv]
            
            if not tokens1 or not tokens2:
                similarities.append(0)
                continue
            
            vec1 = np.mean([embedding.wv[word] for word in tokens1], axis=0).reshape(1, -1)
            vec2 = np.mean([embedding.wv[word] for word in tokens2], axis=0).reshape(1, -1)
            
            similarities.append(cosine_similarity_custom(vec1.flatten(), vec2.flatten()))
        return np.array(similarities).reshape(-1, 1)

    else:
        raise ValueError("Unsupported method.")

In [13]:
def word_distance(text_group1, text_group2, embed_model):
    similarity_scores = []
    for group1, group2 in zip(text_group1, text_group2):
        group1_tokens = [word for word in group1.split() if word in embed_model.wv]
        group2_tokens = [word for word in group2.split() if word in embed_model.wv]

        if not group1_tokens or not group2_tokens:
            similarity_scores.append(0)
        else:
            similarity_scores.append(-embed_model.wv.wmdistance(group1_tokens, group2_tokens))

    return np.array(similarity_scores).reshape(-1, 1)

In [14]:
# for SIF
from sklearn.decomposition import TruncatedSVD
    
def eliminate_first_component(matrix):
    svd_model = TruncatedSVD(n_components=1, random_state=42)
    svd_model.fit(matrix)
    principal_component = svd_model.components_
    return matrix - matrix.dot(principal_component.T) * principal_component
    
def sif_cos(text_group1, text_group2, embed_model, frequency_map, smoothing_factor=0.001):
    freq_sum = sum(frequency_map.values())
    all_embeddings = []

    for group1, group2 in zip(text_group1, text_group2):
        tokens1 = [word for word in group1.split() if word in embed_model.wv]
        tokens2 = [word for word in group2.split() if word in embed_model.wv]

        if not tokens1 or not tokens2:
            all_embeddings.extend([np.zeros(embed_model.vector_size), np.zeros(embed_model.vector_size)])
            continue

        weights1 = [smoothing_factor / (smoothing_factor + frequency_map.get(word, 1e-5) / freq_sum) for word in tokens1]
        weights2 = [smoothing_factor / (smoothing_factor + frequency_map.get(word, 1e-5) / freq_sum) for word in tokens2]

        embedding1 = np.average([embed_model.wv[word] for word in tokens1], axis=0, weights=weights1)
        embedding2 = np.average([embed_model.wv[word] for word in tokens2], axis=0, weights=weights2)

        all_embeddings.extend([embedding1, embedding2])

    all_embeddings = np.array(all_embeddings)
    all_embeddings = eliminate_first_component(all_embeddings)

    similarities = [
        (
            np.dot(all_embeddings[i], all_embeddings[i + 1]) /
            (np.linalg.norm(all_embeddings[i]) * np.linalg.norm(all_embeddings[i + 1]))
            if np.linalg.norm(all_embeddings[i]) > 0 and np.linalg.norm(all_embeddings[i + 1]) > 0 else 0
        )
        for i in range(0, len(all_embeddings), 2)
    ]

    return np.array(similarities).reshape(-1, 1)

In [15]:
def feature_extraction(train_set1, train_set2, val_set1, val_set2, test_set, embed_model, frequency_map, method):
    if method == "averageCosine":
        train_similarities = average_cosine_similarity(train_set1, train_set2, embed_model)
        val_similarities = average_cosine_similarity(val_set1, val_set2, embed_model)
        test_similarities = average_cosine_similarity(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "wordDis":
        train_similarities = word_distance(train_set1, train_set2, embed_model)
        val_similarities = word_distance(val_set1, val_set2, embed_model)
        test_similarities = word_distance(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "sifCos":
        train_similarities = sif_cos(train_set1, train_set2, embed_model, frequency_map)
        val_similarities = sif_cos(val_set1, val_set2, embed_model, frequency_map)
        test_similarities = sif_cos(test_set["sentence1"], test_set["sentence2"], embed_model, frequency_map)
    else:
        raise ValueError(f"Feature extraction method '{method}' is not supported.")

    return np.array(train_similarities), np.array(val_similarities), np.array(test_similarities)

In [16]:

# ====== Regression Model ======
class RegressionModel:
    def __init__(self, model_type="linear"):
        if model_type == "linear":
            self.model = self.LinearRegressionCustom()
        elif model_type == "svr":
            self.model = SVR(kernel="linear")
        elif model_type == "rfr":
            self.model = self.RandomForestCustom()
        else:
            raise ValueError("Unsupported model type.")

    class LinearRegressionCustom:
        def __init__(self):
            self.weights = None

        def fit(self, X, y):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y
        def fit(self, X, y):
            # Convert y to NumPy and ensure matching rows
            y = np.array(y)
            if X.shape[0] != y.shape[0]:
                raise ValueError(f"Shape mismatch: X has {X.shape[0]} rows but y has {y.shape[0]} rows.")
            
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y

        def predict(self, X):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            return X @ self.weights

    class RandomForestCustom:
        def __init__(self, n_estimators=100, max_depth=None):
            self.n_estimators = n_estimators
            self.max_depth = max_depth
            self.trees = []
    
        def fit(self, X, y):
            from sklearn.tree import DecisionTreeRegressor
    
            # Ensure y is a NumPy array
            y = np.array(y)
    
            n_samples = X.shape[0]
    
            for _ in range(self.n_estimators):
                # Ensure indices are integers for proper indexing
                indices = np.random.choice(range(n_samples), size=n_samples, replace=True)
                X_sample = X[indices]
                y_sample = y[indices]
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X_sample, y_sample)
                self.trees.append(tree)
    
        def predict(self, X):
            # Aggregate predictions from all trees
            predictions = np.array([tree.predict(X) for tree in self.trees])
            return np.mean(predictions, axis=0)

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]
        return sum(squared_errors) / len(squared_errors)

    @staticmethod
    def mean_absolute_error(y_true, y_pred):
        absolute_errors = [abs(true - pred) for true, pred in zip(y_true, y_pred)]
        return sum(absolute_errors) / len(absolute_errors)

    @staticmethod
    def pearsonr(x, y):
        mean_x = sum(x) / len(x)
        mean_y = sum(y) / len(y)
        numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
        denominator = ((sum((xi - mean_x) ** 2 for xi in x) * sum((yi - mean_y) ** 2 for yi in y)) ** 0.5)
        return (numerator / denominator if denominator != 0 else 0.0, None)

    def train(self, x_train, y_train):
        self.model.fit(x_train, y_train)

    def evaluate(self, x, y):
        predictions = self.model.predict(x)
        mse = self.mean_squared_error(y, predictions)
        mae = self.mean_absolute_error(y, predictions)
        pearson_corr, _ = self.pearsonr(y, predictions)
        return mse, mae, pearson_corr

    def predict(self, x):
        return self.model.predict(x)

In [17]:
pip install cvxopt

Collecting cvxopt
  Downloading cvxopt-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading cvxopt-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.6/13.6 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hInstalling collected packages: cvxopt
Successfully installed cvxopt-1.3.2
Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install POT

Collecting POT
  Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (865 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.6/865.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.5
Note: you may need to restart the kernel to use updated packages.


In [19]:
import random
# ====== Main Workflow ======
# Load and preprocess dataset
# Load Indonesian dataset
raw_data = load_indo_dataset("/kaggle/input/indo-datasets/indodata.csv")
train_data, valid_data, test_data = split_dataset(raw_data)
x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_test1, x_test2, y_test = preprocess_data(test_data)

In [None]:
# ====== Load Dataset ======
# Load train, validation, and test datasets from CSV files
train_file = "/kaggle/input/data-w2v/train_data-w2v (1).csv"
valid_file = "/kaggle/input/data-w2v/valid_data-w2v (1).csv"
test_file = "/kaggle/input/data-w2v/test_data-w2v (1).csv"

# Read datasets
train_data = pd.read_csv(train_file).values
valid_data = pd.read_csv(valid_file).values
test_data = pd.read_csv(test_file).values

# ====== Preprocessing ======
x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_test1, x_test2, y_test = preprocess_data(test_data)

# Output shapes for verification
print(f"Train data: {len(x_train1)} pairs, {len(y_train)} labels")
print(f"Validation data: {len(x_valid1)} pairs, {len(y_valid)} labels")
print(f"Test data: {len(x_test1)} pairs, {len(y_test)} labels")

In [20]:
import pandas as pd

# ====== Frequency Computation for SIF ======
from collections import Counter
all_sentences = x_train1 + x_train2 + x_valid1 + x_valid2 + x_test1 + x_test2
# Compute word frequencies for SIF
all_tokens = [token for sentence in all_sentences for token in sentence.split()]
freqs = Counter(all_tokens)

# ====== Feature Extraction and Model Evaluation ======
methods = ["averageCosine", "wordDis", "sifCos"]
# Store results for all methods and models
results = []
word2vec_model_path = '/kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model'
# Load pre-trained Word2Vec model
embedding = load_pretrained_word_embedding(word2vec_model_path)

Loading pre-trained Word2Vec model from: /kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model


In [21]:
from sklearn.svm import SVR

In [22]:
for method in methods:
    current_method = method
    print(f"Using feature extraction method: {current_method}")
    
    x_train_features, x_valid_features, x_test_features = feature_extraction(
        x_train1, x_train2, x_valid1, x_valid2, {"sentence1": x_test1, "sentence2": x_test2}, embedding, freqs, current_method
    )

    for reg_model in ["linear", "svr", "rfr"]:
        model = RegressionModel(model_type=reg_model)
        model.train(x_train_features, y_train)

        # Evaluate on validation and test sets
        val_mse, val_mae, val_pearson = model.evaluate(x_valid_features, y_valid)
        test_mse, test_mae, test_pearson = model.evaluate(x_test_features, y_test)

        print(f"Validation Performance ({current_method}, {reg_model}):")
        print(f"MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, Pearson Correlation: {val_pearson:.4f}")

        print(f"Test Performance ({current_method}, {reg_model}):")
        print(f"MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, Pearson Correlation: {test_pearson:.4f}")

        # Store results
        results.append({
            "method": current_method,
            "model": reg_model,
            "pearson": test_pearson,
            "test_predictions": model.predict(x_test_features),
            "test_features": x_test_features,
            "x_test1": x_test1,
            "x_test2": x_test2,
            "y_test": y_test
        })

# Save all results as CSV files (for both validation and test)
for i, result in enumerate(results, start=1):
    # Retrieve original raw sentences for validation and test
    raw_valid1 = [item[0] for item in valid_data]  # Original raw Sentence 1 for validation
    raw_valid2 = [item[1] for item in valid_data]  # Original raw Sentence 2 for validation
    raw_test1 = [item[0] for item in test_data]  # Original raw Sentence 1 for test
    raw_test2 = [item[1] for item in test_data]  # Original raw Sentence 2 for test
    val_predictions = model.predict(x_valid_features)  # Prediksi untuk data validasi
    # Create DataFrame for validation predictions
    val_df = pd.DataFrame({
        "Original Sentence 1": raw_valid1,  # Append raw sentence 1
        "Original Sentence 2": raw_valid2,  # Append raw sentence 2
        "Preprocessed Sentence 1": x_valid1,  # Preprocessed validation sentence 1
        "Preprocessed Sentence 2": x_valid2,  # Preprocessed validation sentence 2
        "True Similarity Score": [y * 5 for y in y_valid],  # Rescale validation true scores
        "Predicted Similarity Score": [y * 5 for y in val_predictions]  # Validation predictions rescaled
    })

    # Save validation result CSV
    val_filename = f"val_result_{i}_{result['method']}_{result['model']}.csv"
    val_df.to_csv(val_filename, index=False)
    print(f"Saved validation result: {val_filename}")

    # Create DataFrame for test predictions
    test_df = pd.DataFrame({
        "Original Sentence 1": raw_test1,  # Append raw sentence 1
        "Original Sentence 2": raw_test2,  # Append raw sentence 2
        "Preprocessed Sentence 1": result["x_test1"],  # Preprocessed test sentence 1
        "Preprocessed Sentence 2": result["x_test2"],  # Preprocessed test sentence 2
        "True Similarity Score": [y * 5 for y in result["y_test"]],  # Rescale to [0, 5]
        "Predicted Similarity Score": [y * 5 for y in result["test_predictions"]]  # Predicted scores rescaled
    })

    # Save test result CSV
    test_filename = f"test_result_{i}_{result['method']}_{result['model']}.csv"
    test_df.to_csv(test_filename, index=False)
    print(f"Saved test result: {test_filename}")

Using feature extraction method: averageCosine
Validation Performance (averageCosine, linear):
MSE: 0.0613, MAE: 0.2009, Pearson Correlation: 0.7154
Test Performance (averageCosine, linear):
MSE: 0.1833, MAE: 0.2295, Pearson Correlation: 0.4484
Validation Performance (averageCosine, svr):
MSE: 0.0629, MAE: 0.1983, Pearson Correlation: 0.7154
Test Performance (averageCosine, svr):
MSE: 0.1869, MAE: 0.2267, Pearson Correlation: 0.4484
Validation Performance (averageCosine, rfr):
MSE: 0.0646, MAE: 0.1811, Pearson Correlation: 0.7176
Test Performance (averageCosine, rfr):
MSE: 0.1787, MAE: 0.2137, Pearson Correlation: 0.4994
Using feature extraction method: wordDis
Validation Performance (wordDis, linear):
MSE: 0.0347, MAE: 0.1418, Pearson Correlation: 0.8507
Test Performance (wordDis, linear):
MSE: 0.1538, MAE: 0.1784, Pearson Correlation: 0.5694
Validation Performance (wordDis, svr):
MSE: 0.0347, MAE: 0.1399, Pearson Correlation: 0.8507
Test Performance (wordDis, svr):
MSE: 0.1556, MAE: 

In [24]:
import pandas as pd
import os

# Save datasets to CSV files
def save_splits_to_csv(train_data, val_data, test_data, output_dir):
    os.makedirs(output_dir, exist_ok=True)

    train_df = pd.DataFrame(train_data, columns=["response", "answer", "label"])
    val_df = pd.DataFrame(val_data, columns=["response", "answer", "label"])
    test_df = pd.DataFrame(test_data, columns=["response", "answer", "label"])

    train_df.to_csv(os.path.join(output_dir, "train_data.csv"), index=False)
    val_df.to_csv(os.path.join(output_dir, "val_data.csv"), index=False)
    test_df.to_csv(os.path.join(output_dir, "test_data.csv"), index=False)

    print(f"Data saved to {output_dir} successfully.")
    
# Save splits to CSV
output_directory = "output_splits"
save_splits_to_csv(train_data, valid_data, test_data, output_directory)

Data saved to output_splits successfully.


In [34]:
results = []
# ====== Load Dataset ======
# Load train, validation, and test datasets from CSV files
train_file = "/kaggle/working/output_splits/train_data.csv"
valid_file = "/kaggle/working/output_splits/val_data.csv"
unseen_file = "/kaggle/input/test-data/test-BuIng.csv"

# Read datasets
train_data = pd.read_csv(train_file).values
valid_data = pd.read_csv(valid_file).values
unseen_data = pd.read_csv(unseen_file).values

# ====== Preprocessing ======
x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_unseen1, x_unseen2, y_unseen = preprocess_data(unseen_data)

# Output shapes for verification
print(f"Train data: {len(x_train1)} pairs, {len(y_train)} labels")
print(f"Validation data: {len(x_valid1)} pairs, {len(y_valid)} labels")
print(f"unseen data: {len(x_unseen1)} pairs, {len(y_unseen)} labels")

Train data: 434 pairs, 434 labels
Validation data: 144 pairs, 144 labels
unseen data: 25 pairs, 25 labels


In [35]:
for method in methods:
    current_method = method
    print(f"Using feature extraction method: {current_method}")
    
    x_train_features, x_valid_features, x_unseen_features = feature_extraction(
        x_train1, x_train2, x_valid1, x_valid2, {"sentence1": x_unseen1, "sentence2": x_unseen2}, embedding, freqs, current_method
    )

    for reg_model in ["linear", "svr", "rfr"]:
        model = RegressionModel(model_type=reg_model)
        model.train(x_train_features, y_train)

        # Evaluate on validation and test sets
        val_mse, val_mae, val_pearson = model.evaluate(x_valid_features, y_valid)
        unseen_mse, unseen_mae, unseen_pearson = model.evaluate(x_unseen_features, y_unseen)

        print(f"Validation Performance ({current_method}, {reg_model}):")
        print(f"MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, Pearson Correlation: {val_pearson:.4f}")

        print(f"Test Performance ({current_method}, {reg_model}):")
        print(f"MSE: {unseen_mse:.4f}, MAE: {unseen_mae:.4f}, Pearson Correlation: {unseen_pearson:.4f}")

    
        # Ensure x_unseen1, x_unseen2, and y_unseen are properly stored
        results.append({
            "method": current_method,
            "model": reg_model,
            "pearson": unseen_pearson,
            "unseen_predictions": list(model.predict(x_unseen_features)),  # Convert predictions to list
            "unseen_features": list(x_unseen_features) if x_unseen_features is not None else [],  # Ensure it's a list
            "x_unseen1": list(x_unseen1) if x_unseen1 else [],  # Ensure stored properly
            "x_unseen2": list(x_unseen2) if x_unseen2 else [],  # Ensure stored properly
            "y_unseen": list(y_unseen) if y_unseen else []  # Ensure stored properly
        })


# Save all results as CSV files (for both validation and test)
for i, result in enumerate(results, start=1):
    # Create DataFrame for test predictions
    unseen_df = pd.DataFrame({
        "Original Sentence 1": raw_unseen1,
        "Original Sentence 2": raw_unseen2,
        "Preprocessed Sentence 1": list(result.get("x_unseen1", [])),  # Use get() to avoid KeyError
        "Preprocessed Sentence 2": list(result.get("x_unseen2", [])),  # Use get() to avoid KeyError
        "True Similarity Score": [y * 5 for y in result.get("y_unseen", [])],
        "Predicted Similarity Score": [y * 5 for y in result.get("unseen_predictions", [])]
    })


    # Save test result CSV
    test_filename = f"unseen_result_{i}_{result['method']}_{result['model']}.csv"
    unseen_df.to_csv(test_filename, index=False)
    print(f"Saved unseen result: {test_filename}")

Using feature extraction method: averageCosine
Validation Performance (averageCosine, linear):
MSE: 0.0613, MAE: 0.2009, Pearson Correlation: 0.7154
Test Performance (averageCosine, linear):
MSE: 11.4567, MAE: 3.1978, Pearson Correlation: 0.0445
Validation Performance (averageCosine, svr):
MSE: 0.0629, MAE: 0.1983, Pearson Correlation: 0.7154
Test Performance (averageCosine, svr):
MSE: 11.3523, MAE: 3.1809, Pearson Correlation: 0.0445
Validation Performance (averageCosine, rfr):
MSE: 0.0637, MAE: 0.1809, Pearson Correlation: 0.7227
Test Performance (averageCosine, rfr):
MSE: 12.2501, MAE: 3.3116, Pearson Correlation: -0.0661
Using feature extraction method: wordDis
Validation Performance (wordDis, linear):
MSE: 0.0347, MAE: 0.1418, Pearson Correlation: 0.8507
Test Performance (wordDis, linear):
MSE: 12.3164, MAE: 3.3321, Pearson Correlation: 0.1035
Validation Performance (wordDis, svr):
MSE: 0.0347, MAE: 0.1399, Pearson Correlation: 0.8507
Test Performance (wordDis, svr):
MSE: 12.4221,

In [32]:
print("Lengths of unseen data components:")
print(f"raw_unseen1: {len(raw_unseen1)}")
print(f"raw_unseen2: {len(raw_unseen2)}")
print(f"x_unseen1: {len(result.get('x_unseen1', []))}")
print(f"x_unseen2: {len(result.get('x_unseen2', []))}")
print(f"y_unseen: {len(result.get('y_unseen', []))}")
print(f"unseen_predictions: {len(result.get('unseen_predictions', []))}")


Lengths of unseen data components:
raw_unseen1: 25
raw_unseen2: 25
x_unseen1: 0
x_unseen2: 0
y_unseen: 0
unseen_predictions: 0


In [None]:
class KeyedVectorsWrapper:
    """Wrapper for KeyedVectors to provide a .wv attribute."""
    def __init__(self, keyed_vectors):
        self.wv = keyed_vectors


In [None]:
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
# ====== Function to Load and Preprocess Unseen Dataset ======
def load_unseen_indo_dataset(filename):
    """Load and preprocess new unseen Indonesian dataset."""
    df = pd.read_csv(filename)
    data = [(row['answer'], row['response']) for _, row in df.iterrows()]  # No labels in unseen data
    return data

def preprocess_unseen_data(data):
    """Preprocess unseen data for predictions."""
    sentence1 = [preprocess_text_indo(item[0]) for item in data]  # Preprocess first sentence
    sentence2 = [preprocess_text_indo(item[1]) for item in data]  # Preprocess second sentence
    return sentence1, sentence2

# ====== Load Saved Embedding Model ======
def load_saved_embedding_model(model_path, method):
    """Load saved Word2Vec or FastText model and handle KeyedVectors compatibility."""
    if method == "w2v":
        print(f"Loading saved Word2Vec model from {model_path}...")
        embedding = Word2Vec.load(model_path)
    elif method == "fast":
        print(f"Loading saved FastText model from {model_path}...")
        embedding = KeyedVectors.load_word2vec_format(model_path, binary=True)
    else:
        raise ValueError("Unsupported embedding method.")
    print(f"Model loaded successfully!")
    return embedding

# ====== Compute Evaluation Metrics ======
def compute_evaluation_metrics(y_true, y_pred):
    """Compute MSE, MAE, and Pearson Correlation."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    pearson_corr, _ = pearsonr(y_true, y_pred) if len(y_true) > 1 else (0, None)
    return mse, mae, pearson_corr
    
# ====== Corrected Predict Unseen Data Function ======
def predict_unseen_data(model, embedding, unseen_file_path, output_file):
    """
    Predict similarity scores on unseen dataset, save results, and compute evaluation metrics.
    
    Parameters:
        model: Trained regression model.
        embedding: Trained embedding model (Word2Vec or FastText).
        unseen_file_path: Path to the unseen dataset CSV.
        output_file: Path to save the results.
    """
    # Load unseen dataset
    unseen_data = pd.read_csv(unseen_file_path)
    
    # Extract true labels
    if "label" in unseen_data.columns:
        y_true = unseen_data["label"].values / 5.0  # Normalize true labels to 0-1
    else:
        y_true = None

    # Preprocess unseen dataset
    unseen_sentence1 = [preprocess_text_indo(item) for item in unseen_data["answer"]]
    unseen_sentence2 = [preprocess_text_indo(item) for item in unseen_data["response"]]
    
    # Extract features using the embedding
    unseen_features = average_cosine_similarity(unseen_sentence1, unseen_sentence2, embedding)  # Pass embedding directly
    
    # Make predictions
    predictions = model.predict(unseen_features)
    
    # Save predictions to CSV
    df = pd.DataFrame({
        "Original Sentence 1": unseen_data["answer"],
        "Original Sentence 2": unseen_data["response"],
        "Predicted Similarity Score": predictions * 5  # Rescale the scores to 0-5
    })

    # Include true labels if available
    if y_true is not None:
        df["True Similarity Score"] = unseen_data["label"]  # Keep original scale (0-5)
    
    df.to_csv(output_file, index=False)
    print(f"Saved predictions for unseen dataset to {output_file}")
    
    # If true scores are provided, compute evaluation metrics
    if y_true is not None:
        y_true_rescaled = [y * 5 for y in y_true]  # Rescale true scores to 0-5
        predictions_rescaled = predictions * 5  # Rescale predictions to 0-5
        mse, mae, pearson_corr = compute_evaluation_metrics(y_true_rescaled, predictions_rescaled)
        print(f"Evaluation Metrics:\n"
              f"Mean Squared Error (MSE): {mse:.4f}\n"
              f"Mean Absolute Error (MAE): {mae:.4f}\n"
              f"Pearson Correlation: {pearson_corr:.4f}")
        return mse, mae, pearson_corr
    else:
        print("True scores not provided. Skipping evaluation metrics.")
        return None, None, None

# ====== Main Workflow for Testing on Unseen Data ======
unseen_file_path = "/kaggle/input/testi-data/test-BuIng.csv"
output_file = "w2v-unseen_test_predictions_with_true_labels.csv"

# Load the saved embedding model
embedding_model_path = "/kaggle/working/id-domain_w2v.model"  # Use the correct path
embedding = load_saved_embedding_model(embedding_model_path, method="w2v")  # Or "fast" for FastText

# Predict and evaluate on the unseen dataset
print("Predicting on unseen dataset and computing evaluation metrics...")
mse, mae, pearson_corr = predict_unseen_data(model, embedding, unseen_file_path, output_file)

In [None]:
# ====== Main Workflow ======
# Paths and configurations
file_path = '/kaggle/input/indo-datasets/indodata.csv'
word2vec_model_path = '/kaggle/working/model/idwiki_word2vec_200_new_lower.model'

In [None]:
# Load and preprocess dataset
raw_data = load_indo_dataset(file_path)
train_data, valid_data, test_data = split_dataset(raw_data, valid_percentage=0.1, test_percentage=0.1)

x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_test1, x_test2, y_test = preprocess_data(test_data)

# Load pre-trained Word2Vec model
embedding = load_pretrained_word_embedding(word2vec_model_path)

In [None]:
import pandas as pd

# Function to save raw and preprocessed data
def save_raw_and_preprocessed(raw_data, preprocessed_data1, preprocessed_data2, labels, filename):
    # Convert to DataFrame
    df = pd.DataFrame({
        "Raw Sentence 1": [item[0] for item in raw_data],
        "Raw Sentence 2": [item[1] for item in raw_data],
        "Preprocessed Sentence 1": preprocessed_data1,
        "Preprocessed Sentence 2": preprocessed_data2,
        "Label": labels
    })
    df.to_csv(filename, index=False)
    print(f"Saved dataset to {filename}")

In [None]:
# Save raw and preprocessed datasets
save_raw_and_preprocessed(train_data, x_train1, x_train2, y_train, "train_data_with_preprocessing.csv")
save_raw_and_preprocessed(valid_data, x_valid1, x_valid2, y_valid, "valid_data_with_preprocessing.csv")
save_raw_and_preprocessed(test_data, x_test1, x_test2, y_test, "test_data_with_preprocessing.csv")

In [None]:
# Define your custom cosine similarity function
def cosine_similarity_custom(vec1, vec2):
    """
    Custom implementation of cosine similarity.
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:  # Handle zero-vector case
        return 0.0
    
    return dot_product / (norm_vec1 * norm_vec2)

In [None]:
def avgCos(sentences1, sentences2, embedding, method="w2v"):
    """
    Generate sentence features using Word2Vec, FastText, BoW, or TF-IDF.
    """
    if method in ["w2v", "fast"]:
        similarities = []
        for sent1, sent2 in zip(sentences1, sentences2):
            tokens1 = [token for token in sent1.split() if token in embedding.wv]
            tokens2 = [token for token in sent2.split() if token in embedding.wv]
            
            if not tokens1 or not tokens2:
                similarities.append(0)
                continue
            
            vec1 = np.mean([embedding.wv[token] for token in tokens1], axis=0).reshape(1, -1)
            vec2 = np.mean([embedding.wv[token] for token in tokens2], axis=0).reshape(1, -1)
            
            similarities.append(cosine_similarity_custom(vec1.flatten(), vec2.flatten()))
        return np.array(similarities).reshape(-1, 1)
    
    elif method in ["bow", "tfidf"]:
        # Ensure both sets of sentences are vectorized and have the same shape
        vectorized1 = embedding.transform(sentences1)
        vectorized2 = embedding.transform(sentences2)
        return np.abs(vectorized1 - vectorized2)  # Feature difference as input

    else:
        raise ValueError("Unsupported method.")

In [None]:
# ====== Feature Extraction ======
x_train_features = avgCos(x_train1, x_train2, embedding)
x_valid_features = avgCos(x_valid1, x_valid2, embedding)
x_test_features = avgCos(x_test1, x_test2, embedding)

# ====== Regression Model and Evaluation ======
from sklearn.linear_model import LinearRegression

class RegressionModel:
    def __init__(self):
        self.model = LinearRegression()

    def train(self, X, y):
        self.model.fit(X, y)

    def evaluate(self, X, y):
        predictions = self.model.predict(X)
        mse = np.mean((y - predictions) ** 2)
        mae = np.mean(np.abs(y - predictions))
        corr = np.corrcoef(y, predictions)[0, 1]
        return mse, mae, corr

    def predict(self, X):
        return self.model.predict(X)

# Train and evaluate
model = RegressionModel()
model.train(x_train_features, y_train)

val_mse, val_mae, val_corr = model.evaluate(x_valid_features, y_valid)
test_mse, test_mae, test_corr = model.evaluate(x_test_features, y_test)

print(f"Validation: MSE={val_mse:.4f}, MAE={val_mae:.4f}, Pearson Correlation={val_corr:.4f}")
print(f"Test: MSE={test_mse:.4f}, MAE={test_mae:.4f}, Pearson Correlation={test_corr:.4f}")

# ====== Save Predictions ======
predictions = model.predict(x_test_features)
results = pd.DataFrame({
    "Sentence 1": x_test1,
    "Sentence 2": x_test2,
    "True Label": [y * 5 for y in y_test],
    "Predicted Label": [y * 5 for y in predictions]
})
results.to_csv("test_results.csv", index=False)
print("Results saved to test_results.csv")