In [2]:
from gensim.models import Word2Vec

# Load pre-trained Word2Vec model
word2vec_model = Word2Vec.load('/kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model')

In [20]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [21]:
import os
import gensim
import nltk
import numpy as np
import pandas as pd
import random
from gensim.models import Word2Vec, KeyedVectors, FastText
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.decomposition import TruncatedSVD

In [4]:
# Ensure required NLTK data is downloaded
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# ====== Dataset Loading and Splitting ======
def load_indo_dataset(filename):
    df = pd.read_csv(filename)
    data = [
        (row['answer'], row['response'], row['label'] / 5.0)  # Normalize label
        for _, row in df.iterrows()
    ]
    return data

In [6]:
def split_dataset(data, valid_percentage=0.15, test_percentage=0.15):
    random.shuffle(data)
    train_size = int(len(data) * (1 - valid_percentage - test_percentage))
    valid_size = int(len(data) * valid_percentage)
    train = data[:train_size]
    valid = data[train_size:train_size + valid_size]
    test = data[train_size + valid_size:]
    return train, valid, test


In [7]:
def preprocess_text_indo(text):
    # Normalize text
    text = text.lower()

    # Tokenize text
    tokens = word_tokenize(text)
    stopword_factory = StopWordRemoverFactory()
    stopword_remover = stopword_factory.create_stop_word_remover()
    tokens = [stopword_remover.remove(word) for word in tokens if word.isalpha()]
    return " ".join(tokens)

In [8]:
def preprocess_data(data):
    sentence1 = [preprocess_text_indo(item[0]) for item in data]
    sentence2 = [preprocess_text_indo(item[1]) for item in data]
    labels = [item[2] for item in data]
    return sentence1, sentence2, labels

In [9]:
# ====== Load Pre-Trained Word2Vec Model ======
def load_pretrained_word_embedding(path):
    print(f"Loading pre-trained Word2Vec model from: {path}")
    return Word2Vec.load(path)

In [10]:
# Define your custom cosine similarity function
def cosine_similarity_custom(vec1, vec2):
    """
    Custom implementation of cosine similarity.
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:  # Handle zero-vector case
        return 0.0
    
    return dot_product / (norm_vec1 * norm_vec2)

In [11]:
def average_cosine_similarity(kalimat1, kalimat2, embedding, method="w2v"):
    """
    Generate sentence features using Word2Vec, FastText.
    """
    if method in ["w2v", "fast"]:
        similarities = []
        for text1, text2 in zip(kalimat1, kalimat2):
            tokens1 = [word for word in text1.split() if word in embedding.wv]
            tokens2 = [word for word in text2.split() if word in embedding.wv]
            
            if not tokens1 or not tokens2:
                similarities.append(0)
                continue
            
            vec1 = np.mean([embedding.wv[word] for word in tokens1], axis=0).reshape(1, -1)
            vec2 = np.mean([embedding.wv[word] for word in tokens2], axis=0).reshape(1, -1)
            
            similarities.append(cosine_similarity_custom(vec1.flatten(), vec2.flatten()))
        return np.array(similarities).reshape(-1, 1)

    else:
        raise ValueError("Unsupported method.")

In [12]:
def word_distance(text_group1, text_group2, embed_model):
    similarity_scores = []
    for group1, group2 in zip(text_group1, text_group2):
        group1_tokens = [word for word in group1.split() if word in embed_model.wv]
        group2_tokens = [word for word in group2.split() if word in embed_model.wv]

        if not group1_tokens or not group2_tokens:
            similarity_scores.append(0)
        else:
            similarity_scores.append(-embed_model.wv.wmdistance(group1_tokens, group2_tokens))

    return np.array(similarity_scores).reshape(-1, 1)

In [13]:
# for SIF
from sklearn.decomposition import TruncatedSVD
    
def eliminate_first_component(matrix):
    svd_model = TruncatedSVD(n_components=1, random_state=42)
    svd_model.fit(matrix)
    principal_component = svd_model.components_
    return matrix - matrix.dot(principal_component.T) * principal_component
    
def sif_cos(text_group1, text_group2, embed_model, frequency_map, smoothing_factor=0.001):
    freq_sum = sum(frequency_map.values())
    all_embeddings = []

    for group1, group2 in zip(text_group1, text_group2):
        tokens1 = [word for word in group1.split() if word in embed_model.wv]
        tokens2 = [word for word in group2.split() if word in embed_model.wv]

        if not tokens1 or not tokens2:
            all_embeddings.extend([np.zeros(embed_model.vector_size), np.zeros(embed_model.vector_size)])
            continue

        weights1 = [smoothing_factor / (smoothing_factor + frequency_map.get(word, 1e-5) / freq_sum) for word in tokens1]
        weights2 = [smoothing_factor / (smoothing_factor + frequency_map.get(word, 1e-5) / freq_sum) for word in tokens2]

        embedding1 = np.average([embed_model.wv[word] for word in tokens1], axis=0, weights=weights1)
        embedding2 = np.average([embed_model.wv[word] for word in tokens2], axis=0, weights=weights2)

        all_embeddings.extend([embedding1, embedding2])

    all_embeddings = np.array(all_embeddings)
    all_embeddings = eliminate_first_component(all_embeddings)

    similarities = [
        (
            np.dot(all_embeddings[i], all_embeddings[i + 1]) /
            (np.linalg.norm(all_embeddings[i]) * np.linalg.norm(all_embeddings[i + 1]))
            if np.linalg.norm(all_embeddings[i]) > 0 and np.linalg.norm(all_embeddings[i + 1]) > 0 else 0
        )
        for i in range(0, len(all_embeddings), 2)
    ]

    return np.array(similarities).reshape(-1, 1)

In [14]:
def feature_extraction(train_set1, train_set2, val_set1, val_set2, test_set, embed_model, frequency_map, method):
    if method == "averageCosine":
        train_similarities = average_cosine_similarity(train_set1, train_set2, embed_model)
        val_similarities = average_cosine_similarity(val_set1, val_set2, embed_model)
        test_similarities = average_cosine_similarity(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "wordDis":
        train_similarities = word_distance(train_set1, train_set2, embed_model)
        val_similarities = word_distance(val_set1, val_set2, embed_model)
        test_similarities = word_distance(test_set["sentence1"], test_set["sentence2"], embed_model)
    elif method == "sifCos":
        train_similarities = sif_cos(train_set1, train_set2, embed_model, frequency_map)
        val_similarities = sif_cos(val_set1, val_set2, embed_model, frequency_map)
        test_similarities = sif_cos(test_set["sentence1"], test_set["sentence2"], embed_model, frequency_map)
    else:
        raise ValueError(f"Feature extraction method '{method}' is not supported.")

    return np.array(train_similarities), np.array(val_similarities), np.array(test_similarities)

In [15]:
# ====== Regression Model ======
class RegressionModel:
    def __init__(self, model_type="linear"):
        if model_type == "linear":
            self.model = self.LinearRegressionCustom()
        elif model_type == "svr":
            self.model = SVR(kernel="linear")
        elif model_type == "rfr":
            self.model = self.RandomForestCustom()
        else:
            raise ValueError("Unsupported model type.")

    class LinearRegressionCustom:
        def __init__(self):
            self.weights = None

        def fit(self, X, y):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y
        def fit(self, X, y):
            # Convert y to NumPy and ensure matching rows
            y = np.array(y)
            if X.shape[0] != y.shape[0]:
                raise ValueError(f"Shape mismatch: X has {X.shape[0]} rows but y has {y.shape[0]} rows.")
            
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            self.weights = np.linalg.pinv(X.T @ X) @ X.T @ y

        def predict(self, X):
            X = np.c_[np.ones(X.shape[0]), X]  # Add bias term
            return X @ self.weights

    class RandomForestCustom:
        def __init__(self, n_estimators=100, max_depth=None):
            self.n_estimators = n_estimators
            self.max_depth = max_depth
            self.trees = []
    
        def fit(self, X, y):
            from sklearn.tree import DecisionTreeRegressor
    
            # Ensure y is a NumPy array
            y = np.array(y)
    
            n_samples = X.shape[0]
    
            for _ in range(self.n_estimators):
                # Ensure indices are integers for proper indexing
                indices = np.random.choice(range(n_samples), size=n_samples, replace=True)
                X_sample = X[indices]
                y_sample = y[indices]
                tree = DecisionTreeRegressor(max_depth=self.max_depth)
                tree.fit(X_sample, y_sample)
                self.trees.append(tree)
    
        def predict(self, X):
            # Aggregate predictions from all trees
            predictions = np.array([tree.predict(X) for tree in self.trees])
            return np.mean(predictions, axis=0)

    @staticmethod
    def mean_squared_error(y_true, y_pred):
        squared_errors = [(true - pred) ** 2 for true, pred in zip(y_true, y_pred)]
        return sum(squared_errors) / len(squared_errors)

    @staticmethod
    def mean_absolute_error(y_true, y_pred):
        absolute_errors = [abs(true - pred) for true, pred in zip(y_true, y_pred)]
        return sum(absolute_errors) / len(absolute_errors)

    @staticmethod
    def pearsonr(x, y):
        mean_x = sum(x) / len(x)
        mean_y = sum(y) / len(y)
        numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
        denominator = ((sum((xi - mean_x) ** 2 for xi in x) * sum((yi - mean_y) ** 2 for yi in y)) ** 0.5)
        return (numerator / denominator if denominator != 0 else 0.0, None)

    def train(self, x_train, y_train):
        self.model.fit(x_train, y_train)

    def evaluate(self, x, y):
        predictions = self.model.predict(x)
        mse = self.mean_squared_error(y, predictions)
        mae = self.mean_absolute_error(y, predictions)
        pearson_corr, _ = self.pearsonr(y, predictions)
        return mse, mae, pearson_corr

    def predict(self, x):
        return self.model.predict(x)

In [16]:
pip install cvxopt

Note: you may need to restart the kernel to use updated packages.


In [17]:
pip install POT

Collecting POT
  Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (865 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.6/865.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: POT
Successfully installed POT-0.9.5
Note: you may need to restart the kernel to use updated packages.


In [18]:
# ====== Load Dataset ======
# Load train, validation, and test datasets from CSV files
train_file = "/kaggle/input/70-indo-dataset/70-train_data.csv"
valid_file = "/kaggle/input/70-indo-dataset/15-val_data.csv"
test_file = "/kaggle/input/70-indo-dataset/15-test_data.csv"

# Read datasets
train_data = pd.read_csv(train_file).values
valid_data = pd.read_csv(valid_file).values
test_data = pd.read_csv(test_file).values

In [22]:
# ====== Preprocessing ======
x_train1, x_train2, y_train = preprocess_data(train_data)
x_valid1, x_valid2, y_valid = preprocess_data(valid_data)
x_test1, x_test2, y_test = preprocess_data(test_data)

# Output shapes for verification
print(f"Train data: {len(x_train1)} pairs, {len(y_train)} labels")
print(f"Validation data: {len(x_valid1)} pairs, {len(y_valid)} labels")
print(f"Test data: {len(x_test1)} pairs, {len(y_test)} labels")

Train data: 1291 pairs, 1291 labels
Validation data: 276 pairs, 276 labels
Test data: 278 pairs, 278 labels


In [23]:
import pandas as pd

# ====== Frequency Computation for SIF ======
from collections import Counter
all_sentences = x_train1 + x_train2 + x_valid1 + x_valid2 + x_test1 + x_test2
# Compute word frequencies for SIF
all_tokens = [token for sentence in all_sentences for token in sentence.split()]
freqs = Counter(all_tokens)

# ====== Feature Extraction and Model Evaluation ======
methods = ["averageCosine", "wordDis", "sifCos"]
# Store results for all methods and models
results = []
word2vec_model_path = '/kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model'
# Load pre-trained Word2Vec model
embedding = load_pretrained_word_embedding(word2vec_model_path)

Loading pre-trained Word2Vec model from: /kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model


In [24]:
from sklearn.svm import SVR
import pickle


In [25]:
for method in methods:
    current_method = method
    print(f"Using feature extraction method: {current_method}")
    
    x_train_features, x_valid_features, x_test_features = feature_extraction(
        x_train1, x_train2, x_valid1, x_valid2, {"sentence1": x_test1, "sentence2": x_test2}, embedding, freqs, current_method
    )

    for reg_model in ["linear", "svr", "rfr"]:
        model = RegressionModel(model_type=reg_model)
        model.train(x_train_features, y_train)
        # Save trained model
        model_filename = f"trained_model_{method}_{reg_model}.pkl"
        with open(model_filename, "wb") as file:
            pickle.dump(model, file)
        print(f"Saved {reg_model} model using {method} feature extraction: {model_filename}")
        print("Trained regression model saved successfully!")
        # Evaluate on validation and test sets
        val_mse, val_mae, val_pearson = model.evaluate(x_valid_features, y_valid)
        test_mse, test_mae, test_pearson = model.evaluate(x_test_features, y_test)

        print(f"Validation Performance ({current_method}, {reg_model}):")
        print(f"MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, Pearson Correlation: {val_pearson:.4f}")

        print(f"Test Performance ({current_method}, {reg_model}):")
        print(f"MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, Pearson Correlation: {test_pearson:.4f}")

        # Store results
        results.append({
            "method": current_method,
            "model": reg_model,
            "pearson": test_pearson,
            "test_predictions": model.predict(x_test_features),
            "test_features": x_test_features,
            "x_test1": x_test1,
            "x_test2": x_test2,
            "y_test": y_test
        })

# Save all results as CSV files (for both validation and test)
for i, result in enumerate(results, start=1):
    # Retrieve original raw sentences for validation and test
    raw_valid1 = [item[0] for item in valid_data]  # Original raw Sentence 1 for validation
    raw_valid2 = [item[1] for item in valid_data]  # Original raw Sentence 2 for validation
    raw_test1 = [item[0] for item in test_data]  # Original raw Sentence 1 for test
    raw_test2 = [item[1] for item in test_data]  # Original raw Sentence 2 for test
    val_predictions = model.predict(x_valid_features)  # Prediksi untuk data validasi
    # Create DataFrame for validation predictions
    val_df = pd.DataFrame({
        "Original Sentence 1": raw_valid1,  # Append raw sentence 1
        "Original Sentence 2": raw_valid2,  # Append raw sentence 2
        "Preprocessed Sentence 1": x_valid1,  # Preprocessed validation sentence 1
        "Preprocessed Sentence 2": x_valid2,  # Preprocessed validation sentence 2
        "True Similarity Score": [y * 5 for y in y_valid],  # Rescale validation true scores
        "Predicted Similarity Score": [y * 5 for y in val_predictions]  # Validation predictions rescaled
    })

    # Save validation result CSV
    val_filename = f"val_result_{i}_{result['method']}_{result['model']}.csv"
    val_df.to_csv(val_filename, index=False)
    print(f"Saved validation result: {val_filename}")

    # Create DataFrame for test predictions
    test_df = pd.DataFrame({
        "Original Sentence 1": raw_test1,  # Append raw sentence 1
        "Original Sentence 2": raw_test2,  # Append raw sentence 2
        "Preprocessed Sentence 1": result["x_test1"],  # Preprocessed test sentence 1
        "Preprocessed Sentence 2": result["x_test2"],  # Preprocessed test sentence 2
        "True Similarity Score": [y * 5 for y in result["y_test"]],  # Rescale to [0, 5]
        "Predicted Similarity Score": [y * 5 for y in result["test_predictions"]]  # Predicted scores rescaled
    })

    # Save test result CSV
    test_filename = f"test_result_{i}_{result['method']}_{result['model']}.csv"
    test_df.to_csv(test_filename, index=False)
    print(f"Saved test result: {test_filename}")

Using feature extraction method: averageCosine
Saved linear model using averageCosine feature extraction: trained_model_averageCosine_linear.pkl
Trained regression model saved successfully!
Validation Performance (averageCosine, linear):
MSE: 0.0712, MAE: 0.2242, Pearson Correlation: 0.7013
Test Performance (averageCosine, linear):
MSE: 0.0645, MAE: 0.2127, Pearson Correlation: 0.7465
Saved svr model using averageCosine feature extraction: trained_model_averageCosine_svr.pkl
Trained regression model saved successfully!
Validation Performance (averageCosine, svr):
MSE: 0.0728, MAE: 0.2186, Pearson Correlation: 0.7013
Test Performance (averageCosine, svr):
MSE: 0.0646, MAE: 0.2051, Pearson Correlation: 0.7465
Saved rfr model using averageCosine feature extraction: trained_model_averageCosine_rfr.pkl
Trained regression model saved successfully!
Validation Performance (averageCosine, rfr):
MSE: 0.0266, MAE: 0.1037, Pearson Correlation: 0.8999
Test Performance (averageCosine, rfr):
MSE: 0.0

In [26]:
import pickle
import numpy as np
import pandas as pd
import random
import nltk
from gensim.models import Word2Vec
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

# Ensure required NLTK data is downloaded
nltk.download('punkt')

# Load pre-trained Word2Vec model
word2vec_model_path = "/kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model"
embedding = Word2Vec.load(word2vec_model_path)

# Load dataset
def load_dataset(filename):
    df = pd.read_csv(filename)
    return [(row['answer'], row['response'], row['label'] / 5.0) for _, row in df.iterrows()]

# Preprocessing
def preprocess_text_indo(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha()]
    return " ".join(tokens)

# Feature Extraction Methods
def avg_cosine_similarity(sentences1, sentences2, embedding):
    similarities = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [token for token in sent1.split() if token in embedding.wv]
        tokens2 = [token for token in sent2.split() if token in embedding.wv]

        if not tokens1 or not tokens2:
            similarities.append(0)
            continue

        vec1 = np.mean([embedding.wv[token] for token in tokens1], axis=0)
        vec2 = np.mean([embedding.wv[token] for token in tokens2], axis=0)

        similarities.append(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))
    return np.array(similarities).reshape(-1, 1)

def word_distance(sentences1, sentences2, embedding):
    similarities = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [word for word in sent1.split() if word in embedding.wv]
        tokens2 = [word for word in sent2.split() if word in embedding.wv]

        if not tokens1 or not tokens2:
            similarities.append(0)
        else:
            similarities.append(-embedding.wv.wmdistance(tokens1, tokens2))
    return np.array(similarities).reshape(-1, 1)

def sif_cos(sentences1, sentences2, embedding):
    from sklearn.decomposition import TruncatedSVD

    def eliminate_first_component(matrix):
        svd_model = TruncatedSVD(n_components=1, random_state=42)
        svd_model.fit(matrix)
        principal_component = svd_model.components_
        return matrix - matrix.dot(principal_component.T) * principal_component

    similarities = []
    all_embeddings = []
    
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [word for word in sent1.split() if word in embedding.wv]
        tokens2 = [word for word in sent2.split() if word in embedding.wv]

        if not tokens1 or not tokens2:
            similarities.append(0)
            continue

        embedding1 = np.mean([embedding.wv[word] for word in tokens1], axis=0)
        embedding2 = np.mean([embedding.wv[word] for word in tokens2], axis=0)

        all_embeddings.extend([embedding1, embedding2])

    all_embeddings = np.array(all_embeddings)
    all_embeddings = eliminate_first_component(all_embeddings)

    for i in range(0, len(all_embeddings), 2):
        similarities.append(
            np.dot(all_embeddings[i], all_embeddings[i + 1]) /
            (np.linalg.norm(all_embeddings[i]) * np.linalg.norm(all_embeddings[i + 1]))
        )
    
    return np.array(similarities).reshape(-1, 1)

# Compute Metrics
def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    pearson_corr, _ = pearsonr(y_true, y_pred) if len(y_true) > 1 else (0, None)

    print(f"📊 Evaluation Metrics:")
    print(f"✅ MAE: {mae:.4f}, RMSE: {rmse:.4f}, Pearson Correlation: {pearson_corr:.4f}")
    
    return mae, rmse, pearson_corr

# Predict on Unseen Data
def predict_unseen_data(unseen_file_path):
    unseen_data = pd.read_csv(unseen_file_path)

    # Get true labels (if available)
    if "label" in unseen_data.columns:
        y_true = unseen_data["label"].values / 5.0
    else:
        y_true = None

    unseen_sentence1 = [preprocess_text_indo(item) for item in unseen_data["answer"]]
    unseen_sentence2 = [preprocess_text_indo(item) for item in unseen_data["response"]]

    results = []

    for method in ["averageCosine", "wordDis", "sifCos"]:
        if method == "averageCosine":
            unseen_features = avg_cosine_similarity(unseen_sentence1, unseen_sentence2, embedding)
        elif method == "wordDis":
            unseen_features = word_distance(unseen_sentence1, unseen_sentence2, embedding)
        elif method == "sifCos":
            unseen_features = sif_cos(unseen_sentence1, unseen_sentence2, embedding)

        for model_type in ["linear", "svr", "rfr"]:
            model_filename = f"trained_model_{method}_{model_type}.pkl"

            with open(model_filename, "rb") as file:
                model = pickle.load(file)
            
            predictions = model.predict(unseen_features)
            predictions_rescaled = predictions * 5  # Scale predictions to [0, 5]

            # Save Predictions
            df = pd.DataFrame({
                "Original Sentence 1": unseen_data["answer"],
                "Original Sentence 2": unseen_data["response"],
                "True Label": unseen_data["label"] if y_true is not None else "N/A",  # Keep original scale
                "Predicted Similarity Score": predictions_rescaled
            })

            output_filename = f"unseen_predictions_{method}_{model_type}.csv"
            df.to_csv(output_filename, index=False)
            print(f"✅ Saved unseen predictions to {output_filename}")

            # Compute Metrics if True Labels Exist
            if y_true is not None:
                mae, rmse, pearson_corr = compute_metrics(y_true * 5, predictions_rescaled)
                results.append({
                    "Method": method,
                    "Model": model_type,
                    "MAE": mae,
                    "RMSE": rmse,
                    "Pearson": pearson_corr
                })

    # Save All Metrics to CSV
    if results:
        results_df = pd.DataFrame(results)
        results_df.to_csv("unseen_metrics_summary.csv", index=False)
        print("📌 Saved all evaluation metrics to unseen_metrics_summary.csv")

# Run predictions and evaluation on Unseen Data
unseen_file_path = "/kaggle/input/test-data/test-BuIng.csv"
predict_unseen_data(unseen_file_path)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
✅ Saved unseen predictions to unseen_predictions_averageCosine_linear.csv
📊 Evaluation Metrics:
✅ MAE: 1.1494, RMSE: 1.3528, Pearson Correlation: -0.1286
✅ Saved unseen predictions to unseen_predictions_averageCosine_svr.csv
📊 Evaluation Metrics:
✅ MAE: 1.1645, RMSE: 1.3619, Pearson Correlation: -0.1286
✅ Saved unseen predictions to unseen_predictions_averageCosine_rfr.csv
📊 Evaluation Metrics:
✅ MAE: 1.6312, RMSE: 1.9357, Pearson Correlation: -0.1203
✅ Saved unseen predictions to unseen_predictions_wordDis_linear.csv
📊 Evaluation Metrics:
✅ MAE: 1.7677, RMSE: 2.1014, Pearson Correlation: -0.0211
✅ Saved unseen predictions to unseen_predictions_wordDis_svr.csv
📊 Evaluation Metrics:
✅ MAE: 1.9329, RMSE: 2.2524, Pearson Correlation: -0.0211
✅ Saved unseen predictions to unseen_predictions_wordDis_rfr.csv
📊 Evaluation Metrics:
✅ MAE: 1.7786, RMSE: 2.1070, Pearson Correlation

In [29]:
import pickle
import numpy as np
import pandas as pd
import random
import nltk
from gensim.models import Word2Vec
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

# Ensure required NLTK data is downloaded
nltk.download('punkt')

# Load pre-trained Word2Vec model
word2vec_model_path = "/kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model"
embedding = Word2Vec.load(word2vec_model_path)

# Load dataset
def load_dataset(filename):
    df = pd.read_csv(filename)
    return [(row['answer'], row['response'], row['label'] / 5.0) for _, row in df.iterrows()]

# Preprocessing
def preprocess_text_indo(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    stopword_factory = StopWordRemoverFactory()
    stopword_remover = stopword_factory.create_stop_word_remover()
    tokens = [stopword_remover.remove(word) for word in tokens if word.isalpha()]
    return " ".join(tokens)

# Feature Extraction Methods
def avg_cosine_similarity(sentences1, sentences2, embedding):
    similarities = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [token for token in sent1.split() if token in embedding.wv]
        tokens2 = [token for token in sent2.split() if token in embedding.wv]

        if not tokens1 or not tokens2:
            similarities.append(0)
            continue

        vec1 = np.mean([embedding.wv[token] for token in tokens1], axis=0)
        vec2 = np.mean([embedding.wv[token] for token in tokens2], axis=0)

        similarities.append(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))
    return np.array(similarities).reshape(-1, 1)

def word_distance(sentences1, sentences2, embedding):
    similarities = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [word for word in sent1.split() if word in embedding.wv]
        tokens2 = [word for word in sent2.split() if word in embedding.wv]

        if not tokens1 or not tokens2:
            similarities.append(0)
        else:
            similarities.append(-embedding.wv.wmdistance(tokens1, tokens2))
    return np.array(similarities).reshape(-1, 1)

def sif_cos(sentences1, sentences2, embedding):
    from sklearn.decomposition import TruncatedSVD

    def eliminate_first_component(matrix):
        svd_model = TruncatedSVD(n_components=1, random_state=42)
        svd_model.fit(matrix)
        principal_component = svd_model.components_
        return matrix - matrix.dot(principal_component.T) * principal_component

    similarities = []
    all_embeddings = []
    
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [word for word in sent1.split() if word in embedding.wv]
        tokens2 = [word for word in sent2.split() if word in embedding.wv]

        if not tokens1 or not tokens2:
            similarities.append(0)
            continue

        embedding1 = np.mean([embedding.wv[word] for word in tokens1], axis=0)
        embedding2 = np.mean([embedding.wv[word] for word in tokens2], axis=0)

        all_embeddings.extend([embedding1, embedding2])

    all_embeddings = np.array(all_embeddings)
    all_embeddings = eliminate_first_component(all_embeddings)

    for i in range(0, len(all_embeddings), 2):
        similarities.append(
            np.dot(all_embeddings[i], all_embeddings[i + 1]) /
            (np.linalg.norm(all_embeddings[i]) * np.linalg.norm(all_embeddings[i + 1]))
        )
    
    return np.array(similarities).reshape(-1, 1)

# Compute Metrics
def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    pearson_corr, _ = pearsonr(y_true, y_pred) if len(y_true) > 1 else (0, None)

    print(f"📊 Evaluation Metrics:")
    print(f"✅ MAE: {mae:.4f}, RMSE: {rmse:.4f}, Pearson Correlation: {pearson_corr:.4f}")
    
    return mae, rmse, pearson_corr

# Predict on Unseen Data
def predict_unseen_data(unseen_file_path):
    unseen_data = pd.read_csv(unseen_file_path)

    # Get true labels (if available)
    if "label" in unseen_data.columns:
        y_true = unseen_data["label"].values / 5.0
    else:
        y_true = None

    unseen_sentence1 = [preprocess_text_indo(item) for item in unseen_data["answer"]]
    unseen_sentence2 = [preprocess_text_indo(item) for item in unseen_data["response"]]

    results = []

    for method in ["averageCosine", "wordDis", "sifCos"]:
        if method == "averageCosine":
            unseen_features = avg_cosine_similarity(unseen_sentence1, unseen_sentence2, embedding)
        elif method == "wordDis":
            unseen_features = word_distance(unseen_sentence1, unseen_sentence2, embedding)
        elif method == "sifCos":
            unseen_features = sif_cos(unseen_sentence1, unseen_sentence2, embedding)

        for model_type in ["linear", "svr", "rfr"]:
            model_filename = f"trained_model_{method}_{model_type}.pkl"

            with open(model_filename, "rb") as file:
                model = pickle.load(file)
            
            predictions = model.predict(unseen_features)
            predictions_rescaled = predictions * 5  # Scale predictions to [0, 5]

            # Save Predictions
            df = pd.DataFrame({
                "Original Sentence 1": unseen_data["answer"],
                "Original Sentence 2": unseen_data["response"],
                "True Label": unseen_data["label"] if y_true is not None else "N/A",  # Keep original scale
                "Predicted Similarity Score": predictions_rescaled
            })

            output_filename = f"unseen_predictions_{method}_{model_type}.csv"
            df.to_csv(output_filename, index=False)
            print(f"✅ Saved unseen predictions to {output_filename}")

            # Compute Metrics if True Labels Exist
            if y_true is not None:
                mae, rmse, pearson_corr = compute_metrics(y_true * 5, predictions_rescaled)
                results.append({
                    "Method": method,
                    "Model": model_type,
                    "MAE": mae,
                    "RMSE": rmse,
                    "Pearson": pearson_corr
                })

    # Save All Metrics to CSV
    if results:
        results_df = pd.DataFrame(results)
        results_df.to_csv("unseen_metrics_summary.csv", index=False)
        print("📌 Saved all evaluation metrics to unseen_metrics_summary.csv")

# Run predictions and evaluation on Unseen Data
unseen_file_path = "/kaggle/input/test-data/test-BuIng.csv"
predict_unseen_data(unseen_file_path)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
✅ Saved unseen predictions to unseen_predictions_averageCosine_linear.csv
📊 Evaluation Metrics:
✅ MAE: 1.4315, RMSE: 1.7809, Pearson Correlation: 0.0445
✅ Saved unseen predictions to unseen_predictions_averageCosine_svr.csv
📊 Evaluation Metrics:
✅ MAE: 1.5109, RMSE: 1.8687, Pearson Correlation: 0.0445
✅ Saved unseen predictions to unseen_predictions_averageCosine_rfr.csv
📊 Evaluation Metrics:
✅ MAE: 2.0459, RMSE: 2.3760, Pearson Correlation: 0.0661
✅ Saved unseen predictions to unseen_predictions_wordDis_linear.csv
📊 Evaluation Metrics:
✅ MAE: 1.9742, RMSE: 2.2705, Pearson Correlation: 0.1035
✅ Saved unseen predictions to unseen_predictions_wordDis_svr.csv
📊 Evaluation Metrics:
✅ MAE: 2.1563, RMSE: 2.4419, Pearson Correlation: 0.1035
✅ Saved unseen predictions to unseen_predictions_wordDis_rfr.csv
📊 Evaluation Metrics:
✅ MAE: 2.6015, RMSE: 3.2505, Pearson Correlation: -0.

In [27]:
import pickle
import pandas as pd
import numpy as np
import nltk
from gensim.models import Word2Vec
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

# Load pre-trained Word2Vec model
word2vec_model_path = "/kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model"
embedding = Word2Vec.load(word2vec_model_path)

# Load trained regression model
with open("/kaggle/working/trained_model_sifCos_rfr.pkl", "rb") as file:
    model = pickle.load(file)

print("Trained regression model loaded successfully!")

# ====== Text Preprocessing Function ======
def preprocess_text_indo(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    stopword_factory = StopWordRemoverFactory()
    stopword_remover = stopword_factory.create_stop_word_remover()
    tokens = [stopword_remover.remove(word) for word in tokens if word.isalpha()]
    return " ".join(tokens)

# ====== Cosine Similarity Feature Extraction ======
def avg_cosine_similarity(sentences1, sentences2, embedding):
    similarities = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [token for token in sent1.split() if token in embedding.wv]
        tokens2 = [token for token in sent2.split() if token in embedding.wv]
        
        if not tokens1 or not tokens2:
            similarities.append(0)
            continue
        
        vec1 = np.mean([embedding.wv[token] for token in tokens1], axis=0).reshape(1, -1)
        vec2 = np.mean([embedding.wv[token] for token in tokens2], axis=0).reshape(1, -1)
        
        similarities.append(np.dot(vec1.flatten(), vec2.flatten()) / 
                            (np.linalg.norm(vec1) * np.linalg.norm(vec2)))
    return np.array(similarities).reshape(-1, 1)

# ====== Compute Metrics ======
def compute_metrics(y_true, y_pred):
    """
    Compute MAE, RMSE, and Pearson correlation.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    pearson_corr, _ = pearsonr(y_true, y_pred)

    print(f"\n📊 Evaluation Metrics on Unseen Data:")
    print(f"✅ Mean Absolute Error (MAE): {mae:.4f}")
    print(f"✅ Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"✅ Pearson Correlation: {pearson_corr:.4f}")

    return mae, rmse, pearson_corr

# ====== Predict on Unseen Data and Compute Metrics ======
def predict_unseen_data(model, embedding, unseen_file_path, output_file):
    """
    Predict similarity scores on an unseen dataset and compute evaluation metrics.
    """
    unseen_data = pd.read_csv(unseen_file_path)

    # Check if labels are available for evaluation
    if "label" in unseen_data.columns:
        y_true = unseen_data["label"].values / 5.0  # Normalize labels to [0, 1]
    else:
        y_true = None

    # Preprocess unseen dataset
    unseen_sentence1 = [preprocess_text_indo(item) for item in unseen_data["answer"]]
    unseen_sentence2 = [preprocess_text_indo(item) for item in unseen_data["response"]]

    # Extract features using Word2Vec
    unseen_features = avg_cosine_similarity(unseen_sentence1, unseen_sentence2, embedding)

    # Predict similarity scores
    predictions = model.predict(unseen_features)

    # Save predictions to CSV
    df = pd.DataFrame({
        "Original Sentence 1": unseen_data["answer"],
        "Original Sentence 2": unseen_data["response"],
        "True Label": unseen_data["label"] if y_true is not None else "N/A",  # Keep original scale
        "Predicted Similarity Score": predictions * 5  # Rescale to [0, 5]
    })

    # Include true labels if available
    if y_true is not None:
        df["True Similarity Score"] = unseen_data["label"]  # Keep original scale [0, 5]
    
    df.to_csv(output_file, index=False)
    print(f"✅ Saved predictions to {output_file}")

    # Compute metrics if true labels are available
    if y_true is not None:
        y_true_rescaled = y_true *5  # Rescale true scores back to [0, 5]
        predictions_rescaled = predictions * 5  # Rescale predictions to [0, 5]
        return compute_metrics(y_true_rescaled, predictions_rescaled)
    else:
        print("⚠️ True labels not available. Skipping metric calculations.")
        return None, None, None

# ====== Run Prediction on Unseen Data ======
unseen_file_path = "/kaggle/input/test-data/test-BuIng.csv"
output_file = "trial-unseen_test_predictions.csv"

print("\n🚀 Predicting on Unseen Dataset...")
mae, rmse, pearson_corr = predict_unseen_data(model, embedding, unseen_file_path, output_file)


Trained regression model loaded successfully!

🚀 Predicting on Unseen Dataset...
✅ Saved predictions to trial-unseen_test_predictions.csv

📊 Evaluation Metrics on Unseen Data:
✅ Mean Absolute Error (MAE): 1.1005
✅ Root Mean Squared Error (RMSE): 1.3423
✅ Pearson Correlation: 0.1462


In [28]:
import pickle
import pandas as pd
import numpy as np
import nltk
from gensim.models import Word2Vec
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

# Load pre-trained Word2Vec model
word2vec_model_path = "/kaggle/input/id-w2v-model/pytorch/default/2/idwiki_word2vec_200_new_lower.model"
embedding = Word2Vec.load(word2vec_model_path)

# Load trained regression model
with open("/kaggle/working/trained_model_sifCos_svr.pkl", "rb") as file:
    model = pickle.load(file)

print("Trained regression model loaded successfully!")

# ====== Text Preprocessing Function ======
def preprocess_text_indo(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    stopword_factory = StopWordRemoverFactory()
    stopword_remover = stopword_factory.create_stop_word_remover()
    tokens = [stopword_remover.remove(word) for word in tokens if word.isalpha()]
    return " ".join(tokens)

# ====== Cosine Similarity Feature Extraction ======
def avg_cosine_similarity(sentences1, sentences2, embedding):
    similarities = []
    for sent1, sent2 in zip(sentences1, sentences2):
        tokens1 = [token for token in sent1.split() if token in embedding.wv]
        tokens2 = [token for token in sent2.split() if token in embedding.wv]
        
        if not tokens1 or not tokens2:
            similarities.append(0)
            continue
        
        vec1 = np.mean([embedding.wv[token] for token in tokens1], axis=0).reshape(1, -1)
        vec2 = np.mean([embedding.wv[token] for token in tokens2], axis=0).reshape(1, -1)
        
        similarities.append(np.dot(vec1.flatten(), vec2.flatten()) / 
                            (np.linalg.norm(vec1) * np.linalg.norm(vec2)))
    return np.array(similarities).reshape(-1, 1)

# ====== Compute Metrics ======
def compute_metrics(y_true, y_pred):
    """
    Compute MAE, RMSE, and Pearson correlation.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    pearson_corr, _ = pearsonr(y_true, y_pred)

    print(f"\n📊 Evaluation Metrics on Unseen Data:")
    print(f"✅ Mean Absolute Error (MAE): {mae:.4f}")
    print(f"✅ Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"✅ Pearson Correlation: {pearson_corr:.4f}")

    return mae, rmse, pearson_corr

# ====== Predict on Unseen Data and Compute Metrics ======
def predict_unseen_data(model, embedding, unseen_file_path, output_file):
    """
    Predict similarity scores on an unseen dataset and compute evaluation metrics.
    """
    unseen_data = pd.read_csv(unseen_file_path)

    # Check if labels are available for evaluation
    if "label" in unseen_data.columns:
        y_true = unseen_data["label"].values / 5.0  # Normalize labels to [0, 1]
    else:
        y_true = None

    # Preprocess unseen dataset
    unseen_sentence1 = [preprocess_text_indo(item) for item in unseen_data["answer"]]
    unseen_sentence2 = [preprocess_text_indo(item) for item in unseen_data["response"]]

    # Extract features using Word2Vec
    unseen_features = word_distance(unseen_sentence1, unseen_sentence2, embedding)

    # Predict similarity scores
    predictions = model.predict(unseen_features)

    # Save predictions to CSV
    df = pd.DataFrame({
        "Original Sentence 1": unseen_data["answer"],
        "Original Sentence 2": unseen_data["response"],
        "True Label": unseen_data["label"] if y_true is not None else "N/A",  # Keep original scale
        "Predicted Similarity Score": predictions * 5  # Rescale to [0, 5]
    })

    # Include true labels if available
    if y_true is not None:
        df["True Similarity Score"] = unseen_data["label"]  # Keep original scale [0, 5]
    
    df.to_csv(output_file, index=False)
    print(f"✅ Saved predictions to {output_file}")

    # Compute metrics if true labels are available
    if y_true is not None:
        y_true_rescaled = y_true *5  # Rescale true scores back to [0, 5]
        predictions_rescaled = predictions * 5  # Rescale predictions to [0, 5]
        return compute_metrics(y_true_rescaled, predictions_rescaled)
    else:
        print("⚠️ True labels not available. Skipping metric calculations.")
        return None, None, None

# ====== Run Prediction on Unseen Data ======
unseen_file_path = "/kaggle/input/test-data/test-BuIng.csv"
output_file = "trial-unseen_test_predictions.csv"

print("\n🚀 Predicting on Unseen Dataset...")
mae, rmse, pearson_corr = predict_unseen_data(model, embedding, unseen_file_path, output_file)


Trained regression model loaded successfully!

🚀 Predicting on Unseen Dataset...
✅ Saved predictions to trial-unseen_test_predictions.csv

📊 Evaluation Metrics on Unseen Data:
✅ Mean Absolute Error (MAE): 7.8196
✅ Root Mean Squared Error (RMSE): 7.9048
✅ Pearson Correlation: 0.1035
