In [None]:
# Install Required Packages
!pip install numpy
!pip install pandas
!pip install lightgbm
!pip install xgboost
!pip install catboost
!pip install scikit-learn
!pip install nltk
!pip install rapidfuzz
!pip install tensorflow
!pip install gensim
!pip install sentence-transformers
!pip install spaCy
# Add more libraries as needed
# !pip install <library_name>

In [15]:
# Import Libraries
import string

import numpy as np
import pandas as pd
import spacy
# import nltk
# from nltk.corpus import stopwords, wordnet
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

from rapidfuzz.distance import Levenshtein

# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Input, Lambda, Embedding, Bidirectional, GlobalMaxPool1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow.keras.backend as K

# Download NLTK Resources
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')


In [4]:
# Text Preprocessing Class


class TextPreprocessor:
    """Normalise raw text into a space-separated string of lemmas."""

    def __init__(self):
        """Load spaCy's small English pipeline once so it can be reused per call."""
        self.nlp = spacy.load("en_core_web_sm")

    def preprocess(self, text):
        """Lower-case ``text``, run it through spaCy and keep the useful lemmas.

        Tokens flagged by spaCy as stop words or punctuation are discarded;
        the lemmas of the remaining tokens are joined with single spaces.

        Args:
            text: Raw input string.

        Returns:
            The cleaned, lemmatised text as one string.
        """
        kept_lemmas = []
        for tok in self.nlp(text.lower()):
            if tok.is_stop or tok.is_punct:
                continue
            kept_lemmas.append(tok.lemma_)
        return ' '.join(kept_lemmas)

In [5]:
#get similarity features
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine, cityblock

class SemanticSimilarity:
    """Sentence-embedding based similarity features for a pair of texts."""

    def __init__(self, model_name="all-MiniLM-L6-v2", device='cuda'):
        """Load the SentenceTransformer model on the requested device.

        Args:
            model_name: Name of the SentenceTransformer model to load.
            device: Preferred torch device string. When a CUDA device is
                requested but CUDA is not available, the device choice is
                left to SentenceTransformer's own auto-detection instead of
                failing, so CPU-only sessions keep working.
        """
        # Local import: torch is already a dependency of sentence-transformers
        # and is only needed here to probe CUDA availability.
        import torch

        cuda_requested = isinstance(device, str) and device.startswith('cuda')
        if cuda_requested and not torch.cuda.is_available():
            device = None  # let the library pick an available device
        self.model = SentenceTransformer(model_name, device=device)

    def get_embedding(self, text):
        """Compute and return the sentence embedding for ``text`` as a NumPy array."""
        return self.model.encode(text, convert_to_numpy=True)

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity (1 - cosine distance) of two embeddings."""
        return 1 - cosine(vec1, vec2)

    def manhattan_distance(self, vec1, vec2):
        """Return the Manhattan (L1) distance between two embeddings."""
        return cityblock(vec1, vec2)

    def get_all_features(self, text1, text2):
        """Embed both texts and return their similarity features.

        Args:
            text1: First text of the pair.
            text2: Second text of the pair.

        Returns:
            A dict with the keys ``"cosine_similarity"`` and
            ``"manhattan_distance"``.
        """
        vec1 = self.get_embedding(text1)
        vec2 = self.get_embedding(text2)

        return {
            "cosine_similarity": self.cosine_similarity(vec1, vec2),
            "manhattan_distance": self.manhattan_distance(vec1, vec2),
        }



In [16]:
# Similarity Model Class
class SimilarityModel:
    """Binary paraphrase classifier with a selectable back-end.

    ``model_type`` selects one of:

    * 'lightgbm', 'xgboost', 'random_forest', 'svm', 'logistic_regression',
      'catboost' -- fitted on the numeric feature matrix;
    * 'neural_network' -- a small dense Keras network on the same features;
    * 'lstm' -- a bidirectional LSTM over the two texts joined by a space;
    * 'siamese' -- one shared LSTM encoder applied to each text separately.

    The two text models ('lstm', 'siamese') do not use the feature matrix;
    they are trained from ``raw_text_pairs``. The tokenizer fitted during
    ``train`` is stored on the instance and reused by ``predict`` so that
    word indices at prediction time match the trained embedding.
    """

    # Token sequences are padded/truncated to this length for the text models.
    MAX_SEQUENCE_LENGTH = 100

    # Model types trained through a scikit-learn style fit(X, y) call.
    CLASSICAL_MODELS = (
        'lightgbm', 'xgboost', 'random_forest', 'svm', 'logistic_regression', 'catboost',
    )

    def __init__(self, model_type='lightgbm'):
        """Remember the requested back-end; nothing is built until ``train``."""
        self.model_type = model_type
        self.model = None
        # Set by train() for 'lstm' / 'siamese'; stays None for other types.
        self.tokenizer = None

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _require_text_pairs(raw_text_pairs):
        """Raise ValueError when a text model is used without text pairs."""
        if raw_text_pairs is None:
            raise ValueError(
                "raw_text_pairs is required for the 'lstm' and 'siamese' model types."
            )

    def _build_classical_model(self):
        """Instantiate the estimator matching ``self.model_type``."""
        factories = {
            'lightgbm': lambda: lgb.LGBMClassifier(),
            'xgboost': lambda: xgb.XGBClassifier(),
            'random_forest': lambda: RandomForestClassifier(),
            'svm': lambda: SVC(probability=True),
            'logistic_regression': lambda: LogisticRegression(),
            'catboost': lambda: cb.CatBoostClassifier(verbose=0),
        }
        return factories[self.model_type]()

    def _to_padded(self, texts):
        """Convert texts to fixed-length index sequences with the trained tokenizer."""
        if self.tokenizer is None:
            raise RuntimeError("No fitted tokenizer available; call train() first.")
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.MAX_SEQUENCE_LENGTH)

    # ----------------------------------------------------------------- training

    def train(self, X_train, y_train, raw_text_pairs=None):
        """Build and fit the model selected by ``self.model_type``.

        Args:
            X_train: Numeric feature matrix (used by the classical models and
                'neural_network'; unused by 'lstm' and 'siamese').
            y_train: Binary labels.
            raw_text_pairs: Sequence of ``(text1, text2)`` pairs aligned with
                ``y_train``; required for 'lstm' and 'siamese'.

        Raises:
            ValueError: If the model type is unsupported, or a text model is
                requested without ``raw_text_pairs``.
        """
        max_len = self.MAX_SEQUENCE_LENGTH

        if self.model_type in self.CLASSICAL_MODELS:
            self.model = self._build_classical_model()
            self.model.fit(X_train, y_train)

        elif self.model_type == 'neural_network':
            self.model = Sequential([
                Dense(64, input_dim=X_train.shape[1], activation='relu'),
                Dense(32, activation='relu'),
                Dense(1, activation='sigmoid')
            ])
            self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            self.model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

        elif self.model_type == 'lstm':
            self._require_text_pairs(raw_text_pairs)
            # Each pair becomes one text: "<text1> <text2>".
            texts = [pair[0] + ' ' + pair[1] for pair in raw_text_pairs]
            self.tokenizer = Tokenizer()
            self.tokenizer.fit_on_texts(texts)
            padded_sequences = self._to_padded(texts)

            self.model = Sequential([
                # +1 because Tokenizer word indices start at 1 (0 is the padding value).
                Embedding(input_dim=len(self.tokenizer.word_index) + 1, output_dim=100, input_length=max_len),
                Bidirectional(LSTM(64)),
                Dense(1, activation='sigmoid')
            ])
            self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
            self.model.fit(padded_sequences, y_train, epochs=10, batch_size=32, verbose=0)

        elif self.model_type == 'siamese':
            self._require_text_pairs(raw_text_pairs)
            device = '/device:CPU:0'
            with tf.device(device):
                # Tokenization and padding: one vocabulary shared by both sides.
                first_texts = [pair[0] for pair in raw_text_pairs]
                second_texts = [pair[1] for pair in raw_text_pairs]
                self.tokenizer = Tokenizer()
                self.tokenizer.fit_on_texts(first_texts + second_texts)
                seqs_a = self._to_padded(first_texts)
                seqs_b = self._to_padded(second_texts)

                # Define the two input layers
                input_a = Input(shape=(max_len,))
                input_b = Input(shape=(max_len,))

                # Embedding layer (shared by both inputs)
                embedding_layer = Embedding(input_dim=len(self.tokenizer.word_index) + 1, output_dim=100, input_length=max_len)
                encoded_a = embedding_layer(input_a)
                encoded_b = embedding_layer(input_b)

                # Shared LSTM encoder with L2 kernel regularization
                shared_lstm = LSTM(64,
                                   activation='tanh',
                                   recurrent_activation='sigmoid',
                                   use_bias=True,
                                   return_sequences=False,
                                   stateful=False,
                                   implementation=1,
                                   kernel_regularizer=regularizers.l2(0.01))

                processed_a = shared_lstm(encoded_a)
                processed_b = shared_lstm(encoded_b)

                # Dropout (50%) after the LSTM to reduce overfitting
                dropout = Dropout(0.5)
                processed_a = dropout(processed_a)
                processed_b = dropout(processed_b)

                # Element-wise absolute difference of the two encodings
                distance = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))([processed_a, processed_b])
                output = Dense(1, activation='sigmoid')(distance)

                # Compile the model
                self.model = Model(inputs=[input_a, input_b], outputs=output)
                self.model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

                # Stop when validation loss has not improved for 3 epochs and
                # roll back to the best weights seen.
                early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

                # Fit the model with early stopping
                self.model.fit([seqs_a, seqs_b], y_train, epochs=10, batch_size=32, verbose=1, validation_split=0.2, callbacks=[early_stopping])

        else:
            raise ValueError(f"Unsupported model type: {self.model_type}")

    # --------------------------------------------------------------- prediction

    def predict(self, X_test, raw_text_pairs=None):
        """Predict labels with the trained model.

        Args:
            X_test: Numeric feature matrix (unused by 'lstm' and 'siamese').
            raw_text_pairs: Sequence of ``(text1, text2)`` pairs; required for
                'lstm' and 'siamese'.

        Returns:
            For the Keras models, the sigmoid outputs thresholded at 0.5 and
            cast to int32; otherwise whatever the underlying estimator's
            ``predict`` returns.

        Raises:
            RuntimeError: If called before ``train``.
            ValueError: If a text model is used without ``raw_text_pairs``.
        """
        if self.model is None:
            raise RuntimeError("Model has not been trained; call train() first.")

        if self.model_type == 'neural_network':
            return (self.model.predict(X_test) > 0.5).astype("int32")

        if self.model_type == 'lstm':
            self._require_text_pairs(raw_text_pairs)
            texts = [pair[0] + ' ' + pair[1] for pair in raw_text_pairs]
            # Reuse the tokenizer fitted in train(); a fresh one would map
            # words to different (or no) indices.
            padded_sequences = self._to_padded(texts)
            return (self.model.predict(padded_sequences) > 0.5).astype("int32")

        if self.model_type == 'siamese':
            self._require_text_pairs(raw_text_pairs)
            seqs_a = self._to_padded([pair[0] for pair in raw_text_pairs])
            seqs_b = self._to_padded([pair[1] for pair in raw_text_pairs])
            return (self.model.predict([seqs_a, seqs_b]) > 0.5).astype("int32")

        return self.model.predict(X_test)

    def evaluate(self, X_test, y_test, raw_text_pairs=None):
        """Return the accuracy of ``predict`` on the given test data."""
        y_pred = self.predict(X_test, raw_text_pairs)
        return accuracy_score(y_test, y_pred)

In [8]:
# Load Data
# Location of the tab-separated MSR Paraphrase training file.
DATA_PATH = '/content/msr_paraphrase_train.txt'

# Explicit encoding so the result does not depend on the platform default;
# 'utf-8-sig' also tolerates a leading byte-order mark.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm against the data source.
with open(DATA_PATH, 'r', encoding='utf-8-sig') as file:
    data = file.readlines()

# The first line is skipped (presumably the column header). Each remaining
# line is split on tabs into 5 fields: label, two unused fields, text1, text2.
data1 = []
for item in data[1:]:
    fields = item.rstrip('\r\n').split('\t')  # drop the line ending from text2
    if len(fields) != 5:
        continue  # blank or malformed line: cannot be unpacked below
    data1.append(fields)

# Preprocessing
preprocessor = TextPreprocessor()
features = SemanticSimilarity()

processed_data = []
raw_text_pairs = []

for label, _, _, text1, text2 in data1:
    text1_processed = preprocessor.preprocess(text1)
    text2_processed = preprocessor.preprocess(text2)
    feature_vector = features.get_all_features(text1_processed, text2_processed)
    feature_vector['label'] = int(label)
    processed_data.append(feature_vector)
    # The text models are trained on the unprocessed sentences.
    raw_text_pairs.append((text1, text2))

# Prepare DataFrame
df = pd.DataFrame(processed_data)
X = df.drop(columns=['label'])
y = df['label']

# Split Data
# One call splits features, labels and raw pairs with the same indices, so the
# three stay row-aligned by construction.
X_train, X_test, y_train, y_test, train_raw_pairs, test_raw_pairs = train_test_split(
    X, y, raw_text_pairs, test_size=0.2, random_state=42
)
assert len(train_raw_pairs) == len(X_train) == len(y_train)
assert len(test_raw_pairs) == len(X_test) == len(y_test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
# Fit the XGBoost back-end on the training split, then report held-out accuracy.
# Swap the model_type string to try a different back-end.
model = SimilarityModel(model_type='xgboost')
model.train(X_train, y_train, train_raw_pairs)

print("Accuracy:", model.evaluate(X_test, y_test, test_raw_pairs))

Accuracy: 0.7316176470588235


In [None]:
# Display the training feature matrix produced by the split above.
X_train

In [None]:
# Show the predictions of the current `model` for the held-out features.
# NOTE(review): uses whichever `model` was trained most recently in the kernel;
# for the 'lstm' / 'siamese' types predict() also needs raw_text_pairs.
model.predict(X_test)

In [19]:
# Fit the LightGBM back-end on the training split, then report held-out accuracy.
# Swap the model_type string to try a different back-end.
model = SimilarityModel(model_type='lightgbm')
model.train(X_train, y_train, train_raw_pairs)

print("Accuracy:", model.evaluate(X_test, y_test, test_raw_pairs))

[LightGBM] [Info] Number of positive: 2177, number of negative: 1083
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000557 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 3260, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.667791 -> initscore=0.698213
[LightGBM] [Info] Start training from score 0.698213
Accuracy: 0.7377450980392157


In [17]:
# Fit the siamese LSTM back-end (trained from the raw text pairs), then report
# held-out accuracy.
model = SimilarityModel(model_type='siamese')
model.train(X_train, y_train, train_raw_pairs)

print("Accuracy:", model.evaluate(X_test, y_test, test_raw_pairs))

Epoch 1/10




[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 235ms/step - accuracy: 0.6543 - loss: 1.6038 - val_accuracy: 0.6933 - val_loss: 0.8352
Epoch 2/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 254ms/step - accuracy: 0.6610 - loss: 0.7219 - val_accuracy: 0.6933 - val_loss: 0.6837
Epoch 3/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 255ms/step - accuracy: 0.6798 - loss: 0.5353 - val_accuracy: 0.6840 - val_loss: 0.6658
Epoch 4/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 252ms/step - accuracy: 0.7800 - loss: 0.5508 - val_accuracy: 0.6779 - val_loss: 0.6946
Epoch 5/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 246ms/step - accuracy: 0.7857 - loss: 0.4350 - val_accuracy: 0.6871 - val_loss: 0.6763
Epoch 6/10
[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 223ms/step - accuracy: 0.9480 - loss: 0.2751 - val_accuracy: 0.6104 - val_loss: 0.7098
[1m26/26[0m [32m━━━━━━━━━━━━━━

In [21]:
# Fit the SVM back-end on the training split, then report held-out accuracy.
# Swap the model_type string to try a different back-end.
model = SimilarityModel(model_type='svm')
model.train(X_train, y_train, train_raw_pairs)

print("Accuracy:", model.evaluate(X_test, y_test, test_raw_pairs))

Accuracy: 0.7598039215686274
