# 3.3.1. TF-IDF with Random Forest for Sentence Similarity

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import numpy as np


In [25]:
data_path = "/Users/aswathshakthi/PycharmProjects/MLOps/Semantic Analysis/ClinicalSTS/clinic_c.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Sent1,Sent2,Score
0,Insulin NPH Human [NOVOLIN N] 100 unit/mL susp...,Insulin NPH Human [NOVOLIN N] 100 unit/mL sus...,3.5
1,"Patient arrives ambulatory, Gait steady, Hist...","Complex assessment performed, Patient arrives...",2.5
2,"Peripheral IV site, established in the right ...","Peripheral IV site, present prior to arrival,...",3.45
3,No: new confusion or inability to stay alert ...,No: new confusion or inability to stay alert ...,4.0
4,Spent 15 minutes with the patient and greater ...,"Nurse visit ten minutes, over half of which w...",3.0


In [26]:
# Split into train/test sets
train_text1 = df["Sent1"][:750].tolist()
train_text2 = df["Sent2"][:750].tolist()
train_labels = df["Score"][:750].tolist()

test_text1 = df["Sent1"][750:].tolist()
test_text2 = df["Sent2"][750:].tolist()
test_labels = df["Score"][750:].tolist()


In [27]:
# Combine Sent1 and Sent2 for unified vectorization
vectorizer = TfidfVectorizer()
vectorizer.fit(df["Sent1"].tolist() + df["Sent2"].tolist())

# Transform train and test sets
train_vecs1 = vectorizer.transform(train_text1)
train_vecs2 = vectorizer.transform(train_text2)
test_vecs1 = vectorizer.transform(test_text1)
test_vecs2 = vectorizer.transform(test_text2)

# Store feature names for analysis
features = vectorizer.get_feature_names_out()
print(f"TF-IDF Vocabulary Size: {len(features)}")


TF-IDF Vocabulary Size: 2727


In [28]:
# Train on vectorized Sent1 and Sent2
regressor1 = RandomForestRegressor(max_depth=6).fit(train_vecs1, train_labels)
regressor2 = RandomForestRegressor(max_depth=6).fit(train_vecs2, train_labels)


In [29]:
# Predict using Random Forest models
test_pred1 = regressor1.predict(test_vecs1)
test_pred2 = regressor2.predict(test_vecs2)

# Calculate Mean Squared Error (MSE)
mse1 = mean_squared_error(test_labels, test_pred1)
mse2 = mean_squared_error(test_labels, test_pred2)

print(f"MSE for Sentence 1: {mse1}")
print(f"MSE for Sentence 2: {mse2}")


MSE for Sentence 1: 1.0213239593606231
MSE for Sentence 2: 0.9275877638890987


In [30]:
def calculate_cosine_similarity(vecs1, vecs2):
    """Calculate cosine similarity for paired vectors."""
    similarities = []
    for i in range(vecs1.shape[0]):
        similarity = cosine_similarity(vecs1[i], vecs2[i])[0][0]
        similarities.append(similarity)
    return similarities

# Compute cosine similarity for train and test sets
cosine_sim_train = calculate_cosine_similarity(train_vecs1, train_vecs2)
cosine_sim_test = calculate_cosine_similarity(test_vecs1, test_vecs2)

print(f"Sample Train Cosine Similarities: {cosine_sim_train[:5]}")
print(f"Sample Test Cosine Similarities: {cosine_sim_test[:5]}")


Sample Train Cosine Similarities: [0.8396094022614857, 0.5750711628121535, 0.5882593452211883, 0.839443924863119, 0.3368920240532798]
Sample Test Cosine Similarities: [0.5541670279829262, 0.36825124025781364, 0.5574139946235234, 0.5077581484948848, 0.48845172585655505]


In [31]:
# Correlation for Train Data
train_corr, _ = pearsonr(cosine_sim_train, train_labels)
print(f"Pearson Correlation for Train: {train_corr:.5f}")

# Correlation for Test Data
test_corr, _ = pearsonr(cosine_sim_test, test_labels)
print(f"Pearson Correlation for Test: {test_corr:.5f}")


Pearson Correlation for Train: 0.70122
Pearson Correlation for Test: 0.69444


In [32]:
results = {
    "mse1": mse1,
    "mse2": mse2,
    "cosine_sim_train": cosine_sim_train,
    "cosine_sim_test": cosine_sim_test,
    "train_corr": train_corr,
    "test_corr": test_corr,
}
print("Results:", results)


Results: {'mse1': 1.0213239593606231, 'mse2': 0.9275877638890987, 'cosine_sim_train': [0.8396094022614857, 0.5750711628121535, 0.5882593452211883, 0.839443924863119, 0.3368920240532798, 0.25413667602695994, 0.6260484071947241, 0.4622553223300129, 0.13073176255623903, 0.3964403942544895, 0.8540538107427602, 0.1877001637453363, 0.549050370690251, 0.7484285399092123, 0.6293278142055962, 0.8796537402696147, 0.44119218881909134, 0.8911868600747127, 0.8288968862439886, 0.7898966593262988, 0.8103804558921505, 0.7649435148095886, 0.7157503905535202, 0.6230337217800407, 0.9352401021285544, 0.717099370249957, 0.16632372832959721, 0.7689755667792422, 0.9198587141677297, 0.9610384071846098, 0.9549726938519173, 0.9639287005588304, 0.47029825641170064, 0.6258231348949984, 0.31925838682180946, 0.5259181849798034, 0.07128633878165501, 0.6492398484540185, 0.6454485784771292, 0.48977444255685343, 0.649279711891129, 0.16151107335714754, 0.6521457023357632, 0.3480071319150888, 0.10948844247297101, 0.42000