In [76]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import numpy as np
import joblib  # For saving models
from spacy.compat import pickle




In [77]:
data_path = "/Users/aswathshakthi/PycharmProjects/MLOps/3.Semantic-Textual-Similarity-NLP/data/clinic_c.csv"
df = pd.read_csv(data_path)
df.head()


Unnamed: 0,Sent1,Sent2,Score
0,Insulin NPH Human [NOVOLIN N] 100 unit/mL suspension subcutaneous as directed by prescriber.,Insulin NPH Human [NOVOLIN N] 100 unit/mL suspension 63-76 units subcutaneous as directed by prescriber.,3.5
1,"Patient arrives ambulatory, Gait steady, History obtained from patient, Patient appears comfortable, Patient cooperative, alert, Skin warm.","Complex assessment performed, Patient arrives ambulatory, Gait steady, History obtained from patient, Patient appears, anxious, in distress due to pain, Patient cooperative, alert, Oriented to person, place and time.",2.5
2,"Peripheral IV site, established in the right forearm, using an 18 gauge catheter, in one attempt.","Peripheral IV site, present prior to arrival, established in the right hand, using a 20 gauge catheter.",3.45
3,"No: new confusion or inability to stay alert and awake; currently struggling to breathe, even while inactive or resting; currently feeling like you are going to collapse every time you stand (sit); vomit that looks like ground coffee; vomiting blood; uncontrollable or continuous rectal bleeding; black, sticky, tar-like stools; heavy vaginal bleeding or purple or red rash/blotches that stay when pressed by a glass (purpural rash)","No: new confusion or inability to stay alert and awake; any chest pain or discomfort; sudden, CURRENTand excruciating ripping or stabbing pain characteristic of a dissecting aortic aneurysm; newly painful or blue toes on one side; currently struggling to breathe, even while inactive or resting; currently feeling like you are going to collapse every time you stand (sit); vomit that looks like ground coffee; vomiting blood; uncontrollable or continuous rectal bleeding; black, sticky, tar-like stools; heavy vaginal bleeding or purple or red rash/blotches that stay when pressed by a glass (purpural rash)",4.0
4,Spent 15 minutes with the patient and greater than 50% of this time was spent counseling the patient regarding diagnosis and available treatment options.,"Nurse visit ten minutes, over half of which was spent in counseling and point-of-care testing.",3.0


In [78]:
# Split into train/test sets
train_num = int(df.shape[0]*0.9)

train = df[["Sent1","Sent2"]][:train_num]
train_labels = df["Score"][:train_num]

dev= df[["Sent1","Sent2"]][train_num:]
dev_labels = df["Score"][train_num:]


In [79]:
# Combine Sent1 and Sent2 for unified vectorization
vectorizer = TfidfVectorizer()
vectorizer.fit(train["Sent1"].tolist()+train["Sent2"].tolist())




In [80]:
# Transform train and test sets
train_vecs1 = vectorizer.transform(train.Sent1.tolist())
train_vecs2 = vectorizer.transform(train.Sent2.tolist())

dev_vecs1 = vectorizer.transform(dev.Sent1.tolist())
dev_vecs2 = vectorizer.transform(dev.Sent2.tolist())


# Store feature names for analysis
features = vectorizer.get_feature_names_out()
print(f"TF-IDF Vocabulary Size: {len(features)}")

TF-IDF Vocabulary Size: 2651


In [81]:

train_features = np.hstack([train_vecs1.toarray(), train_vecs2.toarray()])
test_features = np.hstack([dev_vecs1.toarray(), dev_vecs2.toarray()])


def calculate_cosine_similarity(vecs1, vecs2):
    return np.array([cosine_similarity(vecs1[i], vecs2[i])[0][0] for i in range(vecs1.shape[0])])

train_cosine_sim = calculate_cosine_similarity(train_vecs1, train_vecs2)
dev_cosine_sim = calculate_cosine_similarity(dev_vecs1, dev_vecs2)

# Append cosine similarity as a feature
train_features = np.hstack([train_vecs1.toarray(), train_vecs2.toarray(), train_cosine_sim.reshape(-1, 1)])
dev_features = np.hstack([dev_vecs1.toarray(), dev_vecs2.toarray(), dev_cosine_sim.reshape(-1, 1)])


In [82]:

import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    rf = RandomForestRegressor(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 5, 50),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    )
    scores = cross_val_score(rf, train_features, train_labels, cv=5, scoring='neg_mean_squared_error')
    return -scores.mean()

study = optuna.create_study(direction='minimize',study_name="Similarity", storage="sqlite:///db.sqlite3")
study.optimize(objective, n_trials=50,n_jobs=-1)

print("Best Parameters:", study.best_params)
best_model = RandomForestRegressor(**study.best_params)

best_model.fit(train_features, train_labels)


# Save the trained models for later inference
joblib.dump(best_model, '/Users/aswathshakthi/PycharmProjects/MLOps/3.Semantic-Textual-Similarity-NLP/models/tfidf_rf_model.pkl')


print("Models saved successfully!")


[I 2024-12-19 14:55:07,369] A new study created in RDB with name: Similarity
[I 2024-12-19 14:55:13,041] Trial 2 finished with value: 1.2921191504365437 and parameters: {'n_estimators': 149, 'max_depth': 10, 'min_samples_split': 11, 'min_samples_leaf': 3, 'max_features': 'log2'}. Best is trial 2 with value: 1.2921191504365437.
[I 2024-12-19 14:55:17,540] Trial 7 finished with value: 1.1653630957541776 and parameters: {'n_estimators': 232, 'max_depth': 49, 'min_samples_split': 8, 'min_samples_leaf': 2, 'max_features': 'log2'}. Best is trial 7 with value: 1.1653630957541776.
[I 2024-12-19 14:55:17,974] Trial 3 finished with value: 1.316150045485443 and parameters: {'n_estimators': 305, 'max_depth': 31, 'min_samples_split': 19, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 7 with value: 1.1653630957541776.
[I 2024-12-19 14:55:18,890] Trial 0 finished with value: 0.9892361795474635 and parameters: {'n_estimators': 217, 'max_depth': 15, 'min_samples_split': 7, 'min_samples_l

Best Parameters: {'n_estimators': 533, 'max_depth': 43, 'min_samples_split': 14, 'min_samples_leaf': 6, 'max_features': None}
Models saved successfully!


In [83]:
# Predict using Random Forest models
model = joblib.load("/Users/aswathshakthi/PycharmProjects/MLOps/3.Semantic-Textual-Similarity-NLP/models/tfidf_rf_model.pkl")
dev_pred = model.predict(dev_features)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(dev_labels, dev_pred)

mse

0.4330451743295736

In [84]:
# Correlation for Train Data
train_corr, _ = pearsonr(train_cosine_sim, train_labels)
print(f"Pearson Correlation for Train: {train_corr:.5f}")

# Correlation for Test Data
test_corr, _ = pearsonr(dev_cosine_sim, dev_labels)
print(f"Pearson Correlation for Test: {test_corr:.5f}")


Pearson Correlation for Train: 0.69718
Pearson Correlation for Test: 0.72148


In [85]:
results = {
    "mse1": mse1,

    "train_corr": train_corr,
    "test_corr": test_corr,
}
print("Results:", results)


Results: {'mse1': 0.4340256695387805, 'train_corr': 0.6971838627918355, 'test_corr': 0.72147900961106}


In [86]:
print(vectorizer.vocabulary_)


{'insulin': 1251, 'nph': 1586, 'human': 1147, 'novolin': 1584, '100': 8, 'unit': 2484, 'ml': 1484, 'suspension': 2290, 'subcutaneous': 2260, 'as': 222, 'directed': 711, 'by': 376, 'prescriber': 1806, 'patient': 1712, 'arrives': 216, 'ambulatory': 152, 'gait': 1019, 'steady': 2220, 'history': 1127, 'obtained': 1605, 'from': 1006, 'appears': 194, 'comfortable': 488, 'cooperative': 575, 'alert': 133, 'skin': 2135, 'warm': 2572, 'peripheral': 1732, 'iv': 1299, 'site': 2127, 'established': 850, 'in': 1187, 'the': 2353, 'right': 2019, 'forearm': 988, 'using': 2515, 'an': 157, '18': 27, 'gauge': 1022, 'catheter': 414, 'one': 1626, 'attempt': 245, 'no': 1564, 'new': 1558, 'confusion': 532, 'or': 1643, 'inability': 1188, 'to': 2392, 'stay': 2219, 'and': 160, 'awake': 263, 'currently': 617, 'struggling': 2254, 'breathe': 355, 'even': 853, 'while': 2601, 'inactive': 1190, 'resting': 2000, 'feeling': 929, 'like': 1362, 'you': 2645, 'are': 207, 'going': 1054, 'collapse': 479, 'every': 857, 'time': 