In [1]:
# Word sense disambiguation (WSD) using clustering (unsupervised learning)
import string

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
sentences = [
    "He sat on the bank of the river.",
    "He deposited cash into the bank.",
    "The river overflowed its bank.",
    "She applied for a loan at the bank.",
    "The pilot made a sharp bank to the left before landing.",
    "He worked hard to bank enough money for retirement.",
    "They had a picinc on the grassy bank under the willow tree.",
    "The data is tored in the memory bank within the processor"
]

target_word = "bank"
window_size = 5     # Number of words before and after the target to be included


def extract_contexts(sentences, target_word, window_size=5):
    """Collect the words surrounding every occurrence of ``target_word``.

    Each sentence is lower-cased and split on whitespace. A token counts as an
    occurrence of the target when, after stripping leading/trailing
    punctuation, it equals the lower-cased target (so "bank.", "bank?" and
    "(bank)" all match "bank").

    Parameters
    ----------
    sentences : iterable of str
        Sentences to scan.
    target_word : str
        Word whose surrounding context is wanted; matched case-insensitively.
    window_size : int, default 5
        Maximum number of tokens kept on each side of the target.

    Returns
    -------
    list of str
        One space-joined context string per occurrence, in order of
        appearance, with the target token itself left out. Context tokens
        keep whatever punctuation they had in the sentence.
    """
    target = target_word.lower()
    found = []
    for sentence in sentences:
        words = sentence.lower().split()
        for idx, word in enumerate(words):
            # Strip any surrounding punctuation, not just "." and ",", so the
            # target is still recognised before "?", "!", quotes, brackets, etc.
            if word.strip(string.punctuation) != target:
                continue
            before = words[max(idx - window_size, 0):idx]   # up to window_size words before
            after = words[idx + 1:idx + 1 + window_size]    # slicing clamps at the sentence end
            # Join the neighbours back into one string, omitting the target itself.
            found.append(" ".join(before + after))
    return found


contexts = extract_contexts(sentences, target_word, window_size)

# Build the TF-IDF representation of the contexts: learn the vocabulary and
# IDF weights first, then encode every context as one row of the sparse matrix X.
vectorizer = TfidfVectorizer()
vectorizer.fit(contexts)
X = vectorizer.transform(contexts)

In [3]:
# Inspection only: view the TF-IDF weights as a labelled table
import pandas as pd

# Densify the sparse matrix X; columns are labelled with the vectorizer's vocabulary terms
feature_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(X.toarray(), columns=feature_names)

# Round once, then print the table and write the same rounded values to CSV
tfidf_rounded = df_tfidf.round(3)
print("\n ==== TF-IDF Matrix ==== ")
print(tfidf_rounded)
tfidf_rounded.to_csv("tfidf_matrix.csv", index=False)


 ==== TF-IDF Matrix ==== 
      at  before   cash  deposited  ...  under  willow  within  worked
0  0.000   0.000  0.000      0.000  ...  0.000   0.000   0.000    0.00
1  0.000   0.000  0.518      0.518  ...  0.000   0.000   0.000    0.00
2  0.000   0.000  0.000      0.000  ...  0.000   0.000   0.000    0.00
3  0.587   0.000  0.000      0.000  ...  0.000   0.000   0.000    0.00
4  0.000   0.365  0.000      0.000  ...  0.000   0.000   0.000    0.00
5  0.000   0.000  0.000      0.000  ...  0.000   0.000   0.000    0.38
6  0.000   0.000  0.000      0.000  ...  0.392   0.392   0.000    0.00
7  0.000   0.000  0.000      0.000  ...  0.000   0.000   0.384    0.00

[8 rows x 37 columns]


In [4]:
# Cluster contexts
# K-means groups the rows of X (contexts) by their similarity in the TF-IDF
# features (columns).
num_clusters = 2
# n_init is pinned explicitly: scikit-learn changed its default from 10 to "auto"
# in version 1.4 (and warns about it in 1.2/1.3), so leaving it unset makes the
# cluster assignments depend on the installed version.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

In [5]:
# Show each context next to the cluster label k-means assigned to it
for context, label in zip(contexts, kmeans.labels_):
    print(f"Context : {context}")
    print(f"Cluster : {label}\n")

Context : he sat on the of the river.
Cluster : 0

Context : he deposited cash into the
Cluster : 1

Context : the river overflowed its
Cluster : 0

Context : for a loan at the
Cluster : 1

Context : the pilot made a sharp to the left before landing.
Cluster : 1

Context : he worked hard to enough money for retirement.
Cluster : 1

Context : a picinc on the grassy under the willow tree.
Cluster : 1

Context : is tored in the memory within the processor
Cluster : 1

