## Unsupervised Learning / Clustering Illustration for NLP

In [1]:
filename = "data/doc1.txt"

# Read the whole document into memory. The encoding is passed explicitly so
# decoding does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(filename, encoding="utf-8") as f:
    text = f.read()

In [2]:
# Import for tokenization 
from nltk.tokenize import word_tokenize

In [3]:
# Import for removing frequently occurring words (stop words).
# The list is stored as a set because it is only used for membership tests.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

In [4]:
# Split the raw text into tokens; stop words are not removed at this point
text_tokens = word_tokenize(text)

In [5]:
# Drop stop words. NLTK's English stop-word list is lowercase, so each token is
# lowercased for the comparison only; tokens that are kept retain their casing.
clean_word_tokens = [w for w in text_tokens if w.lower() not in stop_words]

### For vector representation

In [6]:
# We need a vector representation of the text before we can do clustering
# Do imports
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Instantiate the TF-IDF vectorizer; stop_words='english' asks it to apply
# its own English stop-word filtering as well
vectorizer = TfidfVectorizer(stop_words='english')

In [8]:
# Fit the TF-IDF vectorizer on the cleaned tokens and transform them in one step.
# NOTE(review): fit_transform treats every item of the input as one document,
# so each token becomes its own row here — confirm this is intended.
data_tfidf = vectorizer.fit_transform(clean_word_tokens)

### Now clustering setup

In [9]:
# Import
from sklearn.cluster import KMeans

In [10]:
# Define a function to run and print clusters using vectorizer
##  K-means parameters are explained here:
#   - https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html 
def run_kmeans(k, data_tfidf_format, vectorizer, random_state=None, top_n=10):
    """Cluster TF-IDF rows with K-means and print each cluster's top terms.

    Args:
        k: Number of clusters to form.
        data_tfidf_format: TF-IDF matrix to cluster, e.g. the output of
            ``vectorizer.fit_transform``.
        vectorizer: The fitted vectorizer that produced the matrix; used to
            map centroid column indices back to vocabulary terms.
        random_state: Optional seed forwarded to ``KMeans``. The default
            (``None``) leaves the run unseeded; pass an int for repeatable
            clusters.
        top_n: Number of highest-weighted terms printed per cluster.

    Returns:
        None. The cluster summary is printed to stdout.
    """
    model = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=100,
        n_init=10,
        random_state=random_state,
    )
    # Fit on the matrix that was passed in, not on a notebook-level global.
    model.fit(data_tfidf_format)

    # For every centroid: column indices sorted from highest to lowest weight.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    for i in range(k):
        print("Cluster %d:" % i)
        for word in order_centroids[i, :top_n]:
            print("\t%s" % terms[word])

In [11]:
# Number of clusters to form
k = 5
# Run K-means and print the top terms of each cluster
run_kmeans(k, data_tfidf, vectorizer)

Cluster 0:
	test
	tests
	models
	checklist
	mft
	inv
	2019
	capabilities
	al
	users
Cluster 1:
	et
	zimmermann
	environment
	encourages
	encouraging
	end
	engine
	engineering
	england
	enjoyed
Cluster 2:
	capability
	zimmermann
	equal
	encouraging
	end
	engine
	engineering
	england
	enjoyed
	ensures
Cluster 3:
	bert
	base
	large
	zimmermann
	engine
	engineering
	england
	enjoyed
	ensures
	entities
Cluster 4:
	model
	agnostic
	encouraging
	end
	engine
	engineering
	england
	enjoyed
	ensures
	entities


In [12]:
# Doc1 is the text of paper -
# "Beyond Accuracy: Behavioral Testing of NLP Models with CheckList", ACL 2020
#  https://aclanthology.org/2020.acl-main.442/

In [13]:
# Now try  doc2

In [14]:
filename = "data/doc2.txt"

# Read the whole document into memory. The encoding is passed explicitly so
# decoding does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(filename, encoding="utf-8") as f:
    text = f.read()

In [15]:
# Split the raw text into tokens; stop words are not removed at this point
text_tokens = word_tokenize(text)

In [16]:
# Drop stop words. NLTK's English stop-word list is lowercase, so each token is
# lowercased for the comparison only; tokens that are kept retain their casing.
clean_word_tokens = [w for w in text_tokens if w.lower() not in stop_words]

In [17]:
# Instantiate a fresh TF-IDF vectorizer; stop_words='english' asks it to apply
# its own English stop-word filtering as well
vectorizer = TfidfVectorizer(stop_words='english')

In [18]:
# Fit the TF-IDF vectorizer on the cleaned tokens and transform them in one step.
# NOTE(review): fit_transform treats every item of the input as one document,
# so each token becomes its own row here — confirm this is intended.
data_tfidf = vectorizer.fit_transform(clean_word_tokens)

In [19]:
# Number of clusters to form
k = 5
# Run K-means and print the top terms of each cluster
run_kmeans(k, data_tfidf, vectorizer)

Cluster 0:
	drinking
	water
	hazards
	borne
	derived
	quality
	disadvantage
	disasters
	discharges
	discharged
Cluster 1:
	health
	guideline
	value
	treatment
	quality
	concentrations
	used
	based
	exposure
	μg
Cluster 2:
	potential
	μm
	disaster
	discharging
	discharges
	discharged
	discharge
	discernible
	discarding
	discarded
Cluster 3:
	water
	based
	using
	μm
	discharges
	discharged
	discharge
	discernible
	discarding
	discarded
Cluster 4:
	occurrence
	μm
	disasters
	disclaimer
	discharging
	discharges
	discharged
	discharge
	discernible
	discarding


In [20]:
## Doc2 has the text of: 
#  - https://github.com/biplav-s/course-nl-f22/blob/main/sample-code/common-data/WHO-Water/WHO-Drinking-4ed-9789240045064-eng.pdf

## Discussion: how to evaluate clusters?

## Discussion: using labeled data (if available)

## Now try clustering notebook on fake news

In [21]:
# URL: https://github.com/biplav-s/course-nl/blob/master/l9-ml-review/Clustering%20-%20Fake%20news%20Illustration.ipynb