## Unsupervised/ Clustering Illustration for NLP

## Do with Common data - Bank Account Info

In [72]:
filename = "../common-data/Example-TDBank-PersonalAcctAgree.txt"

with open(filename) as f:
    text = f.read()

In [83]:
# Import for tokenization 
from nltk.tokenize import word_tokenize

In [84]:
# Import for removing frequently occuring words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

In [85]:
# We could have cleaned using stop words
text_tokens = word_tokenize(text)

In [88]:
# Clean word tokens
clean_word_tokens = [w for w in text_tokens if not w in stop_words]

### For vector representation

In [28]:
# We will have vector representation before we can do classification
# Do imports
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
# Instantiate the object
vectorizer = TfidfVectorizer(stop_words='english')

In [75]:
# fit training data to the count vectorizer
data_tfidf = vectorizer.fit_transform(clean_word_tokens)

### Now clustering setup

In [91]:
# Import
from sklearn.cluster import KMeans

In [92]:
# Define a function to run and print clusters using vectorizer
##  K-means parameters explain here
#   - https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html 
def run_kmeans(k, data_tfidf_format, vectorizer):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=10)
    # Fit model to data
    model.fit(data_tfidf)
    # Explain the clusters, i.e., their centroids
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(k):
        print("Cluster %d:" % i),
        for word in order_centroids[i, :10]:
            print("\t%s" % terms[word])

In [104]:
k = 5
run_kmeans(k, data_tfidf, vectorizer)

Cluster 0:
	deposit
	bank
	accounts
	td
	funds
	check
	day
	personal
	checking
	agreement
Cluster 1:
	receipt
	dismissed
	disposed
	dispute
	disputes
	distribute
	distribution
	document
	documentation
	documento
Cluster 2:
	account
	sub
	zipper
	dollars
	disposed
	dispute
	disputes
	distribute
	distribution
	document
Cluster 3:
	available
	zipper
	disposable
	dispute
	disputes
	distribute
	distribution
	document
	documentation
	documento
Cluster 4:
	business
	balance
	day
	zipper
	domestic
	disputes
	distribute
	distribution
	document
	documentation


## We can also run for Fake news data

In [94]:
## Uses data on kaggale at:
#  - https://www.kaggle.com/c/fake-news/data

In [95]:
# imports for file loading
import pandas as pd

## Load and Prepare Data

In [96]:
# Load data
data = pd.read_csv('../common-data/Kaggle-fake-news-train.csv', header=0, lineterminator='\n')

nRow, nCol = data.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 20800 rows and 5 columns in the training set.


In [97]:
# Clean of white spaces
data = train.applymap(lambda x: x.strip() if isinstance(x, str) else x)
# Print statistics
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [98]:
# Removing empty rows from csv 
data.dropna(axis=0,inplace=True)
nRow, nCol = data.shape
print(f'INFO: There are {nRow} rows and {nCol} columns in the training set.')

INFO: There are 18285 rows and 5 columns in the training set.


In [99]:
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


### We will run clustering on titles and also all content

In [100]:
# Add a column with content from title, authot and text. 
# We want to  see cluster with title is better that with total content

In [101]:
data['total'] = data['title'] + ' '+ data['author'] + data['text']

In [102]:
# Check
data.head()

Unnamed: 0,id,title,author,text,label,total
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [105]:
# We will check cluster quality with the label column from data dataframe 
# Apriori, this is how labels are distributed
data["label"].value_counts()

0    10361
1     7924
Name: label, dtype: int64

### Cluster using title text only

In [106]:
# fit training data to the count vectorizer
data_tfidf = vectorizer.fit_transform(data['title'].values)

In [107]:
k = 5
run_kmeans(k, data_tfidf, vectorizer)

Cluster 0:
	election
	war
	world
	2016
	comment
	video
	russia
	america
	obama
	state
Cluster 1:
	hillary
	clinton
	fbi
	email
	emails
	campaign
	new
	wikileaks
	trump
	investigation
Cluster 2:
	new
	york
	times
	trump
	donald
	briefing
	says
	evening
	obama
	dies
Cluster 3:
	breitbart
	trump
	house
	news
	milo
	white
	donald
	obamacare
	report
	president
Cluster 4:
	trump
	donald
	president
	video
	anti
	win
	election
	victory
	obama
	just


### Cluster using all contents 

In [108]:
# fit training data to the count vectorizer
data_tfidf = vectorizer.fit_transform(data['total'].values)

In [109]:
k = 5
run_kmeans(k, data_tfidf, vectorizer)

Cluster 0:
	clinton
	hillary
	fbi
	comey
	emails
	investigation
	campaign
	email
	trump
	election
Cluster 1:
	mr
	trump
	said
	mrs
	clinton
	president
	obama
	campaign
	republican
	new
Cluster 2:
	said
	mr
	new
	people
	police
	like
	ms
	just
	time
	year
Cluster 3:
	russia
	syria
	_____
	russian
	war
	military
	syrian
	said
	mr
	aleppo
Cluster 4:
	trump
	president
	donald
	said
	election
	clinton
	obama
	hillary
	people
	breitbart


## Discussion: how to evaluate clusters?

## Discussion: using labeled data (if available)

In [None]:
# Acknowledgement: https://towardsdatascience.com/applying-machine-learning-to-classify-an-unsupervised-text-document-e7bb6265f52