In [1]:
import numpy as np
import pandas as pd
import nltk
import re, os, string
from functools import reduce
from math import log

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Settings
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (16, 4)

pd.options.display.max_columns = 500

**TF-IDF** is useful for clustering tasks, like a document clustering or in other words, tf-idf can help you understand what kind of document you got now. **Term Frequency-Inverse Document Frequency (TF-IDF)** is a numerical statistic that demonstrates how important a word is to a corpus.

**Term Frequency (TF)** is just ratio number of current word to the number of all words in document/string/etc.
<img src="http://drive.google.com/uc?export=view&id=1O_feq-LV4YxxPFGIZCGU-wBy1HaeouUY" />
Frequency of term t_i, where n_t — the number of t_i in current document/string, the sum of n_k is the number of all terms in current document/string.

**Inverse Document Frequency (IDF)** is a log of the ratio of the number of all documents/string in the corpus to the number of documents with term t_i.
<img src="http://drive.google.com/uc?export=view&id=1vweCJa83sy3CkfmOmz71uKO4B-obJLMz" />

tf-idf(t, d, D) is the product tf(t, d) to idf(t, D).
<img src="http://drive.google.com/uc?export=view&id=1wAjC0oJNkNAylEsyAZNmpopLxr8lmRHJ" />
If you want more theoretic information about TF-IDF I want advice you read publication on Wikipedia about it or read NLP Stanford post.
https://en.wikipedia.org/wiki/Tf%E2%80%93idf https://nlp.stanford.edu/IR-book/html/htmledition/tf-idf-weighting-1.html

Let's start with a very simple example of 3 sentences

In [3]:
corpus = """
Simple example with Cats and Mouse
Another simple example with dogs and cats
Another simple example with mouse and cheese
""".split("\n")[1:-1]

corpus

['Simple example with Cats and Mouse',
 'Another simple example with dogs and cats',
 'Another simple example with mouse and cheese']

After that lets make bags of words for our corpus and for every string too. But before we have to clean the data.

In [4]:
l_A = corpus[0].lower().split()
l_B = corpus[1].lower().split()
l_C = corpus[2].lower().split()
print('Line A: ', l_A)
print('Line B: ', l_B)
print('Line C: ', l_C)

word_set = set(l_A).union(set(l_B)).union(set(l_C))
print('Word Set: ', word_set)

word_dict_A = dict.fromkeys(word_set, 0)
word_dict_B = dict.fromkeys(word_set, 0)
word_dict_C = dict.fromkeys(word_set, 0)

for word in l_A:
    word_dict_A[word] += 1

for word in l_B:
    word_dict_B[word] += 1

for word in l_C:
    word_dict_C[word] += 1

pd.DataFrame([word_dict_A, word_dict_B, word_dict_C])

Line A:  ['simple', 'example', 'with', 'cats', 'and', 'mouse']
Line B:  ['another', 'simple', 'example', 'with', 'dogs', 'and', 'cats']
Line C:  ['another', 'simple', 'example', 'with', 'mouse', 'and', 'cheese']
Word Set:  {'simple', 'another', 'cheese', 'example', 'cats', 'and', 'with', 'mouse', 'dogs'}


Unnamed: 0,and,another,cats,cheese,dogs,example,mouse,simple,with
0,1,0,1,0,0,1,1,1,1
1,1,1,1,0,1,1,0,1,1
2,1,1,0,1,0,1,1,1,1


In the case of the term frequency, the simplest choice is to use the raw count of a term in a string. For calculating tf for all terms, we must fill a dictionary as follows.

In [5]:
def compute_tf(word_dict, l):
    tf = {}
    sum_nk = len(l)
    for word, count in word_dict.items():
        tf[word] = count/sum_nk
    return tf

tf_A = compute_tf(word_dict_A, l_A)
tf_B = compute_tf(word_dict_B, l_B)
tf_C = compute_tf(word_dict_C, l_C)
pd.DataFrame([tf_A, tf_B, tf_C])

Unnamed: 0,and,another,cats,cheese,dogs,example,mouse,simple,with
0,0.166667,0.0,0.166667,0.0,0.0,0.166667,0.166667,0.166667,0.166667
1,0.142857,0.142857,0.142857,0.0,0.142857,0.142857,0.0,0.142857,0.142857
2,0.142857,0.142857,0.0,0.142857,0.0,0.142857,0.142857,0.142857,0.142857


idf is a measure of how much information the token or word in our case, provides. For calculating idf we need fill dict too.

In [6]:
def compute_idf(strings_list):
    n = len(strings_list)
    idf = dict.fromkeys(strings_list[0].keys(), 0)
    for l in strings_list:
        for word, count in l.items():
            if count > 0:
                idf[word] += 1
    
    for word, v in idf.items():
        idf[word] = log(n / float(v))
    return idf

idf = compute_idf([word_dict_A, word_dict_B, word_dict_C])
pd.DataFrame.from_dict(idf, orient='index', columns=['idf'])

Unnamed: 0,idf
simple,0.0
another,0.405465
cheese,1.098612
example,0.0
cats,0.405465
and,0.0
with,0.0
mouse,0.405465
dogs,1.098612


Now, tf-idf is the product of tf to idf. For our python example, tf-idf is dictionary with the corresponding products.

In [7]:
def compute_tf_idf(tf, idf):
    tf_idf = dict.fromkeys(tf.keys(), 0)
    for word, v in tf.items():
        tf_idf[word] = v * idf[word]
    return tf_idf

tf_idf_A = compute_tf_idf(tf_A, idf)
tf_idf_B = compute_tf_idf(tf_B, idf)
tf_idf_C = compute_tf_idf(tf_C, idf)

print('TF-IDF bag of words:')
pd.DataFrame([tf_idf_A, tf_idf_B, tf_idf_C])

TF-IDF bag of words:


Unnamed: 0,and,another,cats,cheese,dogs,example,mouse,simple,with
0,0.0,0.0,0.067578,0.0,0.0,0.0,0.067578,0.0,0.0
1,0.0,0.057924,0.057924,0.0,0.156945,0.0,0.0,0.0,0.0
2,0.0,0.057924,0.0,0.156945,0.0,0.0,0.057924,0.0,0.0


Below you can clearly see the difference between the original bag of words and the new bag of words with tf-idf weights. For example ‘dogs’, ‘cats’ and ‘mouse’ is important words, but word ‘and’ is not important, because this word is in all the strings and we can’t understand what is a string by the word ‘and’.

In [8]:
print('Original bag of words:')
pd.DataFrame([word_dict_A, word_dict_B, word_dict_C])

Original bag of words:


Unnamed: 0,and,another,cats,cheese,dogs,example,mouse,simple,with
0,1,0,1,0,0,1,1,1,1
1,1,1,1,0,1,1,0,1,1
2,1,1,0,1,0,1,1,1,1


Now, that we understand how TF-IDF works, the time has come for real example of clustering with TF-IDF weights. For real life, we can use scikit-learn implementation of TF-IDF and KMeans and suggest to use implementations from scikit-learn or from another popular libraries or frameworks because it’s reducing a number of potential errors in your code.

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [10]:
all_text = """
Google and Facebook are strangling the free press to death. Democracy is the loser
Your 60-second guide to security stuff Google touted today at Next '18
A Guide to Using Android Without Selling Your Soul to Google
Review: Lenovo’s Google Smart Display is pretty and intelligent
Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is
Android is better than IOS
In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency
is a numerical statistic that is intended to reflect
how important a word is to a document in a collection or corpus.
It is often used as a weighting factor in searches of information retrieval
text mining, and user modeling. The tf-idf value increases proportionally
to the number of times a word appears in the document
and is offset by the frequency of the word in the corpus
""".split("\n")[1:-1]

all_text

['Google and Facebook are strangling the free press to death. Democracy is the loser',
 "Your 60-second guide to security stuff Google touted today at Next '18",
 'A Guide to Using Android Without Selling Your Soul to Google',
 'Review: Lenovo’s Google Smart Display is pretty and intelligent',
 'Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is',
 'Android is better than IOS',
 'In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency',
 'is a numerical statistic that is intended to reflect',
 'how important a word is to a document in a collection or corpus.',
 'It is often used as a weighting factor in searches of information retrieval',
 'text mining, and user modeling. The tf-idf value increases proportionally',
 'to the number of times a word appears in the document',
 'and is offset by the frequency of the word in the corpus']

In [11]:
def preprocessing(line):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    return line

In [12]:
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)
tfidf = tfidf_vectorizer.fit_transform(all_text)

feat_names = [string for string in tfidf_vectorizer.get_feature_names()]
tfidf_dataframe = pd.DataFrame(tfidf.todense(), columns=feat_names)
tfidf_dataframe.head()

Unnamed: 0,18,60,and,android,appears,are,as,at,better,by,coast,collection,corpus,death,democracy,display,document,facebook,factor,for,free,frequency,google,greece,guide,how,idf,important,in,increases,information,intelligent,intended,inverse,ios,is,it,knows,lenovo,loser,maps,mining,modeling,mysterious,next,no,number,numerical,object,of,off,offset,often,one,or,press,pretty,proportionally,reflect,retrieval,review,searches,second,security,selling,short,smart,soul,spots,statistic,strangling,stuff,submerged,term,text,tf,tfidf,than,that,the,times,to,today,touted,used,user,using,value,weighting,what,without,word,your
0,0.0,0.0,0.189681,0.0,0.0,0.302486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.302486,0.302486,0.0,0.0,0.302486,0.0,0.0,0.302486,0.0,0.189681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.148048,0.0,0.0,0.0,0.302486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.302486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.302486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.379362,0.0,0.173853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.298662,0.298662,0.0,0.0,0.0,0.0,0.0,0.298662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187283,0.0,0.257555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.298662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.298662,0.298662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.298662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.171654,0.298662,0.298662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257555
2,0.0,0.0,0.0,0.305934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.222462,0.0,0.305934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354763,0.0,0.0,0.354763,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.407796,0.0,0.0,0.0,0.0,0.354763,0.0,0.0,0.0,0.354763,0.0,0.305934
3,0.0,0.0,0.236572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377265,0.0,0.0,0.0,0.0,0.0,0.0,0.236572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377265,0.0,0.0,0.0,0.184647,0.0,0.0,0.377265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.377265,0.0,0.0,0.0,0.377265,0.0,0.0,0.0,0.0,0.0,0.377265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.15989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15989,0.254979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.124796,0.219885,0.254979,0.0,0.0,0.254979,0.0,0.0,0.254979,0.0,0.254979,0.0,0.0,0.254979,0.175671,0.254979,0.0,0.0,0.254979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254979,0.0,0.0,0.0,0.254979,0.0,0.0,0.0,0.0,0.0,0.0,0.15989,0.0,0.0,0.0,0.0,0.0,0.219885,0.0,0.0,0.0,0.254979,0.0,0.0,0.0


In [13]:
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters).fit(tfidf)
clusters = kmeans.labels_.tolist()
clusters

[1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0]

In [14]:
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

print("Top terms per cluster:")
print()
for i in range(num_clusters):
    print("Cluster %d words: " % i, end='')
    cluster_words = []
    
    for ind in order_centroids[i, :20]:
        cluster_words.append(feat_names[ind])
    
    print(cluster_words)
    print()

Top terms per cluster:

Cluster 0 words: ['the', 'in', 'word', 'document', 'frequency', 'of', 'corpus', 'or', 'idf', 'tf', 'information', 'retrieval', 'is', 'to', 'and', 'collection', 'important', 'how', 'times', 'appears']

Cluster 1 words: ['is', 'google', 'to', 'android', 'and', 'guide', 'your', 'the', 'better', 'than', 'ios', 'reflect', 'numerical', 'that', 'intended', 'statistic', 'lenovo', 'review', 'smart', 'intelligent']



In [15]:
frame = pd.DataFrame(all_text, index = [clusters], columns = ['text'])
frame

Unnamed: 0,text
1,Google and Facebook are strangling the free pr...
1,Your 60-second guide to security stuff Google ...
1,A Guide to Using Android Without Selling Your ...
1,Review: Lenovo’s Google Smart Display is prett...
1,Google Maps user spots mysterious object subme...
1,Android is better than IOS
0,"In information retrieval, tf–idf or TFIDF, sho..."
1,is a numerical statistic that is intended to r...
0,how important a word is to a document in a col...
0,It is often used as a weighting factor in sear...


In [16]:
lines_for_predicting = ["tf and idf is awesome!", "some androids is there"]
kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))

array([0, 1])

### Text Classification & Text Clustering
Text classification is a problem where we have fixed set of classes/categories and any given text is assigned to one of these categories. In contrast, Text clustering is the task of grouping a set of unlabeled texts in such a way that texts in the same group (called a cluster) are more similar to each other than to those in other clusters.

Let's implement the above two tasks using well-known machine algorithms: K-NN and K-Means respectively. Based on the implementation let's see whether both algorithms co-relate to each other or represent the same information.

For training K-NN and K-Means models, we will be using 30 sentences which were collected from 3 categories, namely Cricket, Artificial Intelligence and Chemistry. These 30 sentences are stored in a text file which we will use.

In [17]:
# Importing libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import numpy as np
from collections import Counter

In [18]:
train_file = "train_sentences.csv"
train_lines_df = pd.read_csv(train_file)
train_lines_df.head()

Unnamed: 0,Sentence,Label
0,Cricket is a bat and ball game played between...,Cricket
1,Each phase of play is called an innings durin...,Cricket
2,Artificial intelligence is intelligence exhib...,Artifical Intillengence
3,the field of AI research defines itself as th...,Artifical Intillengence
4,A compound is a pure chemical substance compo...,Chemistry


In [19]:
train_lines_df['Label'].value_counts()

Cricket                    10
Chemistry                  10
Artifical Intillengence    10
Name: Label, dtype: int64

In [20]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

# Cleaning the text sentences so that punctuation marks, stop words & digits are removed  
def clean(input_line):
    stop_free = " ".join([i for i in input_line.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    processed = re.sub(r"\d+","",normalized)
    final_line = processed.split()
    return final_line

In [21]:
train_clean_sentences = []
for line in train_lines_df['Sentence'].values:
    line = line.strip()
    cleaned = clean(line)
    cleaned = ' '.join(cleaned)
    train_clean_sentences.append(cleaned)

train_lines_df['Cleaned_Sentence'] = train_clean_sentences
train_lines_df.head()

Unnamed: 0,Sentence,Label,Cleaned_Sentence
0,Cricket is a bat and ball game played between...,Cricket,cricket bat ball game played two team eleven p...
1,Each phase of play is called an innings durin...,Cricket,phase play called inning one team bat attempti...
2,Artificial intelligence is intelligence exhib...,Artifical Intillengence,artificial intelligence intelligence exhibited...
3,the field of AI research defines itself as th...,Artifical Intillengence,field ai research defines study intelligent ag...
4,A compound is a pure chemical substance compo...,Chemistry,compound pure chemical substance composed one ...


In [22]:
target_label = train_lines_df['Label']

# Preparing target labels
label_factor = pd.factorize(target_label)
target_label_class = label_factor[0]
label_definitions = label_factor[1]

label_range = len(label_definitions)
reversefactor = dict(zip(range(label_range), label_definitions))

print(label_definitions)
print(np.unique(target_label_class))
print('length of salary slabs: ', label_range)

Index(['Cricket', 'Artifical Intillengence', 'Chemistry'], dtype='object')
[0 1 2]
length of salary slabs:  3


In [23]:
# Preparing data for model
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(train_lines_df['Cleaned_Sentence'])
X_feat_names = [string for string in vectorizer.get_feature_names()]
X_train = pd.DataFrame(vectorizer.transform(train_lines_df['Cleaned_Sentence']).todense(), columns=X_feat_names)

y = target_label_class

In [24]:
# Clustering the document with KNN classifier
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [25]:
# Clustering the training 30 sentences with K-means technique
num_clusters = 3
model_kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=200, n_init=100)
model_kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=200,
    n_clusters=3, n_init=100, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [26]:
order_centroids = model_kmeans.cluster_centers_.argsort()[:, ::-1]

print("Top terms per cluster:")
print()
for i in range(num_clusters):
    print("Cluster %d words: " % i, end='')
    cluster_words = []
    
    for ind in order_centroids[i, :10]:
        cluster_words.append(X_feat_names[ind])
    
    print(cluster_words)
    print()

Top terms per cluster:

Cluster 0 words: ['team', 'run', 'ball', 'inning', 'bat', 'score', 'batsman', 'cricket', 'opponent', 'main']

Cluster 1 words: ['property', 'chemical', 'element', 'atom', 'substance', 'chemistry', 'electron', 'called', 'science', 'compound']

Cluster 2 words: ['intelligence', 'machine', 'ai', 'intelligent', 'success', 'human', 'computer', 'learning', 'research', 'artificial']



From the words related to cluster we can infer that, **Cluster 0 = Criket, Cluster 1 = Chemistry & Cluster 2 = Artificial Intelligence.** However, the labelling notations might change whenver we train the models but clusters will remain same.

In [30]:
# Update these based on the cluster data and notations
cluster_label_map = {0: 'Cricket', 1: 'Chemistry', 2: 'Artificial Intelligence'}

In [27]:
test_file = "test_sentences.csv"
test_lines_df = pd.read_csv(test_file)
test_lines_df

Unnamed: 0,Sentence
0,Chemical compunds are used for preparing bombs...
1,Cricket is a boring game where the batsman onl...
2,Machine learning is an area of Artificial inte...


In [28]:
test_clean_sentences = []
for line in test_lines_df['Sentence'].values:
    line = line.strip()
    cleaned = clean(line)
    cleaned = ' '.join(cleaned)
    test_clean_sentences.append(cleaned)

test_lines_df['Cleaned_Sentence'] = test_clean_sentences
test_lines_df.head()

Unnamed: 0,Sentence,Cleaned_Sentence
0,Chemical compunds are used for preparing bombs...,chemical compunds used preparing bomb based re...
1,Cricket is a boring game where the batsman onl...,cricket boring game batsman enjoys game
2,Machine learning is an area of Artificial inte...,machine learning area artificial intelligence


In [31]:
X_test = pd.DataFrame(vectorizer.transform(test_lines_df['Cleaned_Sentence']).todense(), columns=X_feat_names)

pred_knn = model_knn.predict(X_test)
pred_kmeans = model_kmeans.predict(X_test)

In [33]:
pred_knn_lbl = np.vectorize(reversefactor.get)(pred_knn)

test_lines_df['pred_knn'] = pred_knn_lbl
test_lines_df['pred_kmeans'] = pred_kmeans
test_lines_df['pred_kmeans'] = test_lines_df['pred_kmeans'].map(cluster_label_map)

test_lines_df

Unnamed: 0,Sentence,Cleaned_Sentence,pred_knn,pred_kmeans
0,Chemical compunds are used for preparing bombs...,chemical compunds used preparing bomb based re...,Chemistry,Chemistry
1,Cricket is a boring game where the batsman onl...,cricket boring game batsman enjoys game,Cricket,Cricket
2,Machine learning is an area of Artificial inte...,machine learning area artificial intelligence,Artifical Intillengence,Artificial Intelligence
