## Topic extraction

1. Write several texts
2. Extract non-rubbish words
3. Apply NMF to get some components
4. Do reconstructions of the texts => how well does this return the original texts?
4. Try clustering on the words => can I extract topics?

In [1]:
# write texts

text_1 = "The sun is shining outside, it's a nice day."
text_2 = "There may be showers outside, in which case the day would not be nice."
text_3 = "Sun, showers and wind are all very common in the UK on any particular day."

text_4 = "Machine learning is an interesting topic to study."
text_5 = "Studying is a good thing to do, whether it's for machine learning or some other topic."
text_6 = "Good students can master machine learning quickly and move on to other exciting topics."

# a list of text pieces
texts = [text_1, text_2, text_3, text_4, text_5, text_6]

# one large piece of text
# all_text = ""
# for text in texts:
#     all_text += text

In [2]:
# extract non-rubbish words
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english')
vect.fit(texts)

print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content: {}".format(vect.vocabulary_))

Vocabulary size: 24
Vocabulary content: {'sun': 18, 'shining': 13, 'outside': 10, 'nice': 9, 'day': 2, 'showers': 14, 'case': 0, 'wind': 23, 'common': 1, 'uk': 22, 'particular': 11, 'machine': 7, 'learning': 6, 'interesting': 5, 'topic': 20, 'study': 16, 'studying': 17, 'good': 4, 'thing': 19, 'students': 15, 'master': 8, 'quickly': 12, 'exciting': 3, 'topics': 21}


In [3]:
# stemming
# !conda install nltk spacy

In [4]:
# stemming
# !conda install spacy

In [5]:
# !python -m spacy download en

In [9]:
# one-hot-encode non-rubbish words

wordbags = vect.transform(texts)
print("Wordbags: {}".format(repr(wordbags)))
print("Dense representation of wordbags:\n{}".format(wordbags.toarray()))

# could normalise, so sum of squares (length of vector) per text is constant

Wordbags: <6x24 sparse matrix of type '<class 'numpy.int64'>'
	with 36 stored elements in Compressed Sparse Row format>
Dense representation of wordbags:
[[0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1]
 [0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0]
 [0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0]]


In [20]:
# apply NMF to get some components - eg 2 components

from sklearn.decomposition import NMF
nmf = NMF(n_components=2, random_state=0)
# fit NMF model to our joint large piece of text
nmf.fit(wordbags)
# show the obtained components
print("nmf.components_\n", nmf.components_, "\n\n")

# transform texts
wordbags_nmf = nmf.transform(wordbags)
# show transformed data
print("wordbags_nmf\n", wordbags_nmf)

nmf.components_
 [[0.         0.         0.         0.36664321 0.67797954 0.24265309
  0.92063263 0.92063263 0.36664321 0.         0.         0.
  0.36664321 0.         0.         0.36664321 0.24265309 0.31133632
  0.         0.31133632 0.55398942 0.36664321 0.         0.        ]
 [0.30189078 0.35808492 0.96186648 0.         0.         0.
  0.         0.         0.         0.60378155 0.60378155 0.35808492
  0.         0.30189078 0.6599757  0.         0.         0.
  0.6599757  0.         0.         0.         0.35808492 0.35808492]] 


wordbags_nmf
 [[0.         0.97227023]
 [0.         0.97227023]
 [0.         1.15324925]
 [0.83604845 0.        ]
 [1.0726929  0.        ]
 [1.26324987 0.        ]]




In [26]:
wordbags_nmf[2][1] / wordbags_nmf[0][1] * 5

5.930703308172537

In [23]:
type(wordbags_nmf[0][0])

numpy.float64

In [35]:
# reconstruct texts from NMF components - how well does this work?

reconstruction = nmf.inverse_transform(wordbags_nmf)
print ("wordbags:\n", wordbags.toarray(), '\n\nreconstruction:\n', reconstruction)

wordbags:
 [[0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1]
 [0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0]
 [0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0]] 

reconstruction:
 [[0.29351941 0.34815531 0.93519414 0.         0.         0.
  0.         0.         0.         0.58703883 0.58703883 0.34815531
  0.         0.29351941 0.64167473 0.         0.         0.
  0.64167473 0.         0.         0.         0.34815531 0.34815531]
 [0.29351941 0.34815531 0.93519414 0.         0.         0.
  0.         0.         0.         0.58703883 0.58703883 0.34815531
  0.         0.29351941 0.64167473 0.         0.         0.
  0.64167473 0.         0.         0.         0.34815531 0.34815531]
 [0.34815531 0.41296117 1.1092718  0.         0.         0.
  0.         0.         0.         0.69631062 0.69631062 0.41296117
  0.         0.348155

In [55]:
type(wordbags)

scipy.sparse.csr.csr_matrix

In [56]:
import numpy as np

# for AgglomerativeClustering: it complained that data was sparse
y = wordbags.todense()

# dense => bmp
# sparse => just say posiitons for 1s

In [57]:
print(y)
type(y)

[[0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1]
 [0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0]
 [0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0]
 [0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0]]


numpy.matrix

In [59]:
# Agg

from sklearn.cluster import AgglomerativeClustering
n_clusters = 2
agg = AgglomerativeClustering(n_clusters=n_clusters)

assignement = agg.fit_predict(y)
display(assignement)

array([1, 1, 1, 0, 0, 0])

In [70]:
# try clustering on the words - can I extract topics?
# BTW it's fun to play with the number of clusters - eg go from 2 to 3 or 4

from sklearn.cluster import KMeans
n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters)

assignement = kmeans.fit_predict(wordbags)
display(assignement)

array([1, 1, 1, 0, 0, 0], dtype=int32)

In [71]:
# distances from each cluster centre

kmeans.transform(wordbags)

array([[2.98142397, 1.33333333],
       [2.98142397, 1.33333333],
       [3.29983165, 1.76383421],
       [1.49071198, 2.90593263],
       [1.37436854, 3.07318149],
       [1.79505494, 3.38296386]])

In [63]:
kmeans.labels_

array([0, 0, 0, 1, 1, 1], dtype=int32)

In [64]:
kmeans.score()

-14.000000000000005

In [73]:
from sklearn.cluster import KMeans
numbers = [1,2,3,4,5,6]

for n_clusters in numbers:
    print("cluster number:", n_clusters)
    kmeans = KMeans(n_clusters=n_clusters)
    assignement = kmeans.fit_predict(wordbags)
    print(assignement)
    print(kmeans.score(wordbags), "\n\n")

cluster number: 1
[0 0 0 0 0 0]
-25.0 


cluster number: 2
[1 1 1 0 0 0]
-14.000000000000005 


cluster number: 3
[0 0 0 2 2 1]
-9.166666666666668 


cluster number: 4
[1 1 3 0 0 2]
-4.5 


cluster number: 5
[1 1 3 2 4 0]
-2.0 


cluster number: 6
[0 5 4 3 1 2]
-0.0 




In [52]:
# let's see what the distance is from each datapoint to their cluster centre

# get cluster coordinates
kmeans.cluster_centers_

# then give up

array([[0.33333333, 0.33333333, 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.66666667,
        0.66666667, 0.33333333, 0.        , 0.33333333, 0.66666667,
        0.        , 0.        , 0.        , 0.66666667, 0.        ,
        0.        , 0.        , 0.33333333, 0.33333333],
       [0.        , 0.        , 0.        , 0.33333333, 0.66666667,
        0.33333333, 1.        , 1.        , 0.33333333, 0.        ,
        0.        , 0.        , 0.33333333, 0.        , 0.        ,
        0.33333333, 0.33333333, 0.33333333, 0.        , 0.33333333,
        0.66666667, 0.33333333, 0.        , 0.        ]])

In [None]:
# for i in n_clusters:
#     dataInCluster = wordbags[assignement[cluster==i].rowNames,]
#     distance = norm(dataInCluster-clusterCenter[i])