In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import xlsxwriter

In [3]:
# the list of documents, sometimes referred to as 'corpus'
list_of_documents = ["Python is a popular programming language.",
             "I do not like snakes, including Pythons.",
             "Some of Monty Python's shows were really fun.",
             "Python is a funny name for a programming language.",
             "Pythons are a family of nonvenomous snakes found in Africa, Asia, and Australia."]

In [4]:
# create the vectorizer and let it know that you want to get rid of english stop words, like 'a', 'the', ..
my_vectorizer = TfidfVectorizer(stop_words='english')

In [5]:
# the vectorizer returns a sparse array ..
my_sparseArray = my_vectorizer.fit_transform(list_of_documents)

In [6]:
#  ... for better understanding, we convert it into a standard matrix
myArray = my_sparseArray.toarray()

In [7]:
# let's get the dimensions of our array
print(myArray.shape)

(5, 18)


In [9]:
# and see what words are actually our features
features = my_vectorizer.get_feature_names()
print(features)

['africa', 'asia', 'australia', 'family', 'fun', 'funny', 'including', 'language', 'like', 'monty', 'nonvenomous', 'popular', 'programming', 'python', 'pythons', 'really', 'shows', 'snakes']


In [38]:
# we create a workbook and worksheet to write the data
workbook = xlsxwriter.Workbook("Tfidf_test1.xlsx")
worksheet = workbook.add_worksheet()

In [39]:
# now we see the words that were extracted, i.e. stopwords are removed
col = 0
for word in features:
    worksheet.write(0, col, word)
    col += 1

In [40]:
myArray.shape

(5, 18)

In [41]:
# now let's add the feature matrix
for j in range(myArray.shape[1]):
    print(j)
    for i in range(myArray.shape[0]):
        worksheet.write(i+1, j, myArray[i][j] )
    last_row = i + 3    # let's remember the last row and add an empty one

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


In [42]:
# find cluster specified number of clusters in the array using the KMeans algorithm
number_of_clusters = 2
model = KMeans(n_clusters=number_of_clusters, init='k-means++', max_iter=100, n_init=1)
model.fit(my_sparseArray)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=2, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [43]:
# let' get the cluster coordinates and add them to the excel
cluster_coordinates = model.cluster_centers_
cluster_coordinates.shape

(2, 18)

In [44]:
print(cluster_coordinates[0])
print("--------------------------------")
print(cluster_coordinates[1])

[0.19917581 0.19917581 0.19917581 0.19917581 0.         0.
 0.27516457 0.         0.27516457 0.         0.19917581 0.
 0.         0.         0.38269483 0.         0.         0.38269483]
--------------------------------
[0.         0.         0.         0.         0.15804155 0.20099492
 0.         0.32432288 0.         0.15804155 0.         0.20099492
 0.32432288 0.37505955 0.         0.15804155 0.15804155 0.        ]


In [45]:
for j in range(cluster_coordinates.shape[1]):
    print(j)
    for i in range(cluster_coordinates.shape[0]):
        worksheet.write(last_row + i, j, cluster_coordinates[i][j] )

workbook.close()        # close the excel file, otherwise it would not be properly created

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


## testing on new data and writing it into the excel

In [46]:
T = my_vectorizer.transform(["I like programming."])
Tarray = T.toarray()
Tarray.shape

(1, 18)

In [47]:
for j in range(Tarray.shape[1]):
    worksheet.write(last_row + 3, j, Tarray[0][j])

In [48]:
classification = model.predict(T)

In [49]:
print(classification)

[1]


In [50]:
workbook.close()        # moved from above to here to close the excel file