In [2]:
import glob
import pandas as pd

# Collect the path of every file inside the corpus folder.
# NOTE: glob does not sort its results; the order depends on the filesystem.
filenames = glob.glob("corpus-20090418/*")

# Read each file fully into memory, in the same order as `filenames`.
# The `with` block closes every handle as soon as its text has been read,
# rather than leaving open files around until garbage collection.
content = []
for filename in filenames:
    with open(filename, encoding='latin-1') as handle:
        content.append(handle.read())

# And then convert them to a dataframe: one row per file, holding its path
# and its complete text.
df = pd.DataFrame({
    'filename': filenames,
    'content': content
})
df

Unnamed: 0,filename,content
0,corpus-20090418/g0pC_taskc.txt,The vector space model is where each document ...
1,corpus-20090418/g0pE_taskd.txt,Bayes Theorem is an important theorem relating...
2,corpus-20090418/g0pE_taske.txt,dynamic programming is a method of solving pro...
3,corpus-20090418/g0pC_taskb.txt,There are many attributes which infulance the ...
4,corpus-20090418/g1pA_taske.txt,Dynamic programming is an algorithmic techniqu...
...,...,...
95,corpus-20090418/g1pD_taske.txt,Dynamic programming is a faster method of solv...
96,corpus-20090418/g1pD_taskd.txt,Bayes Theorem is a mathematical formula used t...
97,corpus-20090418/g3pA_taskb.txt,The Google search engine uses a link analysis ...
98,corpus-20090418/g1pB_taskc.txt,The algebraic model for representing text docu...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Make a vectorizer (scikit-learn's TF-IDF vectorizer, default settings).
vectorizer = TfidfVectorizer()

# Learn the vocabulary from df.content and compute the TF-IDF weights,
# one row per entry of df.content.
matrix = vectorizer.fit_transform(df.content)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and only fall back to the old method on
# releases that predate it.
feature_names_getter = (getattr(vectorizer, "get_feature_names_out", None)
                        or vectorizer.get_feature_names)

# Dense, labelled view of the matrix: one column per vocabulary term,
# sharing df's index so rows line up with the documents.
words_df = pd.DataFrame(matrix.toarray(),
                        columns=feature_names_getter(),
                        index=df.index)

words_df.head()

Unnamed: 0,10,15,1702â,1761,1940s,1953,1967,1982,2005,2007,...,yang,year,years,yn,yo,you,your,yours,yourself,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.201277,0.128774,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.131461,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Use KMeans to cluster our datapoints (our pieces of text)

In [65]:
from sklearn.cluster import KMeans

number_of_clusters=5

# KMeans starts from randomly chosen centroids, so without a fixed seed the
# cluster numbering (and potentially the membership) changes between runs and
# the outputs below cannot be reproduced. random_state pins the initialisation.
# n_init is set explicitly because its default differs across scikit-learn
# releases (10 in older versions, 'auto' in newer ones).
km = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)

# Go put things into groups!!
# use matrix, which is the less attractive words_df
km.fit(matrix)

KMeans(n_clusters=5)

In [67]:
print("Top terms per cluster:")
# For every cluster centre, the feature indices ordered from the largest
# weight to the smallest (argsort is ascending, hence the reversal).
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
terms = (getattr(vectorizer, "get_feature_names_out", None)
         or vectorizer.get_feature_names)()
for i in range(number_of_clusters):
    # Keep the eight highest-weighted terms of cluster i.
    top_words = [terms[ind] for ind in order_centroids[i, :8]]
    print("Cluster {}: {}".format(i, ' '.join(top_words)))

Top terms per cluster:
Cluster 0: the pagerank page google of to links is
Cluster 1: the vector document space model of term in
Cluster 2: the programming optimal subproblems dynamic problems to is
Cluster 3: probability the theorem bayes of probabilities is conditional
Cluster 4: classes inheritance the of class is to code


In [68]:
# Store the label KMeans assigned to each row in a 'cluster' column of df.
cluster_labels = km.labels_
df['cluster'] = cluster_labels
# Preview the first ten rows together with their cluster label.
df.head(n=10)

Unnamed: 0,filename,content,cluster
0,corpus-20090418/g0pC_taskc.txt,The vector space model is where each document ...,1
1,corpus-20090418/g0pE_taskd.txt,Bayes Theorem is an important theorem relating...,3
2,corpus-20090418/g0pE_taske.txt,dynamic programming is a method of solving pro...,2
3,corpus-20090418/g0pC_taskb.txt,There are many attributes which infulance the ...,0
4,corpus-20090418/g1pA_taske.txt,Dynamic programming is an algorithmic techniqu...,2
5,corpus-20090418/g3pB_taskd.txt,Bayes' theorem (often called Bayes' law) conne...,3
6,corpus-20090418/g3pB_taske.txt,Dynamic Programming is a method of solving pro...,2
7,corpus-20090418/g0pC_taska.txt,inheritance in object oriented programming is ...,4
8,corpus-20090418/g1pA_taskd.txt,Bayes' theorem relates the conditional and mar...,3
9,corpus-20090418/g0pC_taske.txt,In computer science; dynamic programming is a ...,2


In [69]:
# Count how many rows landed in each cluster.
df['cluster'].value_counts()

0    20
1    20
2    20
3    20
4    20
Name: cluster, dtype: int64

In [70]:
# Show only the rows whose cluster label is 2.
in_cluster_two = df['cluster'] == 2
df[in_cluster_two]

Unnamed: 0,filename,content,cluster
2,corpus-20090418/g0pE_taske.txt,dynamic programming is a method of solving pro...,2
4,corpus-20090418/g1pA_taske.txt,Dynamic programming is an algorithmic techniqu...,2
6,corpus-20090418/g3pB_taske.txt,Dynamic Programming is a method of solving pro...,2
9,corpus-20090418/g0pC_taske.txt,In computer science; dynamic programming is a ...,2
24,corpus-20090418/g0pD_taske.txt,Dynamic programming (DP) is an extremely power...,2
30,corpus-20090418/orig_taske.txt,"In mathematics and computer science, dynamic p...",2
31,corpus-20090418/g3pC_taske.txt,"In computer science and mathematics, dynamic p...",2
36,corpus-20090418/g0pB_taske.txt,Dynamic programming is a method for solving ma...,2
39,corpus-20090418/g2pA_taske.txt,Dynamic Programming is a very powerful mathema...,2
48,corpus-20090418/g4pE_taske.txt,Dynamic programming is a method for efficient...,2
