In [1]:
# Word Frequency Arrays and Clustering
# Creating insight and generalizations on documents

# Rows represent documents (articles, books, pages, etc).
# Columns represent words.
# Entries measure presence of each word in each document.
# Sparse arrays are arrays whose entries are mostly 0; Word Frequency Arrays (WFAs) are typically sparse.


# pip3 install pandas
# pip3 install scipy
# pip3 install scikit-learn
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Load the word-frequency table from CSV, using the first CSV column as the index.
# NOTE(review): this is an absolute, machine-specific path — adjust it to wherever the dataset lives locally.
WIKIPEDIA_VECTORS_CSV = '/Users/alexandergursky/Local_Repository/Datasets/Structured/CSV/Wikipedia articles/wikipedia-vectors.csv'
df = pd.read_csv(WIKIPEDIA_VECTORS_CSV, index_col=0)

# The column labels of the loaded frame are used as the article titles.
titles = df.columns.tolist()

# Transpose the frame and store it as a CSR sparse matrix,
# which keeps only the non-zero entries and therefore saves space.
articles = csr_matrix(df.T)


In [2]:
# Fixed seed shared by both stochastic estimators so that a fresh
# "Restart Kernel -> Run All" reproduces the same cluster assignments.
RANDOM_STATE = 42

# Creating a TruncatedSVD instance that keeps 50 components.
# Its default solver is randomized, so it is seeded for reproducibility.
svd = TruncatedSVD(n_components=50, random_state=RANDOM_STATE)

# Creating a KMeans cluster instance with 6 clusters.
# n_init is set explicitly because its default changed between scikit-learn
# releases (10 -> 'auto'); pinning it keeps results consistent across versions
# and avoids the FutureWarning emitted by the transitional releases.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=RANDOM_STATE)

# Creating a pipeline for them both: the SVD step runs first and its output feeds KMeans.
pipeline = make_pipeline(svd, kmeans)

In [3]:
# Fit the pipeline on the sparse article matrix, then predict a cluster label
# for each article. fit() returns the pipeline itself, so the calls are chained.
labels = pipeline.fit(articles).predict(articles)

# Pair every predicted cluster label with its article title.
label_column = 'Label'
pred_df = pd.DataFrame({label_column: labels, 'Articles': titles})

# Print the table ordered by cluster label.
clustered = pred_df.sort_values(by=label_column)
print(clustered)

    Label                                       Articles
41      0                                    Hepatitis B
42      0                                    Doxycycline
43      0                                       Leukemia
44      0                                           Gout
45      0                                    Hepatitis C
46      0                                     Prednisone
47      0                                          Fever
48      0                                     Gabapentin
49      0                                       Lymphoma
40      0                                    Tonsillitis
58      1                                         Sepsis
59      1                                    Adam Levine
54      1                                 Arctic Monkeys
57      1                          Red Hot Chili Peppers
56      1                                       Skrillex
55      1                                  Black Sabbath
53      1                      