In [41]:
import pandas as pd
from scipy.sparse import csr_matrix

# Read the article vectors; the first CSV column is used as the row index
df = pd.read_csv('Wikipedia Articles/wikipedia-vectors.csv', index_col=0)

# The column labels of the table are the article titles
titles = df.columns.tolist()

# Transpose so that each row lines up with one title, and store it as a CSR sparse matrix
articles = csr_matrix(df.T)

In [42]:
# Inspect the dimensions of the sparse matrix built from the transposed DataFrame
articles.shape

(60, 13125)

The `articles` matrix has 13,125 features per article, so let's apply NMF to reduce it to a small number of components that capture the most important structure.

In [44]:
# Import NMF
from sklearn.decomposition import NMF

# Create an NMF instance: model
# random_state is pinned so that repeated runs (e.g. Restart Kernel -> Run All)
# produce the same factorisation instead of depending on the global RNG state.
model = NMF(n_components=6, random_state=0)

# Fit the model to articles (learns model.components_)
model.fit(articles)

# Transform the articles: nmf_features (one row per article, one column per component)
nmf_features = model.transform(articles)

# Print the NMF features, rounded to two decimals for readability
print(nmf_features.round(2))


[[0.   0.   0.   0.   0.   0.44]
 [0.   0.   0.   0.   0.   0.57]
 [0.   0.   0.   0.   0.   0.4 ]
 [0.   0.   0.   0.   0.   0.38]
 [0.   0.   0.   0.   0.   0.49]
 [0.01 0.01 0.01 0.03 0.   0.33]
 [0.   0.   0.02 0.   0.01 0.36]
 [0.   0.   0.   0.   0.   0.49]
 [0.02 0.01 0.   0.02 0.03 0.48]
 [0.01 0.03 0.03 0.07 0.02 0.34]
 [0.   0.   0.53 0.   0.03 0.  ]
 [0.   0.   0.36 0.   0.   0.  ]
 [0.01 0.01 0.31 0.06 0.01 0.02]
 [0.   0.01 0.34 0.01 0.   0.  ]
 [0.   0.   0.43 0.   0.04 0.  ]
 [0.   0.   0.48 0.   0.   0.  ]
 [0.01 0.02 0.38 0.03 0.   0.01]
 [0.   0.   0.48 0.   0.   0.  ]
 [0.   0.01 0.55 0.   0.   0.  ]
 [0.   0.   0.47 0.   0.   0.  ]
 [0.   0.01 0.02 0.52 0.06 0.01]
 [0.   0.   0.   0.51 0.   0.  ]
 [0.   0.01 0.   0.42 0.   0.  ]
 [0.   0.   0.   0.44 0.   0.  ]
 [0.   0.   0.   0.5  0.   0.  ]
 [0.1  0.09 0.   0.38 0.   0.01]
 [0.   0.   0.   0.57 0.   0.01]
 [0.01 0.01 0.   0.47 0.   0.01]
 [0.   0.   0.   0.58 0.   0.  ]
 [0.   0.   0.   0.53 0.01 0.01]
 [0.   0.4



In [45]:
# Check the dimensions of the array returned by model.transform(articles)
nmf_features.shape

(60, 6)

Now each article is represented by only 6 NMF features.

In [46]:
# Import pandas
import pandas as pd

# Wrap the NMF features in a DataFrame whose row labels are the article titles: df
df = pd.DataFrame(data=nmf_features, index=titles)

# Print the rows for 'Anne Hathaway' and 'Denzel Washington', one after the other
print(df.loc['Anne Hathaway'], df.loc['Denzel Washington'], sep='\n')

0    0.003847
1    0.000000
2    0.000000
3    0.575660
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64
0    0.000000
1    0.005601
2    0.000000
3    0.422343
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64


Notice that for both actors, NMF feature 3 (zero-based, i.e. the fourth column) has by far the highest value. This means that both articles are reconstructed mainly from the NMF component with index 3.

In [52]:
# Dimensions of the NMF feature array (same check as the earlier nmf_features.shape cell)
nmf_features.shape

(60, 6)

In [47]:
# Dimensions of the components learned by the fitted NMF model (created with n_components = 6)
model.components_.shape

(6, 13125)

## NMF learns topics of documents

When NMF is applied to documents, the components correspond to topics, and the NMF features describe how each document is reconstructed from those topics.

In [48]:
# Read the vocabulary file.
# - The context manager closes the handle even if read() raises.
# - The encoding is given explicitly (the filename indicates UTF-8) so decoding
#   does not depend on the platform's default encoding.
with open("Wikipedia Articles/wikipedia-vocabulary-utf8.txt", "r", encoding="utf-8") as my_file:
    content = my_file.read()

# Split the text into a list with one entry per line
words = content.split("\n")

In [49]:
# Import pandas
import pandas as pd

# Build a DataFrame of the NMF components, labelling each column with its word: components_df
components_df = pd.DataFrame(data=model.components_, columns=words)

# Show (number of components, number of words)
print(components_df.shape)

# Pick out the component at row position 3: component
component = components_df.iloc[3]

# Show the five words with the largest weights in that component
print(component.nlargest(n=5))

(6, 13125)
film       0.627931
award      0.253154
starred    0.245306
role       0.211469
actress    0.186414
Name: 3, dtype: float64


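These are the top words of the topic that the articles about Anne Hathaway and Denzel Washington have in common: both articles are dominated by component 3, whose largest weights fall on film-related words.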

In [50]:
# Display the components DataFrame built from model.components_ with the vocabulary words as column labels
components_df

Unnamed: 0,aaron,abandon,abandoned,abandoning,abandonment,abbas,abbey,abbreviated,abbreviation,abc,...,zealand,zenith,zeppelin,zero,zeus,zimbabwe,zinc,zone,zones,zoo
0,0.011372,0.001209,0.0,0.001738,0.000136,0.0,0.0,0.002463,2.445016e-07,0.000834,...,0.025772,0.0,0.008321,0.0,0.0,0.0,0.0,0.0,0.000423,0.0
1,0.0,1e-05,0.005663,0.0,2e-06,0.0,0.0,0.000566,0.000500258,0.0,...,0.008106,0.0,0.0,0.00171,0.0,0.0,0.0,0.002813,0.000297,0.0
2,0.0,8e-06,0.0,0.0,0.004692,0.0,0.0,0.000758,1.604236e-05,0.0,...,0.00873,0.0,0.0,0.001317,0.0,0.0,0.0,0.0,0.000143,0.0
3,0.004148,0.0,0.003056,0.0,0.000614,0.0,0.0,0.002436,8.143976e-05,0.003985,...,0.012595,0.0,0.0,0.0,0.0,0.0,0.0,0.001742,0.00672,0.0
4,0.0,0.000569,0.004919,0.0,0.0,0.0,0.0,8.9e-05,4.260964e-05,0.0,...,0.00181,0.0,0.0,1.7e-05,0.0,0.0,0.0,0.000192,0.001352,0.0
5,0.000138,0.0,0.008745,0.0,0.000185,0.0,0.0,0.008626,1.529783e-05,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0024,0.001681,0.0
