In [33]:
# Non-negative matrix factorization (NMF)

# NMF stands for "non-negative matrix factorization". NMF, like PCA, is a dimension reduction technique.
# In contrast to PCA, however, NMF models are interpretable.
# This means an NMF model is both easier to understand yourself and much easier to explain to others.
# However, NMF cannot be applied to every dataset.
# It requires the sample features to be "non-negative", i.e. greater than or equal to 0.

# pip3 install pandas
# pip3 install scikit-learn
# pip3 install scipy
import pandas as pd
from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix

# Read the Wikipedia vectors table; the first CSV column becomes the row index.
df = pd.read_csv(
    '/Users/alexandergursky/Local_Repository/Datasets/Dataset_Package/Wikipedia articles/wikipedia-vectors.csv',
    index_col=0,
)

# The column labels of df are kept as a plain list; once the table is transposed
# below, each of these labels corresponds to one row of the matrix.
titles = df.columns.tolist()

# Transpose the table and store it in SciPy's compressed-sparse-row format,
# which only remembers the non-zero entries and therefore saves space.
articles = csr_matrix(df.T)


In [34]:
# Create an NMF instance with 6 components.
# random_state pins the seed used by NMF's randomised steps, so re-running this
# cell on a fresh kernel yields the same factors. Later cells refer to a component
# by its position (index 3), so the fit needs to be repeatable.
model = NMF(n_components=6, random_state=42)

# Learn the components from the articles (our data)
model.fit(articles)

# Express every article in terms of the learned components:
# one row per article, one column per component.
nmf_features = model.transform(articles)

# Print the NMF features, rounded to two decimals for readability
print(nmf_features.round(2))

[[0.   0.   0.   0.   0.   0.44]
 [0.   0.   0.   0.   0.   0.57]
 [0.   0.   0.   0.   0.   0.4 ]
 [0.   0.   0.   0.   0.   0.38]
 [0.   0.   0.   0.   0.   0.49]
 [0.01 0.01 0.01 0.03 0.   0.33]
 [0.   0.   0.02 0.   0.01 0.36]
 [0.   0.   0.   0.   0.   0.49]
 [0.02 0.01 0.   0.02 0.03 0.48]
 [0.01 0.03 0.03 0.07 0.02 0.34]
 [0.   0.   0.53 0.   0.03 0.  ]
 [0.   0.   0.36 0.   0.   0.  ]
 [0.01 0.01 0.31 0.06 0.01 0.02]
 [0.   0.01 0.34 0.01 0.   0.  ]
 [0.   0.   0.43 0.   0.04 0.  ]
 [0.   0.   0.48 0.   0.   0.  ]
 [0.01 0.02 0.38 0.03 0.   0.01]
 [0.   0.   0.48 0.   0.   0.  ]
 [0.   0.01 0.55 0.   0.   0.  ]
 [0.   0.   0.47 0.   0.   0.  ]
 [0.   0.01 0.02 0.52 0.06 0.01]
 [0.   0.   0.   0.51 0.   0.  ]
 [0.   0.01 0.   0.42 0.   0.  ]
 [0.   0.   0.   0.44 0.   0.  ]
 [0.   0.   0.   0.5  0.   0.  ]
 [0.1  0.09 0.   0.38 0.   0.01]
 [0.   0.   0.   0.57 0.   0.01]
 [0.01 0.01 0.   0.47 0.   0.01]
 [0.   0.   0.   0.58 0.   0.  ]
 [0.   0.   0.   0.53 0.01 0.01]
 [0.   0.4



In [35]:
# Wrap the NMF features in a DataFrame whose row labels are the article titles
article_pred_df = pd.DataFrame(data=nmf_features, index=titles)

# Print the feature row for 'Anne Hathaway', then the one for 'Denzel Washington'
for actor in ('Anne Hathaway', 'Denzel Washington'):
    print(article_pred_df.loc[actor])

# NMF components represent topics.
# In the output, the feature labelled '3' (the 4th one, since labels start at 0)
# holds by far the highest value for both articles.
# As both of these are actors, this makes sense: feature 3 is the main ingredient
# used to reconstruct both articles.
# In this way NMF learns the topics of documents.

0    0.003845
1    0.000000
2    0.000000
3    0.575712
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64
0    0.000000
1    0.005601
2    0.000000
3    0.422381
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64


In [36]:
# Load the vocabulary file (no header row; presumably one word per line).
# dtype=str together with na_filter=False keeps every entry exactly as written in
# the file. With the read_csv defaults, words that look like missing-value markers
# (e.g. "null", "nan", "NA") would be converted to NaN and end up as NaN column
# labels in components_df below.
words_df = pd.read_csv(
    '/Users/alexandergursky/Local_Repository/Datasets/Dataset_Package/Wikipedia articles/wikipedia-vocabulary-utf8.txt',
    header=None,
    dtype=str,
    na_filter=False,
)

# Extract the first column and turn it into a plain Python list of words
words_ls = words_df.iloc[:, 0].tolist()

# Create a DataFrame: one row per NMF component, one column per vocabulary word
components_df = pd.DataFrame(model.components_, columns=words_ls)

# Print the shape of the DataFrame: (number of components, number of words)
print(components_df.shape)

# Select row 3: the component that dominated the two actor articles in the previous cell
component = components_df.iloc[3]

# Print the words with the largest weights in this component (nlargest defaults to the top 5)
print(component.nlargest())

(6, 13125)
film       0.627875
award      0.253131
starred    0.245283
role       0.211450
actress    0.186397
Name: 3, dtype: float64
