# Boolean-Matrix Factorization
### Adam Fletcher

## Using Nimfa Library

In [None]:
import numpy as np
import scipy.sparse as sp

import nimfa

# Draw the demo matrix from a dedicated, seeded generator so that
# Restart & Run All reproduces the same factorization and printed metrics.
V = np.random.RandomState(0).rand(40, 100)
bmf = nimfa.Bmf(V, seed="nndsvd", rank=10, max_iter=12, lambda_w=1.1, lambda_h=1.1)
bmf_fit = bmf()

In [None]:
V

In [None]:
def __fact_factor(X):
    """
    Return dense factorization factor, so that output is printed nice if factor is sparse.
     
    :param X: Factorization factor.
    :type X: :class:`scipy.sparse` of format csr, csc, coo, bsr, dok, lil, dia or :class:`numpy.matrix`
    """
    return X.todense() if sp.isspmatrix(X) else X


def print_info(fit, idx=None):
    """
    Print to stdout info about the factorization.

    :param fit: Fitted factorization model.
    :type fit: :class:`nimfa.models.mf_fit.Mf_fit`
    :param idx: Name of the mixture (coefficient) matrix. Used only in the multiple NMF model. Therefore in
                factorizations that follow standard or nonsmooth model, this parameter can be omitted.
                Currently, SNMNMF implements multiple NMF model.
    :type idx: `str` with values 'coef' or 'coef1' (`int` value of 0 or 1, respectively)
    """
    print("=================================================================================================")
    print("Factorization method:", fit.fit)
    print("Initialization method:", fit.fit.seed)
    print("Basis matrix W: ")
    print(__fact_factor(fit.basis()))
    # The documented string names ('coef' / 'coef1') cannot be rendered with %d,
    # so map them to their integer equivalents and fall back to %s for any other string.
    if idx is None:
        coef_label = 0
    else:
        coef_label = {'coef': 0, 'coef1': 1}.get(idx, idx)
    header = "Mixture (Coefficient) matrix H%s: " if isinstance(coef_label, str) else "Mixture (Coefficient) matrix H%d: "
    print(header % (coef_label,))
    print(__fact_factor(fit.coef(idx)))
    print("Distance (Euclidean): ", fit.distance(metric='euclidean', idx=idx))
    # Request the summary once and reuse it for every measure printed below.
    summary = fit.summary(idx)
    # The number of iterations is also available directly as fit.fit.n_iter.
    print("Actual number of iterations: ", summary['n_iter'])
    # The sparseness measure is also available directly as fit.fit.sparseness().
    sparseness = summary['sparseness']
    print("Sparseness basis: %7.4f, Sparseness mixture: %7.4f" % (sparseness[0], sparseness[1]))
    # The explained variance is also available directly as fit.fit.evar().
    print("Explained variance: ", summary['evar'])
    # The residual sum of squares is also available directly as fit.fit.rss().
    print("Residual sum of squares: ", summary['rss'])
    # Many more measures exist, but some need additional data or more runs,
    # e.g. entropy, predict, purity, coph_cor, consensus, select_features, score_features, connectivity
    print("=================================================================================================")

## Test Data

In [None]:
print_info(bmf_fit);

## OLX Data

In [None]:
import pandas as pd
seekers_data = pd.read_csv("./job_persona_data/seekers.csv")

In [None]:
seekers_data.head()

**Data Cleanup To Do:**
- Turn True/False into 1's and 0's (Already a Bool)
- What are the different platforms (Only Desktop) -- CAN REMOVE
- top_persona is my label! (Though this is an unsupervised process)

In [None]:
seekers_data.dtypes

In [None]:
def unique(list1):
    """
    Print every distinct element of ``list1`` once, one per line.

    Elements are printed in the order in which they first appear. Membership is
    checked with ``in`` against a list, so elements do not need to be hashable.

    :param list1: Iterable of values to de-duplicate.
    :returns: None; the distinct values are written to stdout.
    """
    first_seen = []
    for item in list1:
        # skip anything that has already been recorded
        if item in first_seen:
            continue
        first_seen.append(item)

    for item in first_seen:
        print(item)
        
unique(seekers_data['platform'])

In [None]:
# Removing Unneeded Columns
# Build the feature frame by selecting an explicit list of columns from `seekers_data`.
# Columns not listed here (e.g. `user_id`, `platform` and `top_persona`, which other
# cells read from `seekers_data`) are left out of the factorization input.

cleaned_seekers_data = seekers_data[[
    #'user_id',  # identifier, deliberately excluded from the features
    'survive',
    'escape',
    'quick_process',
    'getting_inspired',
    'structured_process',
    'boost_my_cv',
    'quick_scan',
    'prove_to_self',
    'compare_options',
    'rich_data',
    'unbiased_information',
    'dry_data',
    'clear_job_information',
    'human_verification',
    'head_start',
    'treated_with_respect',
    'standing_out',
    'present_real_me',
    'manage_insecurity',
    'being_informed',
    'learn_from_process',
    'values',
    'money',
    'stability',
    'atmosphere',
    'balance',
    'happiness',
    'development',
    'promotion',
    'freedom',
    'missed_interview',
    'shortcutter',
    'survivor',
    'dreamer',
    'prepared',
    'perfect',
    'believer'
]]

In [None]:
seekers_array = cleaned_seekers_data.to_numpy()
seekers_array

## Define Nimfa Model

In [None]:
# Define the Model
# Configure nimfa's Bmf on the seekers matrix with 8 factors (rank=8),
# "nndsvd" seeding and an iteration cap of 50000. The model is only
# constructed here; calling it in a separate cell runs the factorization.
# NOTE(review): lambda_w / lambda_h are presumably the regularization weights
# for W and H — confirm against the nimfa Bmf documentation.

seekers_bmf = nimfa.Bmf(
    seekers_array, 
    seed="nndsvd", 
    rank=8, 
    max_iter=50000, 
    lambda_w=1.1, 
    lambda_h=1.1)

In [None]:
seekers_bmf_fit = seekers_bmf()

In [None]:
#print_info(seekers_bmf_fit)

In [None]:
seekers_W = __fact_factor(seekers_bmf_fit.basis())
seekers_W = pd.DataFrame(seekers_W)

In [None]:
import seaborn as sns
sns.heatmap(seekers_W)

In [None]:
seekers_H = __fact_factor(seekers_bmf_fit.coef(idx= None))
seekers_H = pd.DataFrame(seekers_H)

In [None]:
pd.DataFrame(seekers_array).to_csv('seekers_array.csv')

## Predict Original Array

In [None]:
predictions = pd.DataFrame(np.dot(seekers_W, seekers_H))
predictions.head()

In [None]:
# Peek at a single reconstructed value. Use a literal position: `row` and `column`
# are not defined at this point on a fresh kernel, and a single `.iloc[r, c]`
# lookup avoids chained indexing.
predictions.iloc[0, 0]

In [None]:
# Threshold the reconstructed scores at 0.5 to recover a 0/1 matrix.
# The vectorized comparison builds a new frame, so `predictions` keeps the raw
# scores (plain assignment would only alias it), and it avoids chained
# `.iloc[row][column] = ...` assignment, which may silently write to a temporary copy.
rounded_predictions = (predictions >= 0.5).astype(int)


In [None]:
rounded_predictions.head()

## Calculate Error Rate

In [None]:
# Element-wise mismatch between the thresholded reconstruction and the original
# matrix (`* 1` turns boolean entries into 0/1 before subtracting).
difference = (rounded_predictions - pd.DataFrame(seekers_array) * 1).abs()
diff_vector = difference.sum()
# Divide by the actual number of cells instead of a hard-coded shape so the
# percentage stays correct if the data set changes.
print("Error Rate:", (diff_vector.sum() / difference.size) * 100, "%")

## scikit-learn NMF

### Test Data

In [None]:
X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
from sklearn.decomposition import NMF
model = NMF(n_components=6, init='random', random_state=0)
W = model.fit_transform(X)
H = model.components_

In [None]:
pd.DataFrame(X).head()

In [None]:
pd.DataFrame(W).head()

In [None]:
pd.DataFrame(H).head()

In [None]:
pd.DataFrame(np.dot(W, H))

### OLX Data

In [None]:
# Model Definition
model = NMF(n_components=6, init='nndsvda', random_state=0)


scikit_seekers_W = model.fit_transform(seekers_array)
scikit_seekers_H = model.components_

In [None]:
pd.DataFrame(scikit_seekers_W).head()

In [None]:
pd.DataFrame(scikit_seekers_H)

### Predicted Matrix and Error Rate

In [None]:
scikit_predictions = pd.DataFrame(np.dot(scikit_seekers_W, scikit_seekers_H))
scikit_predictions.head()

In [None]:
# Threshold the reconstructed scores at 0.5 to recover a 0/1 matrix.
# A vectorized comparison replaces the per-cell loop, whose chained
# `.iloc[row][column] = ...` assignment may silently write to a temporary copy.
scikit_predictions = (scikit_predictions >= 0.5).astype(int)


In [None]:
scikit_predictions.head()

In [None]:
# Element-wise mismatch between the thresholded reconstruction and the original
# matrix (`* 1` turns boolean entries into 0/1 before subtracting).
difference = (scikit_predictions - pd.DataFrame(seekers_array) * 1).abs()
diff_vector = difference.sum()
# Divide by the actual number of cells instead of a hard-coded shape so the
# percentage stays correct if the data set changes.
print("Error Rate:", (diff_vector.sum() / difference.size) * 100, "%")

# Alternative Approach
# k-means clustering

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(
    n_clusters=4, 
    random_state=0).fit(seekers_array)

In [None]:
kmeans.labels_

In [None]:
from sklearn.decomposition import PCA
# Keep three components: the 2-D scatter uses the first two, and the 3-D plots
# further down also read 'principal component 3'.
pca = PCA(n_components=3)

principal_components = pca.fit_transform(seekers_array)

principalDf = pd.DataFrame(
    data=principal_components,
    columns=[
        'principal component 1',
        'principal component 2',
        'principal component 3',
    ],
)

principalDf.head()

In [None]:
# Attach the k-means assignment so each point can be coloured by its cluster.
principalDf['cluster'] = kmeans.labels_

# `import matplotlib` alone does not load the pyplot submodule; import it
# explicitly instead of relying on another library having loaded it already.
import matplotlib.pyplot

matplotlib.pyplot.scatter(
    principalDf['principal component 1'],
    principalDf['principal component 2'],
    c=principalDf['cluster'],
    cmap='Set1')
matplotlib.pyplot.xlabel('PC1')
matplotlib.pyplot.ylabel('PC2')
matplotlib.pyplot.show()

In [None]:
# This import registers the 3D projection, but is otherwise unused.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

import matplotlib.pyplot as pyplot
import numpy as np

fig = pyplot.figure()
# Create the 3-D axes through the figure so they are attached to it;
# constructing Axes3D(fig) directly is deprecated and can leave the figure empty
# on recent Matplotlib releases.
ax = fig.add_subplot(111, projection='3d')

sequence_containing_x_vals = principalDf['principal component 1']
sequence_containing_y_vals = principalDf['principal component 2']
sequence_containing_z_vals = principalDf['principal component 3']


# Colour each point by its k-means cluster label.
ax.scatter(sequence_containing_x_vals, 
           sequence_containing_y_vals, 
           sequence_containing_z_vals,
           c=principalDf['cluster'],
           cmap = 'Set1')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
pyplot.show()

In [None]:
# This import registers the 3D projection, but is otherwise unused.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import

import matplotlib.pyplot as pyplot
import numpy as np

fig = pyplot.figure()
# Create the 3-D axes through the figure so they are attached to it;
# constructing Axes3D(fig) directly is deprecated and can leave the figure empty
# on recent Matplotlib releases.
ax = fig.add_subplot(111, projection='3d')

# Same data as the previous 3-D plot with the axes rotated (PC2, PC3, PC1).
sequence_containing_x_vals = principalDf['principal component 2']
sequence_containing_y_vals = principalDf['principal component 3']
sequence_containing_z_vals = principalDf['principal component 1']


# Colour each point by its k-means cluster label.
ax.scatter(sequence_containing_x_vals, 
           sequence_containing_y_vals, 
           sequence_containing_z_vals,
           c=principalDf['cluster'],
           cmap = 'Set1')
ax.set_xlabel('PC2')
ax.set_ylabel('PC3')
ax.set_zlabel('PC1')
pyplot.show()

## Comparing Clustering to Persona Identification

Koos and Robin did research and analysis to identify the personas of the people using the site.
To what extent do the clusters and the personas match up?

In [None]:
# One row per seeker: the researched persona label next to the k-means cluster.
# Built from a dict so that each entry becomes a named column.
comparison_df = pd.DataFrame({
    'User ID': seekers_data['user_id'],
    'Top Persona': seekers_data['top_persona'],
    'Cluster': kmeans.labels_
})
comparison_df.head()

Now I want to count how many times each cluster coincides with each persona.

In [None]:
comparison_df.groupby('Top Persona')['Cluster'].value_counts()

In [None]:
comparison_df.groupby('Cluster')['Top Persona'].value_counts()