In [270]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm 

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import NMF

pd.set_option('display.max_columns', None, 'display.max_rows', None)

In [271]:
df = pd.read_csv('../data/for_clustering_not_scaled.csv')

In [272]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2826 entries, 0 to 2825
Columns: 128 entries, moa1#1_1 to marriage4
dtypes: float64(128)
memory usage: 2.8 MB


In [273]:
# This will be used when grabbing verbose descriptions of columns
# In other words, the text of the questions
label_df = pd.read_csv('../data/labels_lower.csv')
label_df.tail(2)

Unnamed: 0,Variable Name,Question text,Survey Question ID
326,response_bias_sum,Sum of all Bias Dummy Variables,ADDED
327,school_coded,Cleaned Names of Schools Attended,ADDED


In [274]:
def get_question_text(colname, label_df=label_df):
    '''Get column description'''
    
    cond = label_df['Variable Name'] == colname
    idx = label_df.index[cond].tolist()[0]
    text = label_df.iloc[idx]['Question text']
    
    return text

In [275]:
# Lists of individual sections

moa = [c for c in df.columns if 'moa' in c]
idea = [c for c in df.columns if 'idea' in c]
swb = [c for c in df.columns if 'swb' in c]
mindful = [c for c in df.columns if 'mindful' in c]
belong = [c for c in df.columns if 'belong' in c]
efficacy = [c for c in df.columns if 'efficacy' in c]
support = [c for c in df.columns if 'support' in c]
transgres = [c for c in df.columns if 'transgres' in c]
exploit = [c for c in df.columns if 'exploit' in c]
stress = [c for c in df.columns if 'stress' in c]
marriage = [c for c in df.columns if 'marriage' in c]

socmedia = [c for c in df.columns if 'socmedia' in c]
usdream = [c for c in df.columns if 'usdream' in c]
demo = [c for c in df.columns if 'demo' in c]
disability = [c for c in df.columns if 'disability' in c]
phys = [c for c in df.columns if 'phys' in c]

In [278]:
def nmf_results(df, n_topics=5, top_ques=5, per_person=3, verbose=False): 
    
    X = StandardScaler(with_mean = False).fit_transform(df)    
    # Fit NMF to X
    nmf = NMF(n_components=n_topics, random_state=1,
              alpha=.3, l1_ratio=.5).fit(X)

    # W matrix maps each row (person) from input matrix to topic vector -> shape(m, n_topics)
    W = nmf.transform(X)

    # For each row/person, grab the column indices with highest values
    top_n_per_person = W.argsort()[:,-per_person:][:,::-1]

    # topics is the H matrix, mapping each topic to feature (question) -> shape (n_topics, p)
    topics = nmf.components_
    
    topic_questions = []
    
    for i, topic in enumerate(topics):
        topidxs = np.argsort(topic)[::-1][:top_ques]
        toplist = [(round(topic[i], 2), colname, get_question_text(colname)) 
                   for i, colname in zip(topidxs, df.columns[topidxs])]
        to_string = ['{} -> {} -> {}'.format(*tup) for tup in toplist]
        topques = '\n'.join(to_string)
        topic_questions.append(toplist)
        if verbose:
            print(f'*****Topic: {i}*****\n**Top Questions:\n{topques}\n')
        
    return nmf, topic_questions, top_n_per_person

In [280]:
nmf, topic_questions, top_n_per_person = nmf_results(df, n_topics=10, top_ques=10, 
                                                     per_person=3, verbose=True)

*****Topic: 0*****
**Top Questions:
8.22 -> moa2#1_4 -> Accept responsibility for your actions
7.88 -> moa2#1_1 -> Make independent decisions
6.47 -> idea_1 -> Is this period of your life a time of many possibilities?
6.24 -> efficacy_6 -> I can solve most problems if I invest the necessary effort.
6.08 -> efficacy_9 -> If I am in trouble, I can usually think of a solution.
6.06 -> idea_6 -> Is this period of your life a time of deciding your own beliefs and values?
5.98 -> idea_2 -> Is this period of your life a time of exploration?
5.86 -> efficacy_10 -> I can usually handle whatever comes my way.
5.85 -> idea_5 -> Is this period of your life a time of defining yourself?
5.66 -> efficacy_1 -> I can always manage to solve difficult problems if I try hard enough.

*****Topic: 1*****
**Top Questions:
6.99 -> moa2#1_4 -> Accept responsibility for your actions
6.44 -> moa2#1_1 -> Make independent decisions
5.28 -> idea_3 -> Is this period of your life a time of feeling stressed out?
5.27 

In [281]:
# # This cell will plot the number of students that end up associated with each topic
# # as the number of topics increases

# n=12

# fig, axes = plt.subplots(3, int(n/3), figsize=(19,14))

# for ax, num in zip(axes.flatten(), range(5,5+n)):
#     nmf, topic_questions, top_n_per_person = nmf_results(df, n_topics=num)
#     vals, counts = np.unique(top_n_per_person[:,0], return_counts=True)
#     plot_df = pd.DataFrame(counts, vals)
#     ax = plot_df.sort_values(0, ascending=False).plot(kind='bar', legend=False, ax=ax)
#     ax.set_title(f'Number of topics: {num}')
#     ax.set_ylabel('Number of students')

# # plt.suptitle('Students per topic over increasing number of topics', fontsize=20)
# plt.tight_layout()
# fig.savefig('../images/Students per topic over increasing number of topics.png')

In [231]:
# These are the top indices
top_n_per_person[:5]

array([[14,  0,  5],
       [11, 14,  0],
       [11,  1, 10],
       [ 0, 10, 12],
       [ 4,  1,  2]])

In [232]:
# This normalizes each row of W to percentages
W = nmf.transform(df)

normed = W / W.sum(axis=1)[:,None]
normed[:3]

array([[0.14426321, 0.02917455, 0.00935914, 0.0310487 , 0.06707158,
        0.10089733, 0.06361096, 0.05021277, 0.02028906, 0.0314057 ,
        0.05313322, 0.0832797 , 0.03682855, 0.09368168, 0.18574385],
       [0.13120387, 0.03334166, 0.00502066, 0.03330468, 0.09342924,
        0.02013273, 0.07799294, 0.01937285, 0.00930368, 0.08474768,
        0.09584007, 0.17806894, 0.05537885, 0.02520791, 0.13765424],
       [0.04445179, 0.18021275, 0.00546415, 0.00439868, 0.04181206,
        0.06102455, 0.01335613, 0.05766061, 0.01965853, 0.0546341 ,
        0.11731682, 0.22287152, 0.01339024, 0.0808742 , 0.08287386]])

In [233]:
col_idx = np.arange(W.shape[0])[:,None]

# These are the percentages associated with the top indices
top_perc_per_person = normed[col_idx, top_n_per_person]
top_perc_per_person[:3]

array([[0.18574385, 0.14426321, 0.10089733],
       [0.17806894, 0.13765424, 0.13120387],
       [0.22287152, 0.18021275, 0.11731682]])