# LDA for qualitative groups

This code generates the initial lda model that will eventually be used by the LPD. It requires a human to interpret the topics. You may want to run this multiple times until the topics become interpretable - once they do, save the lda model.

In [0]:
# libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
import random
from nltk.tokenize import word_tokenize
from sklearn.externals import joblib

**Functions**



In [0]:
# clean docs (lowercase, remove punctuation)
def clean_docs(s):
    s = s.lower()
    s = re.sub(r'[^\w]', ' ', s)
    return s

# clean and make docs
def make_docs(cols,df):
    docs = []
    rows = df.shape[0]
    small = df[cols]
    t = ["teaching","teacher","teachers"] # will reduce all instances of "teach-" to "teach" 
    
    for row in range(2,rows):
        temp = [str(i) for i in small.loc[row] if len(str(i)) > 3]
        joined = " ".join(temp)
        clean = clean_docs(joined)
        
        words = word_tokenize(clean)
        
        cleaner = [i if i not in t else "teach" for i in words]
        cleaner = " ".join(cleaner)

        docs.append(cleaner)
        
    return docs

In [0]:
# Read in the data, identify columns of interest

data = pd.read_csv("hplsurvey.csv")

# IMPORTANT: 'qual_cols' needs to be re-written with the names of the columns from the LPD
# current column names are just from qualtrics.
qual_cols = ['Q3','Q5','Q6','Q15','Q36','Q36_5_TEXT','Q46',
 'Q10',
 'Q11',
 'Q12','Q18',
 'Q19',
 'Q20']

data.reset_index(inplace=True)

# added student answers about jobs (technically multiple choice, but gives important info)
plus_job = qual_cols+['Q15','Q12','Q46']
qualitative_data = data[plus_job]

# NAs will cause errors
qualitative_data.fillna("",inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [0]:
# combine answers of all respondents so we can do LDA
docs = make_docs(list(qualitative_data.columns),qualitative_data)

In [0]:
# create a tfidf matrix (sparse word frequency matrix)
tfidf = TfidfVectorizer(max_features = 1000,stop_words = 'english')

# add words to ignore (eg, if 'education' is not ignored, it dominates all topics)
# this list of words should change each year (ie when the code is rerun for 2019, etc)
stops = list(tfidf.get_stop_words()) +['education','students','students','school','learning','learn','gt','experience',"teach","working"]
tfidf.set_params(stop_words=stops)
X = tfidf.fit_transform(docs)

In [0]:
tfidf_names = tfidf.get_feature_names()

lda = LatentDirichletAllocation(n_topics = 7, max_iter = 10, learning_method='online',learning_offset=.5)

# Because lda works on randomness, you might want to know what random seed python was using, for reproducability.
print(np.random.get_state()[1][0])
group_probs = lda.fit_transform(X)

232220589




In [0]:
# This is how you will make the group tags. Requires a human to make sense of the topics.
# This also where you may notice words you want to include in the stopword list.
topics = {}
for topic_idx, topic in enumerate(lda.components_):
    topics["Topic{}".format(topic_idx)] = " ".join([tfidf_names[i]
        for i in topic.argsort()[:-10 - 1:-1]])
topics

{'Topic0': 'facilitating creating independently people established promote helping media organization institution',
 'Topic1': 'directly creating term justice social knowledge learners equity support harvard',
 'Topic2': 'early research technology want creating based design impact like tools',
 'Topic3': 'educator cried strain background lifetime aspect potential effective slowing sustainable',
 'Topic4': 'evaluation equity colleges step leadership programs methods promoting justice making',
 'Topic5': 'directly learners making creating promoting equity producing make succeed inspiring',
 'Topic6': 'leadership want student institutions social higher ways development manage management'}

In [0]:
# save the model
# protocol 2 is compatible with python 2.x
# lda.pkl is the file name, you can point to whatever file path
joblib.dump(lda, 'lda.pkl', protocol = 2) 

['lda.pkl']