# NLP on the Teacher Educator Survey
Goals: <br/>
- What are people talking about?
    - Topic model
- Are there any differences between comments by teachers and admin?
- What is the overall sentiment of the comments?
    - Does it differ by topic?
- Is there a correlation with other survey answers?


## Preliminary Findings

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline 

In [None]:
import warnings
# Show every remaining warning only once. This catch-all filter is registered first because
# filterwarnings() inserts each new filter at the FRONT of the filter list, so the filter
# added last is the one that is matched first.
warnings.filterwarnings(action='once')
# Silence DeprecationWarning. Registered last so it takes precedence over the catch-all
# 'once' filter above (in the opposite order the 'once' filter would match first and
# DeprecationWarnings would still be shown).
warnings.filterwarnings("ignore",category=DeprecationWarning)


## Importing Survey Data

In [None]:
clean_survey_data = pd.read_stata('N:/Research and Policy/ORP_Data/Surveys/TES/Cleaned_Files/2018/2018TNEdSurveyResultsFile_7.7.18_NoDeIdentTchNum_Weights.dta')


In [None]:
clean_survey_data.head()

In [None]:
clean_survey_data.columns

## Selecting Columns to keep

In [None]:
cols_to_keep = ['tchlic', 'district_no', 'district_name', 'school_no', 'school_name','Gender', 'YrsExpr18', 'EdLevel18', 
                'Role_Compass', 'bestguess_tch', 'bestguess_admin', 'IPI_Sch', 'Tier_Sch', 'StartTime', 
                'EndTime', 'Responded', 'Q13']

In [None]:
df_selected_columns = clean_survey_data.loc[:,cols_to_keep]

In [None]:
df_selected_columns.info()

In [None]:
df_selected_columns.head()

## 71,636 surveys

### But how many answered the open-ended question (Q13)?

In [None]:
df_selected_columns.Q13.value_counts().head(15)

In [None]:
# Flag real answers to Q13: not blank, not a phrase that signifies "no answer", and longer than 50 characters
def label_q13 (row):
    """Return 1 when the row's Q13 text counts as a real answer, otherwise 0.

    A response counts only when it is not one of the known "no answer"
    phrases and it is longer than 50 characters.
    """
    no_answer_phrases = [
        'no', 'NO', 'No', 'na', 'N/A', 'no.', 'No.', 'n/a', 'Na', 'none', 'Nope.', 'None.', '#NAME?',
        'None', 'Not at this time', 'Not at this time.', 'NA', '.', 'nothing', 'Nothing', 'x',
        'not at this time', 'None at this time', 'None at this time.', 'nope',
        'No, thank you.', 'Nothing at this time', 'No thank you', 'No thank you.',
        'No!', 'not at this time.', '-', 'N.A',
    ]
    response = row['Q13']
    # Guard 1: explicit "no answer" phrases never count.
    if response in no_answer_phrases:
        return 0
    # Guard 2: anything of 50 characters or fewer is too short to count.
    if len(response) <= 50:
        return 0
    return 1

def length_response (row):
    """Return the number of characters in the row's Q13 response."""
    response_text = row['Q13']
    return len(response_text)

df_selected_columns['answered_q13'] = df_selected_columns.apply(label_q13, axis=1)
df_selected_columns['total_surveys_sent'] = df_selected_columns.shape[0]
df_selected_columns['total_number_responses'] = df_selected_columns.Responded.sum()
df_selected_columns['total_answered_q13'] = df_selected_columns.answered_q13.sum()
df_selected_columns['response_character_length'] = df_selected_columns.apply(length_response, axis = 1)

In [None]:
df_selected_columns.head(10)

- 71,636 surveys

- 40,876 responded 

- 12,175 answered Q13 (more than 50 characters)

In [None]:
print('Number of teachers surveyed: ' + str(sum(df_selected_columns['bestguess_tch'])))
print('Number of teachers responded: ' + str(sum(df_selected_columns.loc[(df_selected_columns.loc[:,'Responded'] == 1) & (df_selected_columns.loc[:,'bestguess_tch'] == 1), 'Responded'])))
print('Number of teachers answered Question 13: ' + str(sum(df_selected_columns.loc[(df_selected_columns.loc[:,'answered_q13'] == 1) & (df_selected_columns.loc[:,'bestguess_tch'] == 1), 'answered_q13'])))


In [None]:
print('Number of admins surveyed: ' + str(sum(df_selected_columns['bestguess_admin'])))
print('Number of admins responded: ' + str(sum(df_selected_columns.loc[(df_selected_columns.loc[:,'Responded'] == 1) & (df_selected_columns.loc[:,'bestguess_admin'] == 1), 'Responded'])))
print('Number of admins answered Question 13: ' + str(sum(df_selected_columns.loc[(df_selected_columns.loc[:,'answered_q13'] == 1) & (df_selected_columns.loc[:,'bestguess_admin'] == 1), 'answered_q13'])))


## Creating list of comments

In [None]:
# Converting the column of the DF with answers into a list of answers
open_ended_answers = df_selected_columns.loc[df_selected_columns['answered_q13'] == 1,:].Q13.tolist()

In [None]:
open_ended_answers[:5]

In [None]:
# Join all answers into one string
joined_answers = ' '.join(open_ended_answers)

## NLTK Tokenization

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk

In [None]:
# Turn string into lowercase
# Then tokenize into words
tokens = [w for w in word_tokenize(joined_answers.lower()) if w.isalpha()]

In [None]:
# Show first 3 tokens (words)
tokens[:3]

In [None]:
# Keep only the tokens that are not English stop words.
# The stop-word list is loaded once and stored in a set: with stopwords.words('english')
# written inside the comprehension condition, the call would be repeated for every token.
english_stops = set(stopwords.words('english'))
no_stops = [t for t in tokens
           if t not in english_stops]

In [None]:
# Initiate Word Net Lemmatizer
# Will create lemmas (word bases)
wnl = nltk.WordNetLemmatizer()

In [None]:
# For each word in no_stops list, lemmatize
lemmas =  [wnl.lemmatize(t) for t in no_stops]

In [None]:
# Counter object 
Counter(lemmas).most_common(35)

## SpaCy Sentiment Exploration

In [None]:
# Import spacy for different analysis than NLTK can provide
import spacy
from spacy import displacy

In [None]:
import en_core_web_sm

In [None]:
# Load the small English spaCy model
nlp = en_core_web_sm.load()

In [None]:
# Read first response for exploration
doc = nlp(open_ended_answers[0])

In [None]:
# Display entities from the first response
doc.ents

In [None]:
# answer_doc_list = [nlp(string) for string in open_ended_answers]

In [None]:
# Import AFINN for sentiment analysis
from afinn import Afinn
af = Afinn()

In [None]:
# For each response, give a sentiment score
sentiment_scores = [af.score(answer) for answer in open_ended_answers]

In [None]:
# Inspect first five sentiment scores
sentiment_scores[:5]

In [None]:
# Categorize the sentiment scores
# Above 0 -> Positive
# 0 -> Neutral
# Below 0 -> Negative
sentiment_category = ['positive' if score > 0 
                          else 'negative' if score < 0 
                              else 'neutral' 
                                  for score in sentiment_scores]

In [None]:
# Inspect first five sentiment categories
sentiment_category[:5]

In [None]:
# Selecting the rows from the base DF where the open ended question was answered.
# .copy() makes this an independent DataFrame, so the column that is added to it later
# (teacher_admin) is written to a real frame instead of a slice of df_selected_columns,
# which would raise pandas' SettingWithCopyWarning.
df_open_answered = df_selected_columns.loc[df_selected_columns.loc[:,'answered_q13'] == 1, :].copy()#, 'tchlic':'EndTime'] # +  'response_character_length']
                                           

In [None]:
# Inspecting the shape of the DF
df_open_answered.shape

In [None]:
# Looking at the first 5 rows
df_open_answered.head()

In [None]:
# Creating a DF with sentiment scores and categories
sentiment_df = pd.DataFrame({'sentiment_score': sentiment_scores,
                            'sentiment_category': sentiment_category})

In [None]:
# Inspecting shape to make sure it still matches up
sentiment_df.shape

In [None]:
# Inspecting first 5 rows of sentiment
sentiment_df.head()

In [None]:
# Concatenating the identification columns with the sentiment dataframe to have
# all of the info in one place 
answers_w_sentiment_df = pd.concat([df_open_answered.reset_index(drop=True), sentiment_df], axis=1)

In [None]:
# Inspecting the first 5 rows
answers_w_sentiment_df.head()

In [None]:
# Inspecting characteristics of the DF
answers_w_sentiment_df.info()

In [None]:
answers_w_sentiment_df['sentiment_category'].value_counts()

- Negative: 3169
- Neutral: 2142
- Positive: 7612

## The sentiment scores suggest the comments are mostly positive, but from reading through the responses, the majority seem to be negative
- Maybe the context is throwing off the classifier

In [None]:
# Breaking down the responses into sentences instead of classifying the text from the whole comments
# all_sentences = [str(sent) for answer in open_ended_answers for sent  in nlp(answer).sents]

In [None]:
# Looking at the first 10 sentences
# all_sentences[:10]

In [None]:
# Looking at the length of the sentences list
# len(all_sentences)

- 65,920 sentences across all the reviews

In [None]:
# Using the first 10 sentences to quickly look at how the sentiment classifier 
# classifies them
# sentiment_first_10_sentences = [af.score(sent) for sent in all_sentences[:10]]

In [None]:
# Inspecting the score of the first 10 sentences
# sentiment_first_10_sentences

The classification of the sentences appears to be better than the classification of the whole answers

In [None]:
# Scoring first 5 responses
# sentiment_first_5_answers = [af.score(sent) for sent in open_ended_answers[:5]]

In [None]:
# Looking at first 5 responses
# open_ended_answers[:5]

In [None]:
# Looking at the scores for the first 5
# sentiment_first_5_answers

## Gensim Exploration

#### Bigram creation

In [None]:
# Importing the relevant libraries
import gensim
from gensim import corpora
from pprint import pprint
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Splitting open_ended_answers list into words
# List of lists with words from each answer as elements
texts = [[text for text in doc.split()] for doc in open_ended_answers]

In [None]:
# texts[0]

In [None]:
# Creating gensim dictionary
dictionary = corpora.Dictionary(texts)

In [None]:
print(dictionary)

## 38,426 unique tokens

In [None]:
# print(dictionary.token2id)

In [None]:
# Creating corpus
# Corpus is unique ID for each word
# Tuples with format (word_id, word_frequency)
corpus = [dictionary.doc2bow(line) for line in texts]

In [None]:
# Creating bigram
# Min count = minimum number of times bigram must appear
# threshold = score threshold for forming the bigrams as scored by the gensim scorer
bigram = Phrases(texts, min_count = 3, threshold = 7)

In [None]:
# looking at bigram in first answer
print(bigram[texts[0]])

In [None]:
# Creating trigrams
trigram = Phrases(bigram[texts], threshold = 8)

In [None]:
# Looking at trigram of random response
print(trigram[bigram[texts[750]]])

## Topic Modeling

In [None]:
# Loading relevant libraries
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
import re
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# logging.root.setLevel(level=logging.INFO)
stop_words = stopwords.words('english')
stop_words = stop_words + ['com', 'edu', 'would', 'could', '.', '!', ',', ';', "n\'t", '(', ')', '?', 
                           'also', 'le', 'thank_you', 'u', 'etc']

In [None]:
# Make all words lowercase and remove punctuation
lower_no_punctuation = [[text for text in word_tokenize(doc.lower()) if text.isalpha()] for doc in open_ended_answers]

In [None]:
bigram_topics = Phrases(lower_no_punctuation, min_count = 3, threshold = 8)

In [None]:
print(bigram_topics[lower_no_punctuation[0]])

In [None]:
# Creating trigrams
trigram_topics = Phrases(bigram_topics[lower_no_punctuation], min_count = 2, threshold = 8)

In [None]:
print(trigram_topics[bigram_topics[lower_no_punctuation[0]]])

In [None]:
bigram_mod = Phraser(bigram_topics)
trigram_mod = Phraser(trigram_topics)

In [None]:
def make_bigrams(texts):
    """Run every tokenized document in ``texts`` through the global ``bigram_mod`` phraser.

    Returns a list with one phrased document per input document, in order.
    """
    phrased_docs = []
    for doc_tokens in texts:
        phrased_docs.append(bigram_mod[doc_tokens])
    return phrased_docs

def make_trigrams(texts):
    """Apply the global ``bigram_mod`` and then ``trigram_mod`` to each document in ``texts``.

    Returns a list with one phrased document per input document, in order.
    """
    phrased_docs = []
    for doc_tokens in texts:
        phrased_docs.append(trigram_mod[bigram_mod[doc_tokens]])
    return phrased_docs

In [None]:
topic_trigrams = make_trigrams(lower_no_punctuation)

In [None]:
topic_trigram_no_stops = [[text for text in doc if text not in stop_words] for doc in topic_trigrams]

In [None]:
topic_lemmatized = [[wnl.lemmatize(text) for text in doc] for doc in topic_trigram_no_stops]

In [None]:
topic_lemmatized[3]

In [None]:
# create dictionary
id2word = corpora.Dictionary(topic_lemmatized)

In [None]:
# Create Corpus
texts = topic_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
print(corpus[:1])

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=300,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
pprint(lda_model.print_topics())

In [None]:
from gensim import similarities

In [None]:
lda_index = similarities.MatrixSimilarity(corpus, num_features=len(id2word))

In [None]:
# similarities = lda_index[lda_model[id2word]]

In [None]:
# vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [None]:
from gensim import models
tfidf = models.TfidfModel(corpus)

In [None]:
tfidf

In [None]:
import pyLDAvis.gensim

In [None]:
#lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, sort_topics=False)

In [None]:
# pyLDAvis.display(lda_display)

In [None]:
def make_bigrams(texts):
    """Return each document of ``texts`` passed through the global ``bigram_mod`` phraser.

    Redefinition of the helper of the same name from earlier in this notebook.
    """
    return list(bigram_mod[doc_tokens] for doc_tokens in texts)

def make_trigrams(texts):
    """Return each document of ``texts`` passed through ``bigram_mod`` and then ``trigram_mod``.

    Both phrasers are read from the notebook's global scope. Redefinition of the
    helper of the same name from earlier in this notebook.
    """
    return list(trigram_mod[bigram_mod[doc_tokens]] for doc_tokens in texts)

def make_trigrams_mod(texts, trigram_model, bigram_model):
    """Apply ``bigram_model`` and then ``trigram_model`` to every document in ``texts``.

    Unlike ``make_trigrams``, the phrase models are passed in explicitly instead
    of being read from global variables.
    """
    phrased_docs = []
    for doc_tokens in texts:
        with_bigrams = bigram_model[doc_tokens]
        phrased_docs.append(trigram_model[with_bigrams])
    return phrased_docs

In [None]:
# Outputs list of lists with trigrams for each answer in a list
def preprocess_responses(text_list, bigram_min_count = 3, bigram_threshold = 8, trigram_min_count = 2, trigram_threshold = 8):
    """Tokenize, clean and phrase-join a list of free-text responses.

    Pipeline: lowercase and tokenize (alphabetic tokens only), drop stop words,
    lemmatize, drop stop words again, then train gensim phrase models on the
    result and apply them.

    Parameters
    ----------
    text_list : iterable of str
        The raw responses.
    bigram_min_count, bigram_threshold
        Passed to ``Phrases`` as ``min_count`` / ``threshold`` for the first (bigram) pass.
    trigram_min_count, trigram_threshold
        Passed to ``Phrases`` as ``min_count`` / ``threshold`` for the second (trigram) pass.

    Returns
    -------
    list of list of str
        One token list per response, with detected phrases joined.
    """
    # Stop words: NLTK's English list (needs stopwords from nltk.corpus) plus project extras.
    # Kept in a set so that every membership test below is a hash lookup.
    stop_words = set(stopwords.words('english'))
    stop_words.update(['would', 'could', 'also', 'le', 'thank_you', 'u', 'etc', 't',
                       'the', 'and', 'are', 'of', 'for', 'that'])
    # Make all words lowercase and remove punctuation (only alphabetic tokens are kept)
    lower_no_punctuation = [[text for text in word_tokenize(doc.lower()) if text.isalpha()] for doc in text_list]
    # Remove stop words from the raw tokens
    text_no_stops = [[text for text in doc if text not in stop_words] for doc in lower_no_punctuation]
    # Lemmatize, then filter a second time. Stop-list entries such as 'le' and 'u' are
    # presumably lemmatizer output (e.g. 'less' -> 'le', 'us' -> 'u' -- TODO confirm);
    # tokens like that only appear after this step, so a single filter before
    # lemmatization lets them through.
    text_lemmatized = [[lemma for lemma in (wnl.lemmatize(text) for text in doc)
                        if lemma not in stop_words]
                       for doc in text_no_stops]
    # models for trigrams and bigrams
    bigram_topics = Phrases(text_lemmatized, min_count = bigram_min_count, threshold = bigram_threshold)
    trigram_topics = Phrases(bigram_topics[text_lemmatized], min_count = trigram_min_count, threshold = trigram_threshold)
    # Phraser for better performance
    bigram_mod = Phraser(bigram_topics)
    trigram_mod = Phraser(trigram_topics)
    # Make trigrams (also forms bigrams in the process)
    topic_trigrams = make_trigrams_mod(text_lemmatized, trigram_mod, bigram_mod)
    return topic_trigrams
    

In [None]:
def make_lda_topic_model(preprocessed_list, num_topics = 10, chunksize = 200, passes = 10):
    """Build a gensim dictionary and bag-of-words corpus, then fit an LDA model on them.

    Parameters
    ----------
    preprocessed_list : list of list of str
        Tokenized documents.
    num_topics, chunksize, passes
        Forwarded unchanged to gensim's ``LdaModel``.

    Returns
    -------
    tuple
        ``(id2word, corpus, lda_model)``: the dictionary, the bag-of-words corpus
        and the fitted model.
    """
    token_dictionary = corpora.Dictionary(preprocessed_list)
    # Term Document Frequency: one bag-of-words per document
    bow_corpus = [token_dictionary.doc2bow(doc_tokens) for doc_tokens in preprocessed_list]
    lda_settings = dict(
        corpus=bow_corpus,
        id2word=token_dictionary,
        num_topics=num_topics,
        random_state=100,  # fixed seed
        update_every=1,
        chunksize=chunksize,
        passes=passes,
        alpha='auto',
        per_word_topics=True,
    )
    fitted_model = gensim.models.ldamodel.LdaModel(**lda_settings)
    return token_dictionary, bow_corpus, fitted_model

In [None]:
def make_topic_visual(model, corpus, dictionary):
    """Prepare a pyLDAvis visualisation for ``model`` and return the displayed result.

    ``sort_topics=False`` is passed to ``pyLDAvis.gensim.prepare``.
    """
    prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=False)
    visual = pyLDAvis.display(prepared)
    return visual

In [None]:
answers = preprocess_responses(open_ended_answers, 
                               bigram_min_count = 3, bigram_threshold = 8, 
                               trigram_min_count = 2, trigram_threshold = 3)

In [None]:
dictionary, corpus, lda_model = make_lda_topic_model(answers)

In [None]:
pprint(lda_model.print_topics())

In [None]:
# make_topic_visual(lda_model, corpus, dictionary)

In [None]:
def total_lda_vis(response_list, 
                  bigram_min_count = 3, bigram_threshold = 8, 
                  trigram_min_count = 2, trigram_threshold = 3,
                  num_topics = 7, topic_chunksize = 200, passes = 10
                 ):
    """Run the whole pipeline on raw responses: preprocess, fit LDA, visualise.

    The bigram/trigram arguments are forwarded to ``preprocess_responses``;
    ``num_topics``, ``topic_chunksize`` (as ``chunksize``) and ``passes`` are
    forwarded to ``make_lda_topic_model``. Returns whatever
    ``make_topic_visual`` returns for the fitted model.
    """
    phrase_settings = dict(
        bigram_min_count = bigram_min_count,
        bigram_threshold = bigram_threshold,
        trigram_min_count = trigram_min_count,
        trigram_threshold = trigram_threshold,
    )
    processed_docs = preprocess_responses(response_list, **phrase_settings)
    topic_dictionary, bow_corpus, fitted_lda = make_lda_topic_model(
        processed_docs, num_topics = num_topics, chunksize = topic_chunksize, passes = passes)
    return make_topic_visual(fitted_lda, bow_corpus, topic_dictionary)

In [None]:
# total_lda_vis(open_ended_answers, 
#                   bigram_min_count = 3, bigram_threshold = 3, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 6, topic_chunksize = 300, passes = 8
#                  )

## Compare Different groups in survey

In [None]:
# df_selected_columns['Role_Compass'].value_counts()

In [None]:
def teacher_admin(row):
    """Label a survey row: 'Teacher' when ``bestguess_tch`` equals 1, otherwise 'Admin'."""
    # NOTE(review): every row that is not a teacher is labelled 'Admin' without looking at
    # bestguess_admin -- confirm the data contains no other roles.
    return 'Teacher' if row['bestguess_tch'] == 1 else 'Admin'

In [None]:
df_open_answered.loc[:,'teacher_admin'] = df_open_answered.apply(teacher_admin, axis = 1)

In [None]:
df_open_answered.head()

In [None]:
df_open_answered.teacher_admin.value_counts()

In [None]:
teacher_answered_df = df_open_answered.loc[(df_open_answered['teacher_admin'] == 'Teacher'), :]

In [None]:
admin_answered_df = df_open_answered.loc[(df_open_answered['teacher_admin'] == 'Admin'), :]

In [None]:
print('Number of Teachers answering Q13: ' + str(teacher_answered_df.shape[0]))

In [None]:
print('Number of Admins answering Q13: ' + str(admin_answered_df.shape[0]))

In [None]:
teacher_comment_list = teacher_answered_df.Q13.tolist()

In [None]:
admin_comment_list = admin_answered_df.Q13.tolist()

In [None]:
admin_comment_list[:3]

## Admin Topic Model

In [None]:
# total_lda_vis(admin_comment_list, 
#                   bigram_min_count = 3, bigram_threshold = 8, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 75, passes = 8
#                  )

## Teacher Topic Model

In [None]:
# total_lda_vis(teacher_comment_list, 
#                   bigram_min_count = 3, bigram_threshold = 8, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 75, passes = 8
#                  )

## Textblob Sentiment Analysis

In [None]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [None]:
blob_list = [TextBlob(doc) for doc in open_ended_answers]

In [None]:
answer_sentiments = [blob.sentiment.polarity for blob in blob_list]

In [None]:
answer_sentiments[:5]

In [None]:
print('Average sentiment: ' + str(np.mean(answer_sentiments)))

The average sentiment seems to indicate that the reviews are positive overall

In [None]:
sent_category = ['positive' if sent > 0 else 'neutral' if sent == 0 else 'negative' for sent in answer_sentiments]

In [None]:
len(sent_category)

In [None]:
neg_inds = [True if sent < 0 else False for sent in answer_sentiments]

In [None]:
from itertools import compress

In [None]:
# Reviews marked as negative
negative_comment_list = list(compress(open_ended_answers, neg_inds))

In [None]:
textblob_sentiments_df = pd.DataFrame({'sentiment_polarity': answer_sentiments,
                                      'sentiment_category': sent_category})

In [None]:
textblob_total_df =  pd.concat([df_open_answered.reset_index(drop=True), textblob_sentiments_df], axis=1)

In [None]:
textblob_total_df.head()

In [None]:
textblob_total_df[['teacher_admin', 'sentiment_category', 'tchlic']].groupby(['teacher_admin', 'sentiment_category']).count()

In [None]:
textblob_total_df[['teacher_admin', 'sentiment_category', 'response_character_length']]\
                .groupby(['teacher_admin', 'sentiment_category']).mean().round(2)


- Neutral comments are much shorter on average than positive or negative


In [None]:
district_grouping = textblob_total_df[['district_no', 'district_name', 'sentiment_category', 'tchlic']]\
                            .groupby(['district_no', 'district_name', 'sentiment_category']).count()
district_grouping.columns = ['count']
district_grouping_pct = district_grouping.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).round(2)
district_grouping_pct.columns = ['pct']

In [None]:
district_grouping.head(10)

In [None]:
district_grouping_pct.head(10)

# Training classifier to see if the sentiments change

In [None]:
open_ended_answers[90:100]

## Topics for constructive compared with topics for destructive

In [None]:
const_comments = textblob_df_with_class.loc[textblob_df_with_class['classification'] == 'constructive', 'Q13'].tolist()

In [None]:
len(const_comments)

In [None]:
dest_comments = textblob_df_with_class.loc[textblob_df_with_class['classification'] == 'destructive', 'Q13'].tolist()

In [None]:
len(dest_comments)

In [None]:
# # Topics for comments labeled as Constructive
# total_lda_vis(const_comments, 
#                   bigram_min_count = 3, bigram_threshold = 7, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 6, topic_chunksize = 100, passes = 8
#                  )

In [None]:
# # Topics for comments labeled as Destructive
# total_lda_vis(dest_comments, 
#                   bigram_min_count = 3, bigram_threshold = 7, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 100, passes = 8
#                  )

## Notes
- Similar topics between the categories
- Portfolio occurs much more frequently in comments labeled as destructive
- Professional Development and Curriculum show up more in constructive comments
    - Request for state wide curriculum
    - Desire additional PD opportunities
- RTI appears in the constructive comments

## Constructive/destructive by district and school

### By District

In [None]:
district_class_grouping = textblob_df_with_class[['district_no', 'district_name', 'classification', 'tchlic']]\
                            .groupby(['district_no', 'district_name', 'classification']).count()
district_class_grouping.columns = ['count']
district_class_grouping_pct = district_class_grouping.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).round(2)
district_class_grouping_pct.columns = ['pct']
district_class_grouping_pct = district_class_grouping_pct.reset_index()

In [None]:
# district_class_grouping

In [None]:
district_class_grouping_pct[district_class_grouping_pct['classification'] == 'destructive']\
        .sort_values(by = ['classification', 'pct'], ascending = False)

In [None]:
textblob_df_with_class.loc[textblob_df_with_class['district_no'] == 690, 'Q13'].tolist()

In [None]:
district_class_grouping_pct[district_class_grouping_pct['classification'] == 'constructive']\
        .sort_values(by = ['classification', 'pct'], ascending = False)

## Mallet

In [None]:
import os
from gensim.models.wrappers import LdaMallet

os.environ['MALLET_HOME'] = 'C:\\Users\\ca20593\\mallet'

In [None]:
from gensim.models import CoherenceModel

In [None]:
# Path to the MALLET executable itself, not just its bin/ directory: gensim's LdaMallet
# wrapper expects mallet_path to be the path of the mallet binary.
mallet_path = 'C:/Users/ca20593/mallet/mallet-2.0.8/bin/mallet' # update this path
# dictionary, corpus, lda_model
#ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=dictionary)

In [None]:
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=dictionary)

In [None]:
pprint(ldamallet.show_topics(formatted=False))

In [None]:
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=answers, dictionary=dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', round(coherence_ldamallet, 2))

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LdaMallet model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``, so ``limit``
    itself is not included. The MALLET location is read from the global
    ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models, one per topic count tried
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list, coherence_values = [], []
    topic_counts = range(start, limit, step)
    for topic_count in topic_counts:
        mallet_model = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=topic_count, id2word=dictionary)
        model_list.append(mallet_model)
        scorer = CoherenceModel(
            model=mallet_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(scorer.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, 
                                                        texts=answers, start=5, limit=20, step=1)



In [None]:
# Show graph: coherence score for each number of topics that was tried
limit=20; start=5; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence of strings. ("coherence_values") without the trailing comma
# is just a string, so the legend would take its characters as the labels.
plt.legend(("coherence_values",), loc='best')
#plt.show()


In [None]:
coherence_values

## Years of experience comparison

In [None]:
def over_years_exp(yrs_exp):
    """Return the Q13 answers of rows in ``textblob_df_with_class`` with ``YrsExpr18 >= yrs_exp``."""
    experienced = textblob_df_with_class['YrsExpr18'] >= yrs_exp
    matching_answers = textblob_df_with_class.loc[experienced, 'Q13']
    return matching_answers.tolist()
    
def under_years_exp(yrs_exp):
    """Return the Q13 answers of rows in ``textblob_df_with_class`` with ``YrsExpr18 <= yrs_exp``."""
    less_experienced = textblob_df_with_class['YrsExpr18'] <= yrs_exp
    matching_answers = textblob_df_with_class.loc[less_experienced, 'Q13']
    return matching_answers.tolist()

In [None]:
over_25_answers = textblob_df_with_class.loc[textblob_df_with_class['YrsExpr18'] >= 25, 'Q13'].tolist()

In [None]:
len(over_25_answers)

In [None]:
# total_lda_vis(over_25_answers, 
#                   bigram_min_count = 3, bigram_threshold = 7, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 100, passes = 8
#                  )

In [None]:
under_5_years = under_years_exp(5)

In [None]:
len(under_5_years)

In [None]:
# total_lda_vis(under_5_years, 
#                   bigram_min_count = 3, bigram_threshold = 7, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 100, passes = 8
#                  )

## scikit-learn TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.pipeline import make_pipeline

In [None]:
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'), max_df = 0.85)

In [None]:
responses = tfidf.fit_transform(open_ended_answers)

In [None]:
# Vocabulary terms in column order. get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(), so use the new method when the installed version has it.
words = tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out') else tfidf.get_feature_names()

In [None]:
model = NMF(n_components=6)

In [None]:
model.fit(responses)

In [None]:
nmf_features = model.transform(responses)

In [None]:
df_features = pd.DataFrame(nmf_features)

In [None]:
df_features.head()

In [None]:
df_features.shape

In [None]:
components_df = pd.DataFrame(model.components_, columns = words)

In [None]:
components_df.shape

In [None]:
component = components_df.iloc[2,:]
print(component.nlargest(10))

In [None]:
norm_features = normalize(nmf_features)

In [None]:
norm_df = pd.DataFrame(norm_features)

In [None]:
norm_df.head()

In [None]:
norm_df.shape

In [None]:
rand_response = norm_df.iloc[8000]

In [None]:
similarities = norm_df.dot(rand_response)

In [None]:
similarities.sort_values(ascending = False).head(11)

In [None]:
similarities.sort_values(ascending = False).head(11).index.tolist()[1:]

In [None]:
print(similarities.nlargest())

In [None]:
open_ended_answers[8000]

In [None]:
open_ended_answers[11466]

In [None]:
def find_similar_responses(initial_ind,  answer_list, num_similar = 10, num_topics = 6):
    """Return the responses whose NMF topic profile is closest to one chosen response.

    The answers are TF-IDF vectorized (NLTK English stop words, ``max_df = 0.80``),
    reduced to ``num_topics`` NMF features and row-normalized; every row is then
    scored by its dot product with the row at position ``initial_ind``.

    Returns the ``num_similar + 1`` highest-scoring entries of ``answer_list``,
    best first. No row is excluded from the ranking, so the chosen response
    itself can appear in the result.
    """
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_df = 0.80)
    tfidf_matrix = vectorizer.fit_transform(answer_list)
    topic_weights = NMF(n_components=num_topics).fit_transform(tfidf_matrix)
    unit_rows = pd.DataFrame(normalize(topic_weights))
    scores = unit_rows.dot(unit_rows.iloc[initial_ind])
    ranked_positions = scores.sort_values(ascending = False).index.tolist()
    return [answer_list[pos] for pos in ranked_positions[0:num_similar+1]]

In [None]:
find_similar_responses(4657, open_ended_answers, num_similar = 3)

In [None]:
def show_topic_words(text_list, num_topics, num_words = 10):
    """Fit TF-IDF + NMF on ``text_list``, print the top words per topic, return topic weights.

    Parameters
    ----------
    text_list : list of str
        Documents to model.
    num_topics : int
        Number of NMF components.
    num_words : int
        How many of the highest-weighted words to print for each topic.

    Returns
    -------
    pandas.DataFrame
        One row per document and one column per topic, holding the
        row-normalized NMF features.
    """
    tfidf = TfidfVectorizer(stop_words=stopwords.words('english'), max_df = 0.80, ngram_range = (1,1))
    responses = tfidf.fit_transform(text_list)
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use the new method when it exists and fall back
    # to the old one on older installations.
    if hasattr(tfidf, 'get_feature_names_out'):
        words = tfidf.get_feature_names_out()
    else:
        words = tfidf.get_feature_names()
    model = NMF(n_components=num_topics)
    nmf_features = model.fit_transform(responses)
    norm_features = normalize(nmf_features)
    # One row per topic, one column per vocabulary word
    components_df = pd.DataFrame(model.components_, columns = words)
    for i in range(num_topics):
        component = components_df.iloc[i,:]
        print(component.nlargest(num_words))
        print('\n')
    normalized_df = pd.DataFrame(norm_features)
    return normalized_df

In [None]:
normalized_df = show_topic_words(open_ended_answers, 7, num_words = 10)

In [None]:
# Row labels of the 30 responses with the highest weight in topic column 2.
# NOTE(review): this ranks norm_df, not the normalized_df returned by show_topic_words in the
# preceding cell -- confirm norm_df is the intended topic matrix.
topic_2_list = norm_df.sort_values(by = 2, ascending = False).index.tolist()[:30]

In [None]:
[open_ended_answers[i] for i in topic_2_list]

In [None]:
# Row labels of the 30 responses with the highest weight in topic column 5.
# NOTE(review): the variable name says topic 0 but the rows are ranked by column 5 --
# confirm which topic is intended.
topic_0_list = norm_df.sort_values(by = 5, ascending = False).index.tolist()[:30]

In [None]:
[open_ended_answers[i] for i in topic_0_list]

In [None]:
def answers_from_topic(answer_list,normalized_df, topic_num, answer_count = 10, topic_prop = 1):
    """Return the answers that weigh most heavily on one topic.

    Rows of ``normalized_df`` whose largest topic weight is not strictly below
    ``topic_prop`` are dropped first; the remaining rows are ranked by column
    ``topic_num`` (highest first) and the first ``answer_count`` row labels are
    used to index ``answer_list``.
    """
    strongest_weight = normalized_df.max(axis = 1)
    eligible = normalized_df.loc[strongest_weight < topic_prop]
    ranked_labels = eligible.sort_values(by = topic_num, ascending = False).index.tolist()
    return [answer_list[label] for label in ranked_labels[:answer_count]]

# Widget Testing

In [None]:
from ipywidgets import Button, Layout, Box, interactive, fixed, HBox, Label

In [None]:
import ipywidgets as widgets

In [None]:
b = Button(description='(50% width, 80px height) button',
           layout=Layout(width='50%', height='80px'))
b


In [None]:
w = widgets.IntSlider()
display(w)

In [None]:
w.value

In [None]:
widgets.Dropdown(
    options=range(20),
    value=2,
    description='Number:',
    disabled=False,
)

In [None]:
# show_topic_words(text_list, num_topics, num_words = 10)
# show_topic_words(open_ended_answers, 7, num_words = 10)
y = interactive(show_topic_words, text_list = fixed(open_ended_answers),
                num_topics = range(1,21), num_words = range(5,16))

In [None]:
display(y)

In [None]:
norm_df = show_topic_words(open_ended_answers, y.children[0].value, num_words = y.children[1].value);

In [None]:
y.children[0].value

In [None]:
def answers_from_topic_widget(answer_list,normalized_df, topic_num, answer_count = 10, topic_prop = 1):
    """Display the answers that weigh most heavily on one topic (widget callback).

    Keeps the rows of ``normalized_df`` whose weight in column position
    ``topic_num`` is strictly below ``topic_prop`` (the filter looks at that one
    topic's weight, not at the row maximum), ranks them by column ``topic_num``
    (highest first) and passes the first ``answer_count`` matching entries of
    ``answer_list`` to ``display``. Nothing is returned.
    """
    topic_weights = normalized_df.iloc[:,topic_num]
    eligible = normalized_df.loc[topic_weights < topic_prop]
    ranked_labels = eligible.sort_values(by = topic_num, ascending = False).index.tolist()
    chosen_answers = [answer_list[label] for label in ranked_labels[:answer_count]]
    display( chosen_answers )

In [None]:
# answers_from_topic(answer_list,normalized_df, topic_num, answer_count = 10, topic_prop = 1)
z = interactive(answers_from_topic_widget,normalized_df = fixed(norm_df), answer_list = fixed(open_ended_answers),
                topic_num = range(0,y.children[0].value), answer_count = range(2,21), 
                topic_prop = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])

In [None]:
display(z)
# z.result