# NLP on the Teacher Educator Survey
Goals: <br/>
- What are people talking about?
    - Topic model
- Are there any differences between comments by teachers and admin?
- What is the overall sentiment of the comments?
    - Does it differ by topic?
- Is there a correlation with other survey answers?


## Preliminary Findings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline 

In [2]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings(action='once')


## Importing Survey Data

In [3]:
clean_survey_data = pd.read_stata('N:/Research and Policy/ORP_Data/Surveys/TES/Cleaned_Files/2018/2018TNEdSurveyResultsFile_7.7.18_NoDeIdentTchNum_Weights.dta')


In [4]:
clean_survey_data.head()

Unnamed: 0,tchlic,Email,sch_id,district_no,school_no,district_name,school_name,LastName,FirstName,MiddleName,...,ADM_AM_T_FINWT51,ADM_AM_T_FINWT52,ADM_AM_T_FINWT53,ADM_AM_T_FINWT54,ADM_AM_T_FINWT55,ADM_AM_T_FINWT56,ADM_AM_T_FINWT57,ADM_AM_T_FINWT58,ADM_AM_T_FINWT59,ADM_AM_T_FINWT60
0,17364,tom.zachary@claibornecsd.org,91300035,130,35,Claiborne County,Clairfield Elementary,ZACHARY,THOMAS,,...,,,,,,,,,,
1,17522,mmorrow@mauryk12.org,96000030,600,30,Maury County,Culleoka Unit School,MORROW,MARY,,...,,,,,,,,,,
2,18016,wrightj@wcde.org,99000115,900,115,Washington County,Tennessee Virtual Learning Academy,WRIGHT,JANE,G,...,,,,,,,,,,
3,20735,eleanorsatterfield@tcschools.org,98500015,850,15,Trousdale County,Jim Satterfield Middle School,SATTERFIELD,ELEANOR,W,...,,,,,,,,,,
4,32910,glenda.akin@sumnerschools.org,98300120,830,120,Sumner County,Westmoreland High School,AKIN,GLENDA,J,...,,,,,,,,,,


In [5]:
clean_survey_data.columns

Index(['tchlic', 'Email', 'sch_id', 'district_no', 'school_no',
       'district_name', 'school_name', 'LastName', 'FirstName', 'MiddleName',
       ...
       'ADM_AM_T_FINWT51', 'ADM_AM_T_FINWT52', 'ADM_AM_T_FINWT53',
       'ADM_AM_T_FINWT54', 'ADM_AM_T_FINWT55', 'ADM_AM_T_FINWT56',
       'ADM_AM_T_FINWT57', 'ADM_AM_T_FINWT58', 'ADM_AM_T_FINWT59',
       'ADM_AM_T_FINWT60'],
      dtype='object', length=1495)

## Selecting Columns to keep

In [6]:
cols_to_keep = ['tchlic', 'district_no', 'district_name', 'school_no', 'school_name','Gender', 'YrsExpr18', 'EdLevel18', 
                'Role_Compass', 'bestguess_tch', 'bestguess_admin', 'IPI_Sch', 'Tier_Sch', 'StartTime', 
                'EndTime', 'Responded', 'Q13']

In [7]:
df_selected_columns = clean_survey_data.loc[:,cols_to_keep]

In [8]:
df_selected_columns.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71636 entries, 0 to 71635
Data columns (total 17 columns):
tchlic             71636 non-null int32
district_no        71636 non-null int16
district_name      71636 non-null object
school_no          71636 non-null int16
school_name        71636 non-null object
Gender             71636 non-null object
YrsExpr18          70664 non-null float64
EdLevel18          71636 non-null object
Role_Compass       71636 non-null object
bestguess_tch      71636 non-null int8
bestguess_admin    71636 non-null int8
IPI_Sch            71636 non-null int8
Tier_Sch           71636 non-null category
StartTime          40876 non-null datetime64[ns]
EndTime            40876 non-null datetime64[ns]
Responded          71636 non-null int8
Q13                71636 non-null object
dtypes: category(1), datetime64[ns](2), float64(1), int16(2), int32(1), int8(4), object(6)
memory usage: 6.4+ MB


In [9]:
df_selected_columns.head()

Unnamed: 0,tchlic,district_no,district_name,school_no,school_name,Gender,YrsExpr18,EdLevel18,Role_Compass,bestguess_tch,bestguess_admin,IPI_Sch,Tier_Sch,StartTime,EndTime,Responded,Q13
0,17364,130,Claiborne County,35,Clairfield Elementary,M,31.0,MA+,Principal,0,1,0,K-8,2018-03-06 07:40:00,2018-03-06 08:10:00,1,No
1,17522,600,Maury County,30,Culleoka Unit School,F,,MA+,,1,0,0,K-12,NaT,NaT,0,
2,18016,900,Washington County,115,Tennessee Virtual Learning Academy,F,,MA+,,1,0,0,9 - 12,2018-03-08 07:43:00,2018-03-08 09:07:00,1,
3,20735,850,Trousdale County,15,Jim Satterfield Middle School,F,46.0,MA,Teacher,1,0,0,5 - 8,2018-03-06 08:28:00,2018-03-06 08:45:00,1,
4,32910,830,Sumner County,120,Westmoreland High School,F,54.0,MA+,Teacher,1,0,0,9 - 12,NaT,NaT,0,


## 71,636 surveys

### But how many answered the open ended question (Q13)

In [10]:
df_selected_columns.Q13.value_counts().head(15)

                      56969
no                      461
No                      376
N/A                     201
n/a                     118
none                     64
None                     63
Not at this time.        62
No.                      59
NA                       57
na                       41
Not at this time         37
.                        29
None at this time.       24
not at this time         23
Name: Q13, dtype: int64

In [11]:
# Remove blank or answers which signify no answer
def label_q13 (row):
    if row['Q13'] not in ['no', 'NO', 'No', 'na', 'N/A', 'no.', 'No.', 'n/a', 'Na', 'none', 'Nope.', 'None.', '#NAME?', 
                          'None', 'Not at this time', 'Not at this time.', 'NA', '.', 'nothing', 'Nothing', 'x', 
                          'not at this time', 'None at this time', 'None at this time.', 'nope',
                         'No, thank you.', 'Nothing at this time', 'No thank you', 'No thank you.',
                         'No!', 'not at this time.', '-', 'N.A'] and len(row['Q13']) > 50 :
          return 1
    return 0

def length_response (row):
    return len(row['Q13'])

df_selected_columns['answered_q13'] = df_selected_columns.apply(label_q13, axis=1)
df_selected_columns['total_surveys_sent'] = df_selected_columns.shape[0]
df_selected_columns['total_number_responses'] = df_selected_columns.Responded.sum()
df_selected_columns['total_answered_q13'] = df_selected_columns.answered_q13.sum()
df_selected_columns['response_character_length'] = df_selected_columns.apply(length_response, axis = 1)

In [12]:
df_selected_columns.head(10)

Unnamed: 0,tchlic,district_no,district_name,school_no,school_name,Gender,YrsExpr18,EdLevel18,Role_Compass,bestguess_tch,...,Tier_Sch,StartTime,EndTime,Responded,Q13,answered_q13,total_surveys_sent,total_number_responses,total_answered_q13,response_character_length
0,17364,130,Claiborne County,35,Clairfield Elementary,M,31.0,MA+,Principal,0,...,K-8,2018-03-06 07:40:00,2018-03-06 08:10:00,1,No,0,71636,40876,12175,2
1,17522,600,Maury County,30,Culleoka Unit School,F,,MA+,,1,...,K-12,NaT,NaT,0,,0,71636,40876,12175,0
2,18016,900,Washington County,115,Tennessee Virtual Learning Academy,F,,MA+,,1,...,9 - 12,2018-03-08 07:43:00,2018-03-08 09:07:00,1,,0,71636,40876,12175,0
3,20735,850,Trousdale County,15,Jim Satterfield Middle School,F,46.0,MA,Teacher,1,...,5 - 8,2018-03-06 08:28:00,2018-03-06 08:45:00,1,,0,71636,40876,12175,0
4,32910,830,Sumner County,120,Westmoreland High School,F,54.0,MA+,Teacher,1,...,9 - 12,NaT,NaT,0,,0,71636,40876,12175,0
5,34347,180,Cumberland County,25,Cumberland County High School,F,52.0,MA,Teacher,1,...,9 - 12,2018-03-06 19:14:00,2018-03-06 20:09:00,1,Each year there seems to be more added to the ...,1,71636,40876,12175,766
6,35973,500,Lawrence County,40,Lawrence Co High School,F,39.0,MA+,Teacher,1,...,9 - 12,2018-03-13 14:06:00,2018-03-13 14:33:00,1,No,0,71636,40876,12175,2
7,36435,320,Hamblen County,35,Whitesburg Elementary,M,53.0,MA+,Principal,0,...,K-5,2018-03-06 08:10:00,2018-03-06 08:53:00,1,Give TCAP test in all grades. Give pre- and po...,1,71636,40876,12175,448
8,37252,420,Houston County,5,Erin Elementary,F,52.0,MA+,Teacher,1,...,K-5,2018-03-07 09:02:00,2018-03-07 09:24:00,1,,0,71636,40876,12175,0
9,38023,150,Cocke County,15,Cocke Co High School,F,51.0,MA,Teacher,1,...,9 - 12,NaT,NaT,0,,0,71636,40876,12175,0


- 71,636 surveys

- 40,876 responded 

- 12,175 answered Q13 (at least 50 characters)

In [13]:
print('Number of teachers surveyed: ' + str(sum(df_selected_columns['bestguess_tch'])))
print('Number of teachers responded: ' + str(sum(df_selected_columns.loc[(df_selected_columns.loc[:,'Responded'] == 1) & (df_selected_columns.loc[:,'bestguess_tch'] == 1), 'Responded'])))
print('Number of teachers answered Question 13: ' + str(sum(df_selected_columns.loc[(df_selected_columns.loc[:,'answered_q13'] == 1) & (df_selected_columns.loc[:,'bestguess_tch'] == 1), 'answered_q13'])))


Number of teachers surveyed: 67483
Number of teachers responded: 38665
Number of teachers answered Question 13: 11652


In [14]:
print('Number of admins surveyed: ' + str(sum(df_selected_columns['bestguess_admin'])))
print('Number of admins responded: ' + str(sum(df_selected_columns.loc[(df_selected_columns.loc[:,'Responded'] == 1) & (df_selected_columns.loc[:,'bestguess_admin'] == 1), 'Responded'])))
print('Number of admins answered Question 13: ' + str(sum(df_selected_columns.loc[(df_selected_columns.loc[:,'answered_q13'] == 1) & (df_selected_columns.loc[:,'bestguess_admin'] == 1), 'answered_q13'])))


Number of admins surveyed: 3113
Number of admins responded: 1791
Number of admins answered Question 13: 477


## Creating list of comments

In [15]:
# Converting the column of the DF with answers into a list of answers
open_ended_answers = df_selected_columns.loc[df_selected_columns['answered_q13'] == 1,:].Q13.tolist()

In [16]:
open_ended_answers[:5]

['Each year there seems to be more added to the teachers plate, making it more difficult to focus on teaching.  Teachers seem to be more stressed than ever before.  The constant changing of testing and evaluations have certainly contributed to this stress.  The state testing seems to be unfair in many ways.  Some teachers have never taught a tested subject and have not had the stress that the tested teachers have.  At the same time, their evaluations are determined, in part, by the success or lack of success of other teachers which is unfair to them.  Some tested subjects are scheduled for a full year, allowing sufficient time to cover all of the state standards while other teachers of tested subjects have only a semester to teach a similar set of standards.',
 'Give TCAP test in all grades. Give pre- and post in k. Give post in every other grade. This would give a more accurate growth score for all teachers. If growth scores are important then give teachers their OWN SCORES. Giving te

In [17]:
# Join all answers into one string
joined_answers = ' '.join(open_ended_answers)

## NLTK Tokenization

In [18]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""


In [19]:
# Turn string into lowercase
# Then tokenize into words
tokens = [w for w in word_tokenize(joined_answers.lower()) if w.isalpha()]

In [20]:
# Show first 3 tokens (words)
tokens[:3]

['each', 'year', 'there']

In [21]:
# for each word, check if it is in the list of English stop words
# Only keep the words that are not in the list of stopwords
no_stops = [t for t in tokens
           if t not in stopwords.words('english')]

In [22]:
# Initiate Word Net Lemmatizer
# Will create lemmas (word bases)
wnl = nltk.WordNetLemmatizer()

In [23]:
# For each word in no_stops list, lemmatize
lemmas =  [wnl.lemmatize(t) for t in no_stops]

In [24]:
# Counter object 
Counter(lemmas).most_common(35)

[('student', 16219),
 ('teacher', 14853),
 ('school', 10224),
 ('time', 7900),
 ('need', 6553),
 ('year', 5870),
 ('test', 5406),
 ('state', 4527),
 ('testing', 4182),
 ('feel', 3591),
 ('would', 3491),
 ('standard', 3302),
 ('teaching', 3037),
 ('many', 2912),
 ('teach', 2907),
 ('much', 2883),
 ('like', 2765),
 ('work', 2703),
 ('classroom', 2579),
 ('one', 2569),
 ('grade', 2559),
 ('score', 2518),
 ('level', 2458),
 ('education', 2453),
 ('also', 2397),
 ('get', 2271),
 ('day', 2253),
 ('evaluation', 2218),
 ('district', 2199),
 ('way', 2195),
 ('child', 2071),
 ('make', 2020),
 ('u', 1975),
 ('take', 1908),
 ('know', 1823)]

## SpaCy Sentiment Exploration

In [26]:
# Import spacy for different analysis than NLTK can provide
import spacy
from spacy import displacy

In [27]:
import en_core_web_sm

In [28]:
# Load small English 
nlp = en_core_web_sm.load()

In [29]:
# Read first response for exploration
doc = nlp(open_ended_answers[0])

In [30]:
# Display entities from the first response
doc.ents

(Each year, a full year)

In [31]:
# answer_doc_list = [nlp(string) for string in open_ended_answers]

In [32]:
# Import AFINN for sentiment analysis
from afinn import Afinn
af = Afinn()

In [33]:
# For each reponse, give a sentiment score
sentiment_scores = [af.score(answer) for answer in open_ended_answers]

In [34]:
# Inspect first five sentiment scores
sentiment_scores[:5]

[-5.0, 14.0, 0.0, 3.0, 0.0]

In [35]:
# Categorize the sentiment scores
# Above 0 -> Positive
# 0 -> Neutral
# Below 0 -> Negative
sentiment_category = ['positive' if score > 0 
                          else 'negative' if score < 0 
                              else 'neutral' 
                                  for score in sentiment_scores]

In [36]:
# Inspect first five sentiment categories
sentiment_category[:5]

['negative', 'positive', 'neutral', 'positive', 'neutral']

In [37]:
# Selecting columns from the base DF where the open ended question was answered
df_open_answered = df_selected_columns.loc[df_selected_columns.loc[:,'answered_q13'] == 1, :]#, 'tchlic':'EndTime'] # +  'response_character_length']
                                           

In [38]:
# Inspecting the shape of the DF
df_open_answered.shape

(12175, 22)

In [39]:
# Looking at the first 5 rows
df_open_answered.head()

Unnamed: 0,tchlic,district_no,district_name,school_no,school_name,Gender,YrsExpr18,EdLevel18,Role_Compass,bestguess_tch,...,Tier_Sch,StartTime,EndTime,Responded,Q13,answered_q13,total_surveys_sent,total_number_responses,total_answered_q13,response_character_length
5,34347,180,Cumberland County,25,Cumberland County High School,F,52.0,MA,Teacher,1,...,9 - 12,2018-03-06 19:14:00,2018-03-06 20:09:00,1,Each year there seems to be more added to the ...,1,71636,40876,12175,766
7,36435,320,Hamblen County,35,Whitesburg Elementary,M,53.0,MA+,Principal,0,...,K-5,2018-03-06 08:10:00,2018-03-06 08:53:00,1,Give TCAP test in all grades. Give pre- and po...,1,71636,40876,12175,448
17,43604,70,Campbell County,75,Jellico High School,F,51.0,BA,Teacher,1,...,9 - 12,2018-03-08 13:23:00,2018-03-08 14:03:00,1,Testing has replaced learning and teaching. It...,1,71636,40876,12175,93
27,48309,630,Montgomery County,19,Kenwood High,M,39.0,MA+,Teacher,1,...,9 - 12,2018-03-06 05:53:00,2018-03-20 06:09:00,1,Testing has become the driving force of educat...,1,71636,40876,12175,225
30,49926,80,Cannon County,30,Short Mountain Elementary,M,50.0,MA,Principal,0,...,K-8,2018-03-13 09:53:00,2018-03-13 10:51:00,1,The principal evaluation is too wordy and comp...,1,71636,40876,12175,244


In [40]:
# Creating a DF with sentiment scores and categories
sentiment_df = pd.DataFrame({'sentiment_score': sentiment_scores,
                            'sentiment_category': sentiment_category})

In [41]:
# Inspecting shape to make sure it still matches up
sentiment_df.shape

(12175, 2)

In [42]:
# Inspecting first 5 rows of sentiment
sentiment_df.head()

Unnamed: 0,sentiment_score,sentiment_category
0,-5.0,negative
1,14.0,positive
2,0.0,neutral
3,3.0,positive
4,0.0,neutral


In [43]:
# Concatenating the indentification columns with the sentiment dataframe to have
# all of the info in one place 
answers_w_sentiment_df = pd.concat([df_open_answered.reset_index(drop=True), sentiment_df], axis=1)

In [44]:
# Inspecting the first 5 rows
answers_w_sentiment_df.head()

Unnamed: 0,tchlic,district_no,district_name,school_no,school_name,Gender,YrsExpr18,EdLevel18,Role_Compass,bestguess_tch,...,EndTime,Responded,Q13,answered_q13,total_surveys_sent,total_number_responses,total_answered_q13,response_character_length,sentiment_score,sentiment_category
0,34347,180,Cumberland County,25,Cumberland County High School,F,52.0,MA,Teacher,1,...,2018-03-06 20:09:00,1,Each year there seems to be more added to the ...,1,71636,40876,12175,766,-5.0,negative
1,36435,320,Hamblen County,35,Whitesburg Elementary,M,53.0,MA+,Principal,0,...,2018-03-06 08:53:00,1,Give TCAP test in all grades. Give pre- and po...,1,71636,40876,12175,448,14.0,positive
2,43604,70,Campbell County,75,Jellico High School,F,51.0,BA,Teacher,1,...,2018-03-08 14:03:00,1,Testing has replaced learning and teaching. It...,1,71636,40876,12175,93,0.0,neutral
3,48309,630,Montgomery County,19,Kenwood High,M,39.0,MA+,Teacher,1,...,2018-03-20 06:09:00,1,Testing has become the driving force of educat...,1,71636,40876,12175,225,3.0,positive
4,49926,80,Cannon County,30,Short Mountain Elementary,M,50.0,MA,Principal,0,...,2018-03-13 10:51:00,1,The principal evaluation is too wordy and comp...,1,71636,40876,12175,244,0.0,neutral


In [45]:
# Inspecting characteristics of the DF
answers_w_sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12175 entries, 0 to 12174
Data columns (total 24 columns):
tchlic                       12175 non-null int32
district_no                  12175 non-null int16
district_name                12175 non-null object
school_no                    12175 non-null int16
school_name                  12175 non-null object
Gender                       12175 non-null object
YrsExpr18                    12121 non-null float64
EdLevel18                    12175 non-null object
Role_Compass                 12175 non-null object
bestguess_tch                12175 non-null int8
bestguess_admin              12175 non-null int8
IPI_Sch                      12175 non-null int8
Tier_Sch                     12175 non-null category
StartTime                    12175 non-null datetime64[ns]
EndTime                      12175 non-null datetime64[ns]
Responded                    12175 non-null int8
Q13                          12175 non-null object
answered_q13    

In [46]:
answers_w_sentiment_df['sentiment_category'].value_counts()

positive    7346
negative    3072
neutral     1757
Name: sentiment_category, dtype: int64

- Negative: 3169
- Neutral: 2142
- Positive: 7612

## The reviews seem to be pretty positive. But from reading through the responses, the majority seem to be negative
- Maybe the context is throwing off the classifier

In [47]:
# Breaking down the responses into sentences instead of classifying the text from the whole comments
# all_sentences = [str(sent) for answer in open_ended_answers for sent  in nlp(answer).sents]

In [48]:
# Looking at the first 10 sentences
# all_sentences[:10]

In [49]:
# Looking at the length of the sentences list
# len(all_sentences)

- 65,920 sentences across all the reviews

In [50]:
# Using the first 10 sentences to quickly look at how the sentiment classifier 
# classifies them
# sentiment_first_10_sentences = [af.score(sent) for sent in all_sentences[:10]]

In [51]:
# Inspecting the score of the first 10 sentences
# sentiment_first_10_sentences

The classification of the sentences appears to be better than the classification of the the whole answers

In [52]:
# Scoring first 5 responses
# sentiment_first_5_answers = [af.score(sent) for sent in open_ended_answers[:5]]

In [53]:
# Looking at first 5 responses
# open_ended_answers[:5]

In [54]:
# Looking at the scores for the first 5
# sentiment_first_5_answers

## Gensim Exploration

#### Bigram creation

In [55]:
# Importing the relevant libraries
import gensim
from gensim import corpora
from pprint import pprint
from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, Phraser

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [56]:
# Splitting open_ended_answers list into words
# List of lists with words from each answer as elements
texts = [[text for text in doc.split()] for doc in open_ended_answers]

In [57]:
# texts[0]

In [58]:
# Creating gensim dictionary
dictionary = corpora.Dictionary(texts)

In [59]:
print(dictionary)

Dictionary(38426 unique tokens: ['At', 'Each', 'Some', 'Teachers', 'The']...)


## 38,426 unique tokens

In [60]:
# print(dictionary.token2id)

In [61]:
# Creating corpus
# Corpus is unique ID for each word
# Tuples with format (word_id, word_frequency)
corpus = [dictionary.doc2bow(line) for line in texts]

In [62]:
# Creating bigram
# Min count = minimum number of times bigram must appear
# threshold = score threshold for forming the bigrams as scored by the gensim scorer
bigram = Phrases(texts, min_count = 3, threshold = 7)

In [63]:
# looking at bigram in first answer
print(bigram[texts[0]])

['Each_year', 'there', 'seems', 'to', 'be', 'more', 'added', 'to', 'the', 'teachers', 'plate,', 'making', 'it', 'more', 'difficult', 'to', 'focus_on', 'teaching.', 'Teachers', 'seem', 'to', 'be', 'more', 'stressed', 'than_ever', 'before.', 'The', 'constant', 'changing', 'of', 'testing', 'and', 'evaluations', 'have', 'certainly', 'contributed', 'to', 'this', 'stress.', 'The', 'state_testing', 'seems', 'to', 'be', 'unfair', 'in', 'many_ways.', 'Some', 'teachers', 'have', 'never_taught', 'a', 'tested_subject', 'and', 'have', 'not', 'had', 'the', 'stress', 'that', 'the', 'tested', 'teachers', 'have.', 'At', 'the_same', 'time,', 'their', 'evaluations', 'are', 'determined,', 'in', 'part,', 'by', 'the', 'success', 'or', 'lack_of', 'success', 'of', 'other', 'teachers', 'which', 'is_unfair', 'to', 'them.', 'Some', 'tested_subjects', 'are', 'scheduled', 'for', 'a', 'full', 'year,', 'allowing', 'sufficient_time', 'to', 'cover_all', 'of', 'the', 'state', 'standards', 'while_other', 'teachers', 'of

In [64]:
# Creating trigrams
trigram = Phrases(bigram[texts], threshold = 8)

In [65]:
# Looking at trigram of random response
print(trigram[bigram[texts[750]]])

['We_have', 'one', 'of', 'the', 'best', 'principals', 'I', 'have', 'ever_had.', 'I_would_prefer', 'that', 'the', 'stakes', 'of', 'the', 'state_test', 'not', 'be', 'so', 'high.', 'There_is_too_much', 'pressure_on', 'students_who', 'cannot_keep', 'pace', 'in', 'the', 'curriculum.', 'Extended', 'resource', 'would_be', 'very_helpful', 'for', 'those', 'few', 'students_who', 'do_not', 'have', 'the', 'intellectual', 'maturity/capacity', 'to', 'keep_pace', 'with', 'the', 'general_population.', 'Students', 'performing', '2', 'to', '4_years', 'below', 'their_peers', 'need', 'extended', 'time', 'to', 'close', 'the', 'gap', 'in', 'reading', 'and', 'math.']


## Topic Modeling

In [66]:
# Loading relevant libraries
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess, lemmatize
import re
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# logging.root.setLevel(level=logging.INFO)
stop_words = stopwords.words('english')
stop_words = stop_words + ['com', 'edu', 'would', 'could', '.', '!', ',', ';', "n\'t", '(', ')', '?', 
                           'also', 'le', 'thank_you', 'u', 'etc']

In [67]:
# Make all words lowercase and remove punctuation
lower_no_punctuation = [[text for text in word_tokenize(doc.lower()) if text.isalpha()] for doc in open_ended_answers]

In [68]:
bigram_topics = Phrases(lower_no_punctuation, min_count = 3, threshold = 8)

In [69]:
print(bigram_topics[lower_no_punctuation[0]])

['each_year', 'there', 'seems', 'to', 'be', 'more', 'added', 'to', 'the', 'teachers', 'plate', 'making', 'it', 'more', 'difficult', 'to', 'focus_on', 'teaching', 'teachers', 'seem', 'to', 'be', 'more', 'stressed', 'than_ever', 'before', 'the', 'constant', 'changing', 'of', 'testing', 'and', 'evaluations', 'have', 'certainly', 'contributed', 'to', 'this', 'stress', 'the', 'state', 'testing', 'seems', 'to', 'be', 'unfair', 'in', 'many', 'ways', 'some', 'teachers', 'have', 'never_taught', 'a', 'tested_subject', 'and', 'have', 'not', 'had', 'the', 'stress', 'that', 'the', 'tested', 'teachers', 'have', 'at', 'the', 'same', 'time', 'their', 'evaluations', 'are', 'determined', 'in', 'part', 'by', 'the', 'success', 'or', 'lack_of', 'success', 'of', 'other', 'teachers', 'which', 'is', 'unfair', 'to', 'them', 'some', 'tested_subjects', 'are', 'scheduled', 'for', 'a', 'full', 'year', 'allowing', 'sufficient', 'time', 'to', 'cover', 'all', 'of', 'the', 'state', 'standards', 'while', 'other', 'teac

In [70]:
# Creating trigrams
trigram_topics = Phrases(bigram_topics[lower_no_punctuation], min_count = 2, threshold = 8)

In [71]:
print(trigram_topics[bigram_topics[lower_no_punctuation[0]]])

['each_year', 'there_seems', 'to', 'be', 'more', 'added', 'to', 'the', 'teachers', 'plate', 'making', 'it', 'more', 'difficult', 'to', 'focus_on', 'teaching', 'teachers', 'seem', 'to', 'be', 'more', 'stressed', 'than_ever_before', 'the', 'constant_changing', 'of', 'testing', 'and', 'evaluations', 'have', 'certainly', 'contributed', 'to', 'this', 'stress', 'the', 'state_testing', 'seems', 'to', 'be', 'unfair', 'in', 'many_ways', 'some', 'teachers', 'have_never_taught', 'a_tested_subject', 'and', 'have', 'not', 'had', 'the', 'stress', 'that', 'the', 'tested', 'teachers', 'have', 'at', 'the', 'same', 'time', 'their', 'evaluations', 'are', 'determined', 'in', 'part', 'by', 'the', 'success', 'or', 'lack_of', 'success', 'of', 'other', 'teachers', 'which', 'is', 'unfair', 'to', 'them', 'some', 'tested_subjects', 'are', 'scheduled', 'for', 'a', 'full', 'year', 'allowing', 'sufficient', 'time', 'to', 'cover_all', 'of', 'the', 'state', 'standards', 'while', 'other', 'teachers', 'of', 'tested_sub

In [72]:
bigram_mod = Phraser(bigram_topics)
trigram_mod = Phraser(trigram_topics)

In [73]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [74]:
topic_trigrams = make_trigrams(lower_no_punctuation)

In [75]:
topic_trigram_no_stops = [[text for text in doc if text not in stop_words] for doc in topic_trigrams]

In [76]:
topic_lemmatized = [[wnl.lemmatize(text) for text in doc] for doc in topic_trigram_no_stops]

In [77]:
topic_lemmatized[3]

['testing_has_become',
 'driving_force',
 'education',
 'educating',
 'student',
 'have_been',
 'education',
 'year',
 'have_seen',
 'student',
 'level',
 'learning',
 'decrease',
 'please_consider',
 'mean',
 'reverse']

In [78]:
# create dictionary
id2word = corpora.Dictionary(topic_lemmatized)

In [79]:
# Create Corpus
texts = topic_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [80]:
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 2), (29, 1), (30, 2), (31, 1), (32, 1), (33, 6), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 1), (40, 2), (41, 2), (42, 1)]]


In [81]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=300,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [82]:
pprint(lda_model.print_topics())

[(0,
  '0.098*"student" + 0.033*"need" + 0.019*"test" + 0.018*"standard" + '
  '0.018*"teach" + 0.014*"testing" + 0.012*"time" + 0.010*"class" + '
  '0.009*"year" + 0.008*"get"'),
 (1,
  '0.030*"rti" + 0.027*"i_feel_like" + 0.024*"minute" + 0.016*"run" + '
  '0.015*"science" + 0.013*"social_studies" + 0.011*"group" + 0.011*"master" + '
  '0.009*"past" + 0.009*"instructional"'),
 (2,
  '0.058*"time" + 0.022*"work" + 0.017*"portfolio" + 0.017*"new" + 0.015*"day" '
  '+ 0.015*"this_year" + 0.014*"required" + 0.014*"training" + '
  '0.013*"expectation" + 0.012*"plan"'),
 (3,
  '0.039*"beneficial" + 0.022*"general_education" + 0.019*"changing" + '
  '0.016*"team_rubric" + 0.013*"business" + 0.013*"student_behavior" + '
  '0.012*"open" + 0.011*"disciplinary" + 0.010*"possibly" + 0.009*"same_way"'),
 (4,
  '0.131*"school" + 0.034*"district" + 0.019*"administration" + '
  '0.017*"support" + 0.016*"need" + 0.014*"work" + 0.014*"parent" + '
  '0.013*"help" + 0.010*"administrator" + 0.009*"often"

In [83]:
from gensim import similarities

In [84]:
lda_index = similarities.MatrixSimilarity(corpus, num_features=len(id2word))

In [85]:
# similarities = lda_index[lda_model[id2word]]

In [86]:
# vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [87]:
from gensim import models
tfidf = models.TfidfModel(corpus)

In [88]:
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x1abc16b0668>

In [89]:
import pyLDAvis.gensim

In [90]:
#lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, sort_topics=False)

In [91]:
# pyLDAvis.display(lda_display)

In [92]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def make_trigrams_mod(texts, trigram_model, bigram_model):
    return [trigram_model[bigram_model[doc]] for doc in texts]

In [93]:
# Outputs list of lists with trigrams for each answer in a list
def preprocess_responses(text_list, bigram_min_count = 3, bigram_threshold = 8, trigram_min_count = 2, trigram_threshold = 8):
    # Make all words lowercase and remove punctuation
    lower_no_punctuation = [[text for text in word_tokenize(doc.lower()) if text.isalpha()] for doc in text_list]
    # Establish stop words
    # Needs stopwords from nltk.corpus
    stop_words = stopwords.words('english')
    stop_words = stop_words + ['would', 'could', 'also', 'le', 'thank_you', 'u', 'etc', 't',
                              'the', 'and', 'are', 'of', 'for', 'that']
    # remove stop words
    text_no_stops = [[text for text in doc if text not in stop_words] for doc in lower_no_punctuation]
    # Lemmatize
    text_lemmatized = [[wnl.lemmatize(text) for text in doc] for doc in text_no_stops]
    # models for trigrams and bigrams
    bigram_topics = Phrases(text_lemmatized, min_count = bigram_min_count, threshold = bigram_threshold)
    trigram_topics = Phrases(bigram_topics[text_lemmatized], min_count = trigram_min_count, threshold = trigram_threshold)
    # Phraser for better performance
    bigram_mod = Phraser(bigram_topics)
    trigram_mod = Phraser(trigram_topics)
    # Make trigrams (also forms bigrams in the process)
    topic_trigrams = make_trigrams_mod(text_lemmatized, trigram_mod, bigram_mod)
    return topic_trigrams
    

In [94]:
def make_lda_topic_model(preprocessed_list, num_topics = 10, chunksize = 200, passes = 10):
    # create dictionary
    id2word = corpora.Dictionary(preprocessed_list)
    # Create Corpus
    texts = preprocessed_list
    corpus = [id2word.doc2bow(text) for text in texts] # Term Document Frequency
    # Created model with gensim LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=chunksize,
                                           passes=passes,
                                           alpha='auto',
                                           per_word_topics=True)
    return id2word, corpus, lda_model

In [95]:
def make_topic_visual(model, corpus, dictionary):
    lda_display = pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=False)
    return pyLDAvis.display(lda_display)

In [96]:
answers = preprocess_responses(open_ended_answers, 
                               bigram_min_count = 3, bigram_threshold = 8, 
                               trigram_min_count = 2, trigram_threshold = 3)

In [97]:
dictionary, corpus, lda_model = make_lda_topic_model(answers)

In [98]:
pprint(lda_model.print_topics())

[(0,
  '0.028*"child" + 0.018*"training" + 0.017*"lot" + 0.015*"plan" + '
  '0.013*"special_education" + 0.012*"pay" + 0.011*"show" + '
  '0.011*"opportunity" + 0.010*"academic" + 0.008*"team"'),
 (1,
  '0.037*"counselor" + 0.022*"school_counselor" + 0.018*"beneficial" + '
  '0.017*"mentor" + 0.016*"next_year" + 0.015*"model" + 0.012*"came" + '
  '0.012*"struggling" + 0.011*"absolutely" + 0.011*"need_support"'),
 (2,
  '0.034*"truly" + 0.023*"career" + 0.021*"certain" + 0.015*"outside" + '
  '0.015*"within_school" + 0.014*"social_study" + 0.012*"supported" + '
  '0.012*"art" + 0.011*"considered" + 0.011*"pulled"'),
 (3,
  '0.041*"portfolio" + 0.024*"lesson" + 0.020*"back" + 0.019*"hour" + '
  '0.017*"planning_time" + 0.011*"book" + 0.010*"amazing" + '
  '0.010*"love_school" + 0.009*"run" + 0.009*"nice"'),
 (4,
  '0.052*"county" + 0.040*"fair" + 0.038*"pd" + 0.034*"another" + '
  '0.022*"completely" + 0.018*"reflect" + 0.014*"whether" + 0.013*"cause" + '
  '0.012*"rule" + 0.011*"conside

In [99]:
# make_topic_visual(lda_model, corpus, dictionary)

In [100]:
def total_lda_vis(response_list, 
                  bigram_min_count = 3, bigram_threshold = 8, 
                  trigram_min_count = 2, trigram_threshold = 3,
                  num_topics = 7, topic_chunksize = 200, passes = 10
                 ):
    trigram_list = preprocess_responses(response_list, 
                                        bigram_min_count = bigram_min_count, bigram_threshold = bigram_threshold, 
                                        trigram_min_count = trigram_min_count, trigram_threshold = trigram_threshold)
    dictionary, corpus, lda_model = make_lda_topic_model(trigram_list, 
                                                         num_topics = num_topics, chunksize = topic_chunksize, passes = passes)
    return make_topic_visual(lda_model, corpus, dictionary)

In [101]:
# total_lda_vis(open_ended_answers, 
#                   bigram_min_count = 3, bigram_threshold = 3, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 6, topic_chunksize = 300, passes = 8
#                  )

## Compare Different groups in survey

In [102]:
# df_selected_columns['Role_Compass'].value_counts()

In [103]:
def teacher_admin(row):
    if row['bestguess_tch'] == 1:
        return 'Teacher'
    else:
        return 'Admin'

In [104]:
df_open_answered.loc[:,'teacher_admin'] = df_open_answered.apply(teacher_admin, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)


In [105]:
df_open_answered.head()

Unnamed: 0,tchlic,district_no,district_name,school_no,school_name,Gender,YrsExpr18,EdLevel18,Role_Compass,bestguess_tch,...,StartTime,EndTime,Responded,Q13,answered_q13,total_surveys_sent,total_number_responses,total_answered_q13,response_character_length,teacher_admin
5,34347,180,Cumberland County,25,Cumberland County High School,F,52.0,MA,Teacher,1,...,2018-03-06 19:14:00,2018-03-06 20:09:00,1,Each year there seems to be more added to the ...,1,71636,40876,12175,766,Teacher
7,36435,320,Hamblen County,35,Whitesburg Elementary,M,53.0,MA+,Principal,0,...,2018-03-06 08:10:00,2018-03-06 08:53:00,1,Give TCAP test in all grades. Give pre- and po...,1,71636,40876,12175,448,Admin
17,43604,70,Campbell County,75,Jellico High School,F,51.0,BA,Teacher,1,...,2018-03-08 13:23:00,2018-03-08 14:03:00,1,Testing has replaced learning and teaching. It...,1,71636,40876,12175,93,Teacher
27,48309,630,Montgomery County,19,Kenwood High,M,39.0,MA+,Teacher,1,...,2018-03-06 05:53:00,2018-03-20 06:09:00,1,Testing has become the driving force of educat...,1,71636,40876,12175,225,Teacher
30,49926,80,Cannon County,30,Short Mountain Elementary,M,50.0,MA,Principal,0,...,2018-03-13 09:53:00,2018-03-13 10:51:00,1,The principal evaluation is too wordy and comp...,1,71636,40876,12175,244,Admin


In [106]:
df_open_answered.teacher_admin.value_counts()

Teacher    11652
Admin        523
Name: teacher_admin, dtype: int64

In [107]:
teacher_answered_df = df_open_answered.loc[(df_open_answered['teacher_admin'] == 'Teacher'), :]

In [108]:
admin_answered_df = df_open_answered.loc[(df_open_answered['teacher_admin'] == 'Admin'), :]

In [109]:
print('Number of Teachers answering Q13: ' + str(teacher_answered_df.shape[0]))

Number of Teachers answering Q13: 11652


In [110]:
print('Number of Admins answering Q13: ' + str(admin_answered_df.shape[0]))

Number of Admins answering Q13: 523


In [111]:
teacher_comment_list = teacher_answered_df.Q13.tolist()

In [112]:
admin_comment_list = admin_answered_df.Q13.tolist()

In [113]:
admin_comment_list[:3]

['Give TCAP test in all grades. Give pre- and post in k. Give post in every other grade. This would give a more accurate growth score for all teachers. If growth scores are important then give teachers their OWN SCORES. Giving teachers growth from other grade levels cannot be accurate. Growth scores count 35%of teacher evaluation.If teachers have their  own scores they will work harder and smarter. Portfolios cannot be accurate for growth scores.',
 'The principal evaluation is too wordy and complex.  There\'s too much "evidence" required. We spend too much time documenting some indicator (for example, 58 pieces of evidence for Standard A). We don\'t have enough local training on the process.',
 'If schools are to be graded on state standards then why is there not a state-wide curriculum that teachers are expected to teach instead of having 135 + school systems design their own  plus their own pacing guides.  The changes that have taken place over the last four years in testing at the 

## Admin Topic Model

In [114]:
# total_lda_vis(admin_comment_list, 
#                   bigram_min_count = 3, bigram_threshold = 8, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 75, passes = 8
#                  )

## Teacher Topic Model

In [115]:
# total_lda_vis(teacher_comment_list, 
#                   bigram_min_count = 3, bigram_threshold = 8, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 75, passes = 8
#                  )

## Textblob Sentiment Analysis

In [116]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [117]:
blob_list = [TextBlob(doc) for doc in open_ended_answers]

In [118]:
answer_sentiments = [blob.sentiment.polarity for blob in blob_list]

In [119]:
answer_sentiments[:5]

[0.06931216931216931, 0.29500000000000004, 0.0, 0.0, 0.016666666666666673]

In [120]:
print('Average sentiment: ' + str(np.mean(answer_sentiments)))

Average sentiment: 0.13773875368173114


The average sentiment seems to indicate that the reviews are positive overall

In [121]:
sent_category = ['positive' if sent > 0 else 'neutral' if sent == 0 else 'negative' for sent in answer_sentiments]

In [122]:
len(sent_category)

12175

In [123]:
neg_inds = [True if sent < 0 else False for sent in answer_sentiments]

In [124]:
from itertools import compress

In [125]:
# Reviews marked as negative
negative_comment_list = list(compress(open_ended_answers, neg_inds))

In [126]:
textblob_sentiments_df = pd.DataFrame({'sentiment_polarity': answer_sentiments,
                                      'sentiment_category': sent_category})

In [127]:
textblob_total_df =  pd.concat([df_open_answered.reset_index(drop=True), textblob_sentiments_df], axis=1)

In [128]:
textblob_total_df.head()

Unnamed: 0,tchlic,district_no,district_name,school_no,school_name,Gender,YrsExpr18,EdLevel18,Role_Compass,bestguess_tch,...,Responded,Q13,answered_q13,total_surveys_sent,total_number_responses,total_answered_q13,response_character_length,teacher_admin,sentiment_polarity,sentiment_category
0,34347,180,Cumberland County,25,Cumberland County High School,F,52.0,MA,Teacher,1,...,1,Each year there seems to be more added to the ...,1,71636,40876,12175,766,Teacher,0.069312,positive
1,36435,320,Hamblen County,35,Whitesburg Elementary,M,53.0,MA+,Principal,0,...,1,Give TCAP test in all grades. Give pre- and po...,1,71636,40876,12175,448,Admin,0.295,positive
2,43604,70,Campbell County,75,Jellico High School,F,51.0,BA,Teacher,1,...,1,Testing has replaced learning and teaching. It...,1,71636,40876,12175,93,Teacher,0.0,neutral
3,48309,630,Montgomery County,19,Kenwood High,M,39.0,MA+,Teacher,1,...,1,Testing has become the driving force of educat...,1,71636,40876,12175,225,Teacher,0.0,neutral
4,49926,80,Cannon County,30,Short Mountain Elementary,M,50.0,MA,Principal,0,...,1,The principal evaluation is too wordy and comp...,1,71636,40876,12175,244,Admin,0.016667,positive


In [129]:
textblob_total_df[['teacher_admin', 'sentiment_category', 'tchlic']].groupby(['teacher_admin', 'sentiment_category']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,tchlic
teacher_admin,sentiment_category,Unnamed: 2_level_1
Admin,negative,72
Admin,neutral,38
Admin,positive,413
Teacher,negative,2154
Teacher,neutral,763
Teacher,positive,8735


In [130]:
textblob_total_df[['teacher_admin', 'sentiment_category', 'response_character_length']]\
                .groupby(['teacher_admin', 'sentiment_category']).mean().round(2)


Unnamed: 0_level_0,Unnamed: 1_level_0,response_character_length
teacher_admin,sentiment_category,Unnamed: 2_level_1
Admin,negative,403.85
Admin,neutral,139.29
Admin,positive,498.52
Teacher,negative,444.12
Teacher,neutral,146.25
Teacher,positive,548.02


- Neutral comments are much shorter on average than positive or negative


In [131]:
district_grouping = textblob_total_df[['district_no', 'district_name', 'sentiment_category', 'tchlic']]\
                            .groupby(['district_no', 'district_name', 'sentiment_category']).count()
district_grouping.columns = ['count']
district_grouping_pct = district_grouping.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).round(2)
district_grouping_pct.columns = ['pct']

In [132]:
district_grouping.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
district_no,district_name,sentiment_category,Unnamed: 3_level_1
10,Anderson County,negative,14
10,Anderson County,neutral,5
10,Anderson County,positive,72
11,Clinton City,negative,5
11,Clinton City,positive,24
12,Oak Ridge City,negative,7
12,Oak Ridge City,neutral,5
12,Oak Ridge City,positive,53
20,Bedford County,negative,24
20,Bedford County,neutral,9


In [133]:
district_grouping_pct.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pct
district_no,district_name,sentiment_category,Unnamed: 3_level_1
10,Anderson County,negative,15.38
10,Anderson County,neutral,5.49
10,Anderson County,positive,79.12
11,Clinton City,negative,17.24
11,Clinton City,positive,82.76
12,Oak Ridge City,negative,10.77
12,Oak Ridge City,neutral,7.69
12,Oak Ridge City,positive,81.54
20,Bedford County,negative,19.2
20,Bedford County,neutral,7.2


# Training classifier to see if the sentiments change

In [134]:
open_ended_answers[90:100]

['The amount of time for testing to reach the actual student testing day for TCAP is absurd and harmful to all involved. Data never arrives that can be effectively used.  Students and parents see the state testing as ridiculous and an unnecessary stressor to children.   A better system must be developed soon or the state of public education will collapse as funding for testing seems to precede funding for special education needs and improving school infrastructure.   We are losing the children in public education and democracy is failing.',
 'Instructional coaching is new to our district this year and new to me and our teachers. I am working a 100 day contract. We have worked hard to define and communicate the role of the instructional coach and have made some progress.  The NW Tennessee CORE office has organized a Coaching Collaborative.  Sessions have been helpful on curricular issues but I would like to see more focus on coaching models and the challenge of working with adult learne

## Topics for constructive compared with topics for destructive

In [162]:
const_comments = textblob_df_with_class.loc[textblob_df_with_class['classification'] == 'constructive', 'Q13'].tolist()

In [163]:
len(const_comments)

7094

In [164]:
dest_comments = textblob_df_with_class.loc[textblob_df_with_class['classification'] == 'destructive', 'Q13'].tolist()

In [165]:
len(dest_comments)

3971

In [166]:
# # Topics for comments labeled as Constructive
# total_lda_vis(const_comments, 
#                   bigram_min_count = 3, bigram_threshold = 7, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 6, topic_chunksize = 100, passes = 8
#                  )

In [167]:
# # Topics for comments labeled as Destructive
# total_lda_vis(dest_comments, 
#                   bigram_min_count = 3, bigram_threshold = 7, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 100, passes = 8
#                  )

## Notes
- Similar topics between the categories
- Portfolio occurs much more frequently in comments labeled as destructive
- Professional Development and Curriculum show up more in constructive comments
    - Request for state wide curriculum
    - Desire additional PD opportunities
- RTI appears in the constructive comments

## Constructive/destructive by district and school

### By District

In [168]:
district_class_grouping = textblob_df_with_class[['district_no', 'district_name', 'classification', 'tchlic']]\
                            .groupby(['district_no', 'district_name', 'classification']).count()
district_class_grouping.columns = ['count']
district_class_grouping_pct = district_class_grouping.groupby(level=0).apply(lambda x:
                                                 100 * x / float(x.sum())).round(2)
district_class_grouping_pct.columns = ['pct']
district_class_grouping_pct = district_class_grouping_pct.reset_index()

In [169]:
# district_class_grouping

In [170]:
district_class_grouping_pct[district_class_grouping_pct['classification'] == 'destructive']\
        .sort_values(by = ['classification', 'pct'], ascending = False)

Unnamed: 0,district_no,district_name,classification,pct
130,274,Bradford,destructive,75.00
345,810,Stewart County,destructive,55.56
166,371,Rogersville City,destructive,53.85
271,660,Obion County,destructive,52.50
50,97,West Carroll SSD,destructive,50.00
231,542,Etowah,destructive,50.00
183,410,Hickman County,destructive,48.98
277,670,Overton County,destructive,47.50
171,390,Henderson County,destructive,47.17
75,160,Coffee County,destructive,47.14


In [171]:
textblob_df_with_class.loc[textblob_df_with_class['district_no'] == 690, 'Q13'].tolist()

['RTI seems to be perhaps a waste of time. Salary increases need to show up in teacher paychecks.',
 'RTI needs less paperwork, more supply funds, and more teacher training.',
 'I think we need to get back to the basics in education.  Money is wasted on testing, testing, and more testing. Many people that make decisions that relate to classroom teaching have never even taught in a classroom. I feel instead of helping education all of the changes are actually hurting our public school systems. Teacher morale is lower, and student morale is lower.   It seems to me that things change about every 4-5 years in our education system, but instead of helping the changes are only hurting.  We need to make sure our students can read, write, and have work ethics but also create an atmosphere that they enjoy coming to learn instead of focusing on just being TEST ready!!',
 'RTI at the high school level is not effective. They are often the ones who lack motivation. They are embarrassed by the proces

In [172]:
district_class_grouping_pct[district_class_grouping_pct['classification'] == 'constructive']\
        .sort_values(by = ['classification', 'pct'], ascending = False)

Unnamed: 0,district_no,district_name,classification,pct
39,90,Carroll County,constructive,100.00
48,95,South Carroll SSD,constructive,100.00
203,480,Lake County,constructive,87.50
154,340,Hancock County,constructive,85.71
89,172,Bells City,constructive,83.33
244,581,Richard City,constructive,77.78
404,960,West Tennessee School for the Deaf,constructive,75.00
238,570,Madison County,constructive,73.58
279,680,Perry County,constructive,72.22
55,101,Elizabethton City,constructive,71.43


## Mallet

In [173]:
import os
from gensim.models.wrappers import LdaMallet

os.environ['MALLET_HOME'] = 'C:\\Users\\ca20593\\mallet'

In [174]:
from gensim.models import CoherenceModel

In [175]:
mallet_path = 'C:/Users/ca20593/mallet/mallet-2.0.8/bin/' # update this path
# dictionary, corpus, lda_model
#ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=dictionary)

In [176]:
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=dictionary)

CalledProcessError: Command 'C:/Users/ca20593/mallet/mallet-2.0.8/bin/ import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input C:\Users\ca20593\AppData\Local\Temp\783f14_corpus.txt --output C:\Users\ca20593\AppData\Local\Temp\783f14_corpus.mallet' returned non-zero exit status 1.

In [None]:
pprint(ldamallet.show_topics(formatted=False))

In [None]:
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=answers, dictionary=dictionary, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', round(coherence_ldamallet, 2))

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, 
                                                        texts=answers, start=5, limit=20, step=1)



In [None]:
# Show graph
limit=20; start=5; step=1;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
#plt.show()


In [None]:
coherence_values

## Years of experience comparison

In [177]:
def over_years_exp(yrs_exp):
    return textblob_df_with_class.loc[textblob_df_with_class['YrsExpr18'] >= yrs_exp, 'Q13'].tolist()
    
def under_years_exp(yrs_exp):
    return textblob_df_with_class.loc[textblob_df_with_class['YrsExpr18'] <= yrs_exp, 'Q13'].tolist()

In [178]:
over_25_answers = textblob_df_with_class.loc[textblob_df_with_class['YrsExpr18'] >= 25, 'Q13'].tolist()

In [179]:
len(over_25_answers)

1612

In [180]:
# total_lda_vis(over_25_answers, 
#                   bigram_min_count = 3, bigram_threshold = 7, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 100, passes = 8
#                  )

In [181]:
under_5_years = under_years_exp(5)

In [182]:
len(under_5_years)

3317

In [183]:
# total_lda_vis(under_5_years, 
#                   bigram_min_count = 3, bigram_threshold = 7, 
#                   trigram_min_count = 2, trigram_threshold = 3,
#                   num_topics = 5, topic_chunksize = 100, passes = 8
#                  )

## Sci Kit Learn TF-IDF

In [184]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.pipeline import make_pipeline

In [185]:
tfidf = TfidfVectorizer(stop_words=stopwords.words('english'), max_df = 0.85)

In [186]:
responses = tfidf.fit_transform(open_ended_answers)

  return matrix(data, dtype=dtype, copy=False)


In [187]:
words = tfidf.get_feature_names()

In [188]:
model = NMF(n_components=6)

In [189]:
model.fit(responses)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=6, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [190]:
nmf_features = model.transform(responses)

In [191]:
df_features = pd.DataFrame(nmf_features)

In [192]:
df_features.head()

Unnamed: 0,0,1,2,3,4,5
0,4.2e-05,0.037974,0.023696,0.049943,0.025681,0.031519
1,0.0,0.04095,0.0,0.001481,0.074911,0.013166
2,0.0,0.050798,0.0,0.0,0.002171,0.067886
3,0.0,0.028419,0.0,0.0,0.0,0.039809
4,0.0,0.006281,0.039661,0.0,0.042445,0.0


In [193]:
df_features.shape

(12175, 6)

In [194]:
components_df = pd.DataFrame(model.components_, columns = words)

In [195]:
components_df.shape

(6, 14581)

In [196]:
component = components_df.iloc[2,:]
print(component.nlargest(10))

time             2.266682
planning         1.054926
much             0.521203
day              0.387277
meetings         0.371132
away             0.340419
spend            0.334770
plan             0.328954
spent            0.304794
instructional    0.300805
Name: 2, dtype: float64


In [197]:
norm_features = normalize(nmf_features)

In [198]:
norm_df = pd.DataFrame(norm_features)

In [199]:
norm_df.head()

Unnamed: 0,0,1,2,3,4,5
0,0.000541,0.4842,0.302146,0.636811,0.327444,0.401883
1,0.0,0.473983,0.0,0.017139,0.867077,0.152396
2,0.0,0.598923,0.0,0.0,0.025599,0.800398
3,0.0,0.581017,0.0,0.0,0.0,0.813891
4,0.0,0.107491,0.678786,0.0,0.726427,0.0


In [200]:
norm_df.shape

(12175, 6)

In [201]:
rand_response = norm_df.iloc[8000]

In [202]:
similarities = norm_df.dot(rand_response)

In [203]:
similarities.sort_values(ascending = False).head(11)

8000     1.000000
11466    0.999542
11167    0.998364
10764    0.998089
3433     0.998007
7984     0.997892
10744    0.997710
8922     0.997561
9106     0.995727
5299     0.995473
6819     0.994592
dtype: float64

In [204]:
similarities.sort_values(ascending = False).head(11).index.tolist()[1:]

[11466, 11167, 10764, 3433, 7984, 10744, 8922, 9106, 5299, 6819]

In [205]:
print(similarities.nlargest())

8000     1.000000
11466    0.999542
11167    0.998364
10764    0.998089
3433     0.998007
dtype: float64


In [206]:
open_ended_answers[8000]

'The "PDP" and "PLC" times required by the schools in our district seems to vary from school to school, and from district to district. It also remains unclear which activities fall under which heading. I need clear, concise guidelines for what is required. I find them to be a confusing mess.'

In [207]:
open_ended_answers[11466]

'For our roles as school counselors, at each level, to be more defined and concrete. In addition to doing completely away with the term "guidance".'

In [208]:
def find_similar_responses(initial_ind,  answer_list, num_similar = 10, num_topics = 6):
    tfidf = TfidfVectorizer(stop_words=stopwords.words('english'), max_df = 0.80)
    responses = tfidf.fit_transform(answer_list)
    model = NMF(n_components=num_topics)
    nmf_features = model.fit_transform(responses)
    norm_features = normalize(nmf_features)
    normalized_df = pd.DataFrame(norm_features)
    response = normalized_df.iloc[initial_ind]
    similar = normalized_df.dot(response)
    similar_ind_list = similar.sort_values(ascending = False).index.tolist()[0:num_similar+1]
    return [answer_list[ind] for ind in similar_ind_list]

In [209]:
find_similar_responses(4657, open_ended_answers, num_similar = 3)

  return matrix(data, dtype=dtype, copy=False)


['Teacher evaluations should not be linked to standardized testing.',
 'TOO MUCH TESTING!!! TCAP testing this year is spread over a three week period and scores are going to be terrible. The students are tired of testing as it is and will shut down. I do not feel that this will be a fair reflection of my teaching but I will be held responsible as it will reflect in my growth score. Please do something to change the testing process.',
 'The current way we conduct EOC testing is seriously flawed. If you want to test student Growth then you need a pre and post test. Administer a test in September and another one at the end of the year. Compare those scores. That will show true growth.',
 'State testing is not effective to either the teacher nor the student.']

In [210]:
def show_topic_words(text_list, num_topics, num_words = 10):
    tfidf = TfidfVectorizer(stop_words=stopwords.words('english'), max_df = 0.80, ngram_range = (1,1))
    responses = tfidf.fit_transform(text_list)
    words = tfidf.get_feature_names()
    model = NMF(n_components=num_topics)
    nmf_features = model.fit_transform(responses)
    norm_features = normalize(nmf_features)
    components_df = pd.DataFrame(model.components_, columns = words)
    for i in range(num_topics):
        component = components_df.iloc[i,:]
        print(component.nlargest(num_words))
        print('\n')
    normalized_df = pd.DataFrame(norm_features)
    return normalized_df

In [211]:
normalized_df = show_topic_words(open_ended_answers, 7, num_words = 10)

  return matrix(data, dtype=dtype, copy=False)


school            2.583833
district          0.703513
administration    0.635292
principal         0.602883
feel              0.543723
staff             0.529662
would             0.528359
like              0.490054
work              0.485041
year              0.476969
Name: 0, dtype: float64


testing         1.625615
test            1.357972
state           0.710178
tests           0.641387
much            0.617745
scores          0.460736
standardized    0.451074
year            0.405998
students        0.389972
act             0.318562
Name: 1, dtype: float64


time             2.325598
planning         1.128552
much             0.516198
day              0.414653
meetings         0.402015
spend            0.349627
plan             0.347304
away             0.315902
instructional    0.301159
spent            0.297191
Name: 2, dtype: float64


standards      1.868959
teach          0.636659
grade          0.611109
social         0.457163
studies        0.447330
math           0.37055

In [212]:
topic_2_list = norm_df.sort_values(by = 2, ascending = False).index.tolist()[:30]

In [213]:
[open_ended_answers[i] for i in topic_2_list]

['The schedule shows adequate planning time, but because of meetings, PLC, data meetings, IEP meetings, parent meetings, etc. we have very little actual planning time.',
 'Planning time should be planning time. IEP Meetings along with other meetings should not occur during this time.',
 'More money, more planning time, money for instructional supplies',
 'RTI takes up a lot of my time.  It could be more beneficial for me to spend more time on actual coaching.',
 'Too much time spent on paperwork, planning, lesson plans and not enough time working on objectives.',
 'I think that planning time should be set aside as just that "planning time." Teachers use planning time for meetings, to make copies, etc. and the true lesson planning is done on our own time.',
 'I think instructional time should be protected. I have a difficult time with missed instructional time.',
 'TIme is a big issue.  RTI takes away from my curriculum.  Planning time is  with my lunch time so it is sufficient.',
 'Com

In [214]:
topic_0_list = norm_df.sort_values(by = 5, ascending = False).index.tolist()[:30]

In [215]:
[open_ended_answers[i] for i in topic_0_list]

['I think Tennessee is beginning to move in the right direction by raising the expectations for all students.  I am concerned about Special Education students only having two paths  1) regular ed diploma 2) special ed diploma.  Many of our sped students need a third option that falls in between.',
 'Class sizes should not be higher in special education classroom than regular education classrooms.',
 'There is almost no parental support for the students or teachers.  My parents or students do not appreciate the value of education.',
 'We need to be allowed to discipline students regardless of their background to discourage acting out and being disrespectful.',
 "I'm not sure current inclusion practices of including certain special education and ELL students benefit anyone, especially the students.  Vocational classes need to be increased since not everyone will be attending college.",
 'Parents should be held as accountable for their children as the teachers are.',
 'Improved pay scales

In [216]:
def answers_from_topic(answer_list,normalized_df, topic_num, answer_count = 10, topic_prop = 1):
    df_topics = normalized_df.loc[normalized_df.max(axis = 1) < topic_prop]
    topic_list = df_topics.sort_values(by = topic_num, ascending = False).index.tolist()[:answer_count]
    return [answer_list[i] for i in topic_list]

# Widget Testing

In [217]:
from ipywidgets import Button, Layout, Box, interactive, fixed, HBox, Label

In [218]:
import ipywidgets as widgets

In [219]:
b = Button(description='(50% width, 80px height) button',
           layout=Layout(width='50%', height='80px'))
b


Button(description='(50% width, 80px height) button', layout=Layout(height='80px', width='50%'), style=ButtonS…

In [220]:
w = widgets.IntSlider()
display(w)

IntSlider(value=0)

In [221]:
w.value

0

In [222]:
widgets.Dropdown(
    options=range(20),
    value=2,
    description='Number:',
    disabled=False,
)

Dropdown(description='Number:', index=2, options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…

In [223]:
# show_topic_words(text_list, num_topics, num_words = 10)
# show_topic_words(open_ended_answers, 7, num_words = 10)
y = interactive(show_topic_words, text_list = fixed(open_ended_answers),
                num_topics = range(1,21), num_words = range(5,16))

In [224]:
display(y)

interactive(children=(Dropdown(description='num_topics', options=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…

In [230]:
norm_df = show_topic_words(open_ended_answers, y.children[0].value, num_words = y.children[1].value);

  return matrix(data, dtype=dtype, copy=False)


school            2.585341
district          0.703922
administration    0.635661
principal         0.603234
feel              0.544036
staff             0.529971
would             0.528667
like              0.490338
work              0.485320
year              0.477245
love              0.461502
schools           0.448626
great             0.417684
Name: 0, dtype: float64


testing         1.625954
test            1.358256
state           0.710326
tests           0.641523
much            0.617874
scores          0.460833
standardized    0.451168
year            0.406082
students        0.390071
act             0.318629
student         0.257939
results         0.247611
end             0.246816
Name: 1, dtype: float64


time             2.329260
planning         1.130328
much             0.517011
day              0.415305
meetings         0.402648
spend            0.350178
plan             0.347850
away             0.316399
instructional    0.301633
spent            0.297659
enough      

In [231]:
y.children[0].value

7

In [227]:
def answers_from_topic_widget(answer_list,normalized_df, topic_num, answer_count = 10, topic_prop = 1):
    # df_topics = normalized_df.loc[normalized_df.max(axis = 1) < topic_prop]
    df_topics = normalized_df.loc[normalized_df.iloc[:,topic_num] < topic_prop]
    topic_list = df_topics.sort_values(by = topic_num, ascending = False).index.tolist()[:answer_count]
    display( [answer_list[i] for i in topic_list] )

In [228]:
# answers_from_topic(answer_list,normalized_df, topic_num, answer_count = 10, topic_prop = 1)
z = interactive(answers_from_topic_widget,normalized_df = fixed(norm_df), answer_list = fixed(open_ended_answers),
                topic_num = range(0,y.children[0].value), answer_count = range(2,21), 
                topic_prop = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1])

In [229]:
display(z)
# z.result

interactive(children=(Dropdown(description='topic_num', options=(0, 1, 2, 3, 4, 5, 6), value=0), Dropdown(desc…