In [1]:
import pandas as pd
import numpy as np
import altair as alt
from preprocessing import *
import spacy
nlp = spacy.load("en_core_web_sm")
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [2]:
# load pre survey data for 2021 
post_df = pd.read_csv("../data/processed/post_survey.csv", sep=",", encoding='cp1252')

In [3]:
post_df.head()

Unnamed: 0,ID,Start time,Completion time,Email,Name,workshop_interest,workshop_LOU,workshop_relevance,workshop_org,prac_q,...,facilitator_q_a,facilitator_experience,facilitator_subject_knowledge,attention,content_pace,facilitator_Comments,slack_recommendations,subscription,general_comments,PD_preferred_time
0,1,4-28-22 11:02:00,4-28-22 11:04:46,anonymous,,Agree,Agree,Agree,Agree,Agree,...,Agree,Agree,Agree,Agree,Just right,,It was fine / good,"Yes, I will subscribe via PMV's website to rec...",Thank you and it was a pleasure knowing you al...,Weekdays (Monday to Friday)\n
1,2,4-28-22 11:01:44,4-28-22 11:05:26,anonymous,,Strongly agree,Strongly agree,Agree,Strongly agree,Agree,...,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Just right,Awesome!,I honestly did use Slack I used the chat and t...,"Maybe, I will think about it;",,Weekdays (Monday to Friday)\n
2,3,4-28-22 11:01:41,4-28-22 11:07:05,anonymous,,Agree,Agree,Agree,Agree,Agree,...,Agree,Agree,Agree,Agree,Just right,nothing :),I didn't use slack a lot because our organizat...,"Maybe, I will think about it;Yes, I will subsc...",,Weekdays (Monday to Friday)\n
3,4,4-28-22 11:01:46,4-28-22 11:07:08,anonymous,,Agree,Agree,Agree,Neutral,Agree,...,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Just right,can't think of anything right now,I didn't have time to explore slack,"No, not at this time;",thanks again!,Weekdays (Monday to Friday)\n
4,5,4-28-22 11:02:10,4-28-22 11:10:02,anonymous,,Strongly agree,Agree,Agree,Strongly agree,Agree,...,Strongly Agree,Strongly Agree,Strongly Agree,Strongly Agree,Just right,I enjoyed the sense of camaraderie that was cr...,I did not like and have never used SLACK plus ...,"Maybe, I will think about it;",I enjoyed the sense of camaraderie that was cr...,Weekdays (Monday to Friday)\n


In [4]:
# 8
post_df.improvements_content.unique()

array([nan, 'Maybe some fun graphics', 'I don;t have any reccomendations',
       'it seems like most of the workshop participants work for larger organizations - not sure if that is the case - but I had a bit of a hard time applying some of the ideas to my workplace - there are only 2 part time employees here. But I will definitely take away the broader ideas and try to apply them.',
       'None at this time...',
       'Hands on using Project Management Software',
       'I noticed that the slides that were provided were not exactly what David was presenting, new info was added after. It would be nice to have the updated slides to follow on, for those who make notes. ',
       'Could be helpful to create documentation (Project Charter, WBS, Gantt Chart etc.) based on a sample project, either in break out rooms or together in the larger group.',
       'not sure if the workshop materials can be sent via Zoom - only need to log into Zoom to access the workshop and materials instead of

In [8]:
q8_df = pd.DataFrame()
q8_df = post_df[["improvements_content"]]

q8_df = q8_df.dropna(subset=['improvements_content'])
q8_df["pp_text"] = q8_df["improvements_content"].apply(preprocess)

q8_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q8_df["pp_text"])
]
q8_df

Unnamed: 0,improvements_content,pp_text
1,Maybe some fun graphics,fun graphic
2,I don;t have any reccomendations,reccomendation
3,it seems like most of the workshop participant...,like workshop participant work large organizat...
4,None at this time...,time
5,Hands on using Project Management Software,hand project management software
6,I noticed that the slides that were provided w...,notice slide provide david present new info ad...
7,Could be helpful to create documentation (Proj...,helpful create documentation project charter w...
8,not sure if the workshop materials can be sent...,sure workshop material send zoom need log zoom...
9,Would be great to have the chance to mock plan...,great chance mock plan event begin end
10,More break out sessions probably.,break session


In [9]:
# Create a vocabulary for the lda model and
# convert our corpus into document-term matrix for Lda
preprocessed_corpus = [doc.split() for doc in q8_df["pp_text"].tolist()]
dictionary = corpora.Dictionary(preprocessed_corpus)
# Optional
# dictionary.filter_extremes(no_below=3, no_above=0.3, keep_n=100000)
len(dictionary)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]
# print(doc_term_matrix[0])
d = [(dictionary[idx], freq) for (idx, freq) in doc_term_matrix[0]]
pd.DataFrame(d, columns=["word", "frequency"])


from gensim.models import CoherenceModel

K = [3, 4, 5, 6, 7, 8]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=preprocessed_corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())
    
coherence_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
coherence_df

Unnamed: 0,Coherence score
3,0.368943
4,0.584488
5,0.596475
6,0.558417
7,0.649528
8,0.588762


In [10]:
lda7 = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=5,
    random_state=42,
    passes=17,
)
topic_table_8 = pd.DataFrame(lda7.print_topics(), columns=["Topic id", "Topic words"])
topic_table_8

Unnamed: 0,Topic id,Topic words
0,0,"0.013*""think"" + 0.013*""dynamic"" + 0.013*""creat..."
1,1,"0.059*""think"" + 0.032*""graphic"" + 0.032*""fun"" ..."
2,2,"0.056*""slide"" + 0.031*""notice"" + 0.031*""presen..."
3,3,"0.060*""workshop"" + 0.060*""zoom"" + 0.060*""time""..."
4,4,"0.089*""project"" + 0.033*""break"" + 0.033*""large..."


In [11]:
lda7.print_topics()

[(0,
  '0.013*"think" + 0.013*"dynamic" + 0.013*"create" + 0.013*"example" + 0.013*"activity" + 0.013*"discussion" + 0.013*"mean" + 0.013*"breakout" + 0.013*"fine" + 0.013*"room"'),
 (1,
  '0.059*"think" + 0.032*"graphic" + 0.032*"fun" + 0.032*"work" + 0.032*"content" + 0.032*"you" + 0.032*"room" + 0.032*"fill" + 0.032*"curse" + 0.032*"template"'),
 (2,
  '0.056*"slide" + 0.031*"notice" + 0.031*"present" + 0.031*"follow" + 0.031*"add" + 0.031*"new" + 0.031*"david" + 0.031*"nice" + 0.031*"info" + 0.031*"provide"'),
 (3,
  '0.060*"workshop" + 0.060*"zoom" + 0.060*"time" + 0.042*"idea" + 0.042*"apply" + 0.042*"sure" + 0.042*"material" + 0.042*"hard" + 0.023*"case" + 0.023*"organization"'),
 (4,
  '0.089*"project" + 0.033*"break" + 0.033*"large" + 0.033*"etc" + 0.033*"sample" + 0.033*"charter" + 0.033*"chart" + 0.033*"base" + 0.033*"documentation" + 0.033*"gantt"')]

In [12]:
topic_table_8['Topics']=['Dynamic activities in break out rooms, use project management software','add fun graphics','need to update slides to follow through', 
                         'send materials via zoom instead of slack', 'create one project together to work through as a group, have discussion templates']

In [13]:
topic_table_8 = topic_table_8.drop(columns=['Topic words', 'Topic id'])
topic_table_8
topic_table_8.to_csv('../results/tables/LDA_8.csv', index=False)

Unnamed: 0,Topics
0,"Dynamic activities in break out rooms, use pro..."
1,add fun graphics
2,need to update slides to follow through
3,send materials via zoom instead of slack
4,create one project together to work through as...


In [14]:
topic_table_8.to_csv('../results/tables/LDA_8.csv', index=False)

In [45]:
#11
post_df.comments_content.unique()

array([nan, 'Great presenters that made sure you  understood',
       "can't think of anything right now",
       'I would recommend this workshop and your team to others...',
       'Everything is simple and very easy to understand',
       'offer a refresher course for those who have taken it in the past. ',
       'Nil',
       'The content was useful, it was great to get all the templates that we can use for our next project. \n',
       'I think it is good, I di not have any trouble having it online. '],
      dtype=object)

In [46]:
q9_df = pd.DataFrame()
q9_df = post_df[["comments_content"]]

q9_df = q9_df.dropna(subset=['comments_content'])
q9_df["pp_text"] = q9_df["comments_content"].apply(preprocess)

q9_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q9_df["pp_text"])
]
q9_df

Unnamed: 0,comments_content,pp_text
1,Great presenters that made sure you understood,great presenter sure understand
3,can't think of anything right now,think
4,I would recommend this workshop and your team ...,recommend workshop team
5,Everything is simple and very easy to understand,simple easy understand
6,offer a refresher course for those who have ta...,offer refresher course take past
8,Nil,nil
9,"The content was useful, it was great to get al...",content useful great template use project
11,"I think it is good, I di not have any trouble ...",think good trouble have online


In [47]:
# Create a vocabulary for the lda model and
# convert our corpus into document-term matrix for Lda
preprocessed_corpus = [doc.split() for doc in q9_df["pp_text"].tolist()]
dictionary = corpora.Dictionary(preprocessed_corpus)
# Optional
# dictionary.filter_extremes(no_below=3, no_above=0.3, keep_n=100000)
len(dictionary)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]
# print(doc_term_matrix[0])
d = [(dictionary[idx], freq) for (idx, freq) in doc_term_matrix[0]]
pd.DataFrame(d, columns=["word", "frequency"])


from gensim.models import CoherenceModel

K = [3, 4, 5, 6, 7, 8]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=preprocessed_corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())
    
coherence_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
coherence_df

Unnamed: 0,Coherence score
3,0.576008
4,0.583729
5,0.591333
6,0.601251
7,0.608398
8,0.612296


In [51]:
lda4 = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=4,
    random_state=42,
    passes=17,
)
topic_table_9 = pd.DataFrame(lda4.print_topics(), columns=["Topic id", "Topic words"])
topic_table_9

Unnamed: 0,Topic id,Topic words
0,0,"0.082*""past"" + 0.082*""offer"" + 0.082*""refreshe..."
1,1,"0.095*""great"" + 0.094*""content"" + 0.094*""usefu..."
2,2,"0.123*""understand"" + 0.070*""think"" + 0.068*""ha..."
3,3,"0.040*""think"" + 0.040*""nil"" + 0.040*""great"" + ..."


In [53]:
lda4.print_topics()

[(0,
  '0.082*"past" + 0.082*"offer" + 0.082*"refresher" + 0.082*"course" + 0.082*"take" + 0.082*"workshop" + 0.082*"recommend" + 0.082*"team" + 0.081*"think" + 0.016*"nil"'),
 (1,
  '0.095*"great" + 0.094*"content" + 0.094*"useful" + 0.094*"project" + 0.094*"use" + 0.094*"template" + 0.094*"nil" + 0.019*"think" + 0.019*"simple" + 0.019*"easy"'),
 (2,
  '0.123*"understand" + 0.070*"think" + 0.068*"have" + 0.068*"online" + 0.068*"trouble" + 0.068*"good" + 0.068*"sure" + 0.068*"presenter" + 0.068*"simple" + 0.068*"easy"'),
 (3,
  '0.040*"think" + 0.040*"nil" + 0.040*"great" + 0.040*"simple" + 0.040*"easy" + 0.040*"team" + 0.040*"workshop" + 0.040*"recommend" + 0.040*"understand" + 0.040*"presenter"')]

In [106]:
table_9 = pd.DataFrame()
table_9['Topics'] = ['offer refresher courses', 'great content, easy to understand', 'no issues having it online', 'will recommend to others']
table_9.to_csv('../results/tables/LDA_9.csv',  index=False)

In [57]:
#12
post_df.future_topics.unique()

array(['Business writing & leadership ',
       'Project planning, workforce planning event planning',
       'change management and project risk management ',
       'finding Science (research and education) based grant opportunities in Canada',
       'The importance of setting-up and maintaining the information necessary to do a "Return on Investment" analysis so that it can be easily utilized at/near the end of a project.',
       'Agile Project Management, Jira tool Implementation, Change and Risk Management',
       'more indepth info on all the topics please!!! ', nan,
       'How to prepare budget for program that consists:\n1. budget for individual event or activity \n2. budget for administrative costs\n3. budget for sub-contractor costs\n4. others',
       'It would be great to get more in depth on all above listed topics. Change management would be a great option for me as my organization keeps growing and developing.',
       'Project Communication planning.',
       'Why t

In [59]:
q12_df = pd.DataFrame()
q12_df = post_df[["future_topics"]]

q12_df = q12_df.dropna(subset=['future_topics'])
q12_df["pp_text"] = q12_df["future_topics"].apply(preprocess)

q12_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q12_df["pp_text"])
]
q12_df

Unnamed: 0,future_topics,pp_text
0,Business writing & leadership,business writing leadership
1,"Project planning, workforce planning event pla...",project planning workforce planning event plan...
2,change management and project risk management,change management project risk management
3,finding Science (research and education) based...,find science research education base grant opp...
4,The importance of setting-up and maintaining t...,importance set maintain information necessary ...
5,"Agile Project Management, Jira tool Implementa...",agile project management jira tool implementat...
6,more indepth info on all the topics please!!!,indepth info topic
8,How to prepare budget for program that consist...,prepare budget program consist budget individu...
9,It would be great to get more in depth on all ...,great depth list topic change management great...
10,Project Communication planning.,project communication planning


In [60]:
# Create a vocabulary for the lda model and
# convert our corpus into document-term matrix for Lda
preprocessed_corpus = [doc.split() for doc in q12_df["pp_text"].tolist()]
dictionary = corpora.Dictionary(preprocessed_corpus)
# Optional
# dictionary.filter_extremes(no_below=3, no_above=0.3, keep_n=100000)
len(dictionary)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]
# print(doc_term_matrix[0])
d = [(dictionary[idx], freq) for (idx, freq) in doc_term_matrix[0]]
pd.DataFrame(d, columns=["word", "frequency"])


from gensim.models import CoherenceModel

K = [3, 4, 5, 6, 7, 8]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=preprocessed_corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())
    
coherence_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
coherence_df

Unnamed: 0,Coherence score
3,0.489529
4,0.476714
5,0.493293
6,0.526532
7,0.550014
8,0.470674


In [64]:
lda7 = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=7,
    random_state=42,
    passes=17,
)
topic_table_12 = pd.DataFrame(lda7.print_topics(), columns=["Topic id", "Topic words"])
topic_table_12

Unnamed: 0,Topic id,Topic words
0,0,"0.148*""project"" + 0.054*""like"" + 0.054*""story""..."
1,1,"0.087*""different"" + 0.059*""project"" + 0.059*""r..."
2,2,"0.100*""management"" + 0.081*""budget"" + 0.061*""c..."
3,3,"0.115*""planning"" + 0.042*""event"" + 0.042*""rese..."
4,4,"0.014*""project"" + 0.014*""topic"" + 0.014*""busin..."
5,5,"0.051*""project"" + 0.051*""analysis"" + 0.051*""re..."
6,6,"0.014*""project"" + 0.014*""workshop"" + 0.014*""di..."


In [65]:
lda7.print_topics()

[(0,
  '0.148*"project" + 0.054*"like" + 0.054*"story" + 0.054*"tip" + 0.054*"success" + 0.054*"past" + 0.054*"communication" + 0.054*"analysis" + 0.054*"planning" + 0.007*"topic"'),
 (1,
  '0.087*"different" + 0.059*"project" + 0.059*"regular" + 0.059*"workshop" + 0.031*"tool" + 0.031*"depth" + 0.031*"business" + 0.031*"leadership" + 0.031*"writing" + 0.031*"option"'),
 (2,
  '0.100*"management" + 0.081*"budget" + 0.061*"change" + 0.042*"cost" + 0.042*"risk" + 0.042*"great" + 0.042*"project" + 0.022*"topic" + 0.022*"contractor" + 0.022*"program"'),
 (3,
  '0.115*"planning" + 0.042*"event" + 0.042*"research" + 0.042*"opportunity" + 0.042*"canada" + 0.042*"base" + 0.042*"find" + 0.042*"grant" + 0.042*"education" + 0.042*"science"'),
 (4,
  '0.014*"project" + 0.014*"topic" + 0.014*"business" + 0.014*"leadership" + 0.014*"planning" + 0.014*"writing" + 0.014*"info" + 0.014*"indepth" + 0.014*"communication" + 0.014*"change"'),
 (5,
  '0.051*"project" + 0.051*"analysis" + 0.051*"return" + 0.

In [63]:
post_df.future_topics.unique()

array(['Business writing & leadership ',
       'Project planning, workforce planning event planning',
       'change management and project risk management ',
       'finding Science (research and education) based grant opportunities in Canada',
       'The importance of setting-up and maintaining the information necessary to do a "Return on Investment" analysis so that it can be easily utilized at/near the end of a project.',
       'Agile Project Management, Jira tool Implementation, Change and Risk Management',
       'more indepth info on all the topics please!!! ', nan,
       'How to prepare budget for program that consists:\n1. budget for individual event or activity \n2. budget for administrative costs\n3. budget for sub-contractor costs\n4. others',
       'It would be great to get more in depth on all above listed topics. Change management would be a great option for me as my organization keeps growing and developing.',
       'Project Communication planning.',
       'Why t

In [101]:
table_12 = pd.DataFrame()
table_12['Topics'] = ['Project planning and communication strategies', 'More indepth info and use of different pm tools (JIRA, project libre etc)', 
                      'risk management, return on investment analysis, budgeting', 'grant opportunities in research and education', 
                      'more tips and success stories of past projects']
table_12.to_csv('../results/tables/LDA_12.csv',  index=False)

In [67]:
#19
post_df.facilitator_Comments.unique()

array([nan, 'Awesome!', 'nothing :) ',
       "can't think of anything right now",
       'I enjoyed the sense of camaraderie that was created in the workshop...we are all in this together.',
       'Everyone is well trained', 'Nice, open and very easy going! ',
       'Nil',
       'Facilitators were very knowledgeable and great at answering questions.',
       'They were great, they did not pressure the attendees to talk much or to have to answer any particular question.',
       'Thank you.'], dtype=object)

In [69]:
q19_df = pd.DataFrame()
q19_df = post_df[["facilitator_Comments"]]

q19_df = q19_df.dropna(subset=['facilitator_Comments'])
q19_df["pp_text"] = q19_df["facilitator_Comments"].apply(preprocess)

q19_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q19_df["pp_text"])
]
q19_df

Unnamed: 0,facilitator_Comments,pp_text
1,Awesome!,awesome
2,nothing :),
3,can't think of anything right now,think
4,I enjoyed the sense of camaraderie that was cr...,enjoy sense camaraderie create workshop
5,Everyone is well trained,train
6,"Nice, open and very easy going!",nice open go
8,Nil,nil
9,Facilitators were very knowledgeable and great...,facilitator knowledgeable great answer question
11,"They were great, they did not pressure the att...",great pressure attendee talk answer particular...
12,Thank you.,thank


In [70]:
# Create a vocabulary for the lda model and
# convert our corpus into document-term matrix for Lda
preprocessed_corpus = [doc.split() for doc in q19_df["pp_text"].tolist()]
dictionary = corpora.Dictionary(preprocessed_corpus)
# Optional
# dictionary.filter_extremes(no_below=3, no_above=0.3, keep_n=100000)
len(dictionary)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]
# print(doc_term_matrix[0])
d = [(dictionary[idx], freq) for (idx, freq) in doc_term_matrix[0]]
pd.DataFrame(d, columns=["word", "frequency"])


from gensim.models import CoherenceModel

K = [3, 4, 5, 6, 7, 8]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=preprocessed_corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())
    
coherence_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
coherence_df

Unnamed: 0,Coherence score
3,0.611269
4,0.609579
5,0.610526
6,0.608795
7,0.610591
8,0.611057


In [74]:
lda3 = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=3,
    random_state=42,
    passes=17,
)
topic_table_19 = pd.DataFrame(lda3.print_topics(), columns=["Topic id", "Topic words"])
topic_table_19

Unnamed: 0,Topic id,Topic words
0,0,"0.100*""enjoy"" + 0.100*""sense"" + 0.100*""worksho..."
1,1,"0.115*""question"" + 0.115*""answer"" + 0.115*""gre..."
2,2,"0.100*""nice"" + 0.100*""open"" + 0.100*""go"" + 0.1..."


In [76]:
lda3.print_topics()

[(0,
  '0.100*"enjoy" + 0.100*"sense" + 0.100*"workshop" + 0.100*"create" + 0.100*"camaraderie" + 0.100*"awesome" + 0.025*"train" + 0.025*"nil" + 0.025*"thank" + 0.025*"think"'),
 (1,
  '0.115*"question" + 0.115*"answer" + 0.115*"great" + 0.066*"pressure" + 0.066*"particular" + 0.066*"attendee" + 0.066*"talk" + 0.066*"facilitator" + 0.066*"knowledgeable" + 0.065*"train"'),
 (2,
  '0.100*"nice" + 0.100*"open" + 0.100*"go" + 0.100*"think" + 0.100*"thank" + 0.100*"nil" + 0.025*"train" + 0.025*"awesome" + 0.025*"knowledgeable" + 0.025*"facilitator"')]

In [73]:
post_df.facilitator_Comments.unique()

array([nan, 'Awesome!', 'nothing :) ',
       "can't think of anything right now",
       'I enjoyed the sense of camaraderie that was created in the workshop...we are all in this together.',
       'Everyone is well trained', 'Nice, open and very easy going! ',
       'Nil',
       'Facilitators were very knowledgeable and great at answering questions.',
       'They were great, they did not pressure the attendees to talk much or to have to answer any particular question.',
       'Thank you.'], dtype=object)

In [102]:
table_19 = pd.DataFrame()
table_19['Topics'] = ['awesome, good sense of camraderie', 'did not pressure attendees', 'knowledgeable, well trained facilitators']
table_19.to_csv('../results/tables/LDA_19.csv',  index=False)

In [78]:
#20
post_df.slack_recommendations.unique()

array(['It was fine / good ',
       'I honestly did use Slack I used the chat and tools in Zoom',
       "I didn't use slack a lot because our organization used Microsoft teams already which has some similarities. ",
       "I didn't have time to explore slack",
       'I did not like and have never used SLACK plus did not have the time to explore its effectiveness.  ',
       'Any tool like Miro would be more useful',
       'formulating small groups on the different headings being discussed and share info or experiences in those chats, between the actual class time',
       nan, 'Nil',
       'Maybe do an exercise at the beginning involving slack so everyone get used to use it, I know there is the presentations but I mean something more into the workshop itself. \n',
       'More interactions on there between participants.'], dtype=object)

In [81]:
q20_df = pd.DataFrame()
q20_df = post_df[["slack_recommendations"]]

q20_df = q20_df.dropna(subset=['slack_recommendations'])
q20_df["pp_text"] = q20_df["slack_recommendations"].apply(preprocess)

# q20_df["pp_text"] = [
#     preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q20_df["pp_text"])
# ]
q20_df

Unnamed: 0,slack_recommendations,pp_text
0,It was fine / good,It was fine good
1,I honestly did use Slack I used the chat and t...,I honestly did use Slack I used the chat and t...
2,I didn't use slack a lot because our organizat...,I didn't use slack a lot because our organizat...
3,I didn't have time to explore slack,I didn't have time to explore slack
4,I did not like and have never used SLACK plus ...,I did not like and have never used SLACK plus ...
5,Any tool like Miro would be more useful,Any tool like Miro would be more useful
6,formulating small groups on the different head...,formulating small groups on the different head...
8,Nil,Nil
11,Maybe do an exercise at the beginning involvin...,Maybe do an exercise at the beginning involvin...
12,More interactions on there between participants.,More interactions on there between participants.


In [82]:
# Create a vocabulary for the lda model and
# convert our corpus into document-term matrix for Lda
preprocessed_corpus = [doc.split() for doc in q20_df["pp_text"].tolist()]
dictionary = corpora.Dictionary(preprocessed_corpus)
# Optional
# dictionary.filter_extremes(no_below=3, no_above=0.3, keep_n=100000)
len(dictionary)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]
# print(doc_term_matrix[0])
d = [(dictionary[idx], freq) for (idx, freq) in doc_term_matrix[0]]
pd.DataFrame(d, columns=["word", "frequency"])


from gensim.models import CoherenceModel

K = [3, 4, 5, 6, 7, 8]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=preprocessed_corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())
    
coherence_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
coherence_df

Unnamed: 0,Coherence score
3,0.385313
4,0.382916
5,0.464344
6,0.58037
7,0.41697
8,0.711439


In [86]:
lda4 = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=4,
    random_state=42,
    passes=17,
)
topic_table_20 = pd.DataFrame(lda4.print_topics(), columns=["Topic id", "Topic words"])
topic_table_20

Unnamed: 0,Topic id,Topic words
0,0,"0.053*""the"" + 0.037*""I"" + 0.036*""more"" + 0.021..."
1,1,"0.049*""have"" + 0.049*""I"" + 0.034*""did"" + 0.034..."
2,2,"0.057*""the"" + 0.039*""and"" + 0.039*""in"" + 0.039..."
3,3,"0.037*""participants."" + 0.037*""interactions"" +..."


In [87]:
lda4.print_topics()

[(0,
  '0.053*"the" + 0.037*"I" + 0.036*"more" + 0.021*"slack" + 0.021*"to" + 0.020*"use" + 0.020*"used" + 0.020*"there" + 0.020*"tool" + 0.020*"Any"'),
 (1,
  '0.049*"have" + 0.049*"I" + 0.034*"did" + 0.034*"used" + 0.034*"not" + 0.034*"explore" + 0.034*"time" + 0.034*"didn\'t" + 0.034*"to" + 0.034*"slack"'),
 (2,
  '0.057*"the" + 0.039*"and" + 0.039*"in" + 0.039*"I" + 0.022*"time" + 0.022*"on" + 0.022*"between" + 0.022*"tools" + 0.022*"Zoom" + 0.022*"chat"'),
 (3,
  '0.037*"participants." + 0.037*"interactions" + 0.037*"More" + 0.037*"It" + 0.037*"was" + 0.037*"fine" + 0.037*"good" + 0.037*"between" + 0.037*"on" + 0.037*"there"')]

In [85]:
post_df.slack_recommendations.unique()

array(['It was fine / good ',
       'I honestly did use Slack I used the chat and tools in Zoom',
       "I didn't use slack a lot because our organization used Microsoft teams already which has some similarities. ",
       "I didn't have time to explore slack",
       'I did not like and have never used SLACK plus did not have the time to explore its effectiveness.  ',
       'Any tool like Miro would be more useful',
       'formulating small groups on the different headings being discussed and share info or experiences in those chats, between the actual class time',
       nan, 'Nil',
       'Maybe do an exercise at the beginning involving slack so everyone get used to use it, I know there is the presentations but I mean something more into the workshop itself. \n',
       'More interactions on there between participants.'], dtype=object)

In [107]:
table_20 = pd.DataFrame()
table_20['Topics'] = ["didn't use slack", 'formulate small groups on slack channel for discussion during sessions', 
                      'Other recommendation like Miro', 'maybe short intro on slack to get use to it.']
table_20.to_csv('../results/tables/LDA_20.csv', index=False)

In [89]:
## 22
post_df.general_comments.unique()

array(['Thank you and it was a pleasure knowing you all since last few weeks ',
       nan, 'thanks again!',
       'I enjoyed the sense of camaraderie that was created in the workshop...we are all in this together.',
       'Yes I give my consent',
       'Toye - I really enjoyed participating in this workshop. I was able to get a different perspective on project management use in the non-profit sector. The facilitators were well prepared and engaging during the sessions too. I recommend it for sure.',
       'The workshop was very interesting and useful, although I had some experience in project manager I learned new things and tips, besides I think I will be using the material that PMV gave me during this workshop for many of my projects. Finally, I would like to say that the facilitators were very nice and I can see they have more than enough experience and are knowledgeable in this regards.  \nThank you PMV!  '],
      dtype=object)

In [92]:
q22_df = pd.DataFrame()
q22_df = post_df[["general_comments"]]

q22_df = q22_df.dropna(subset=['general_comments'])
q22_df["pp_text"] = q22_df["general_comments"].apply(preprocess)

q22_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q22_df["pp_text"])
]
q22_df

Unnamed: 0,general_comments,pp_text
0,Thank you and it was a pleasure knowing you al...,thank pleasure know week
3,thanks again!,thank
4,I enjoyed the sense of camaraderie that was cr...,enjoy sense camaraderie create workshop
5,Yes I give my consent,yes consent
10,Toye - I really enjoyed participating in this ...,toye enjoy participate workshop able different...
11,"The workshop was very interesting and useful, ...",workshop interesting useful experience project...


In [93]:
# Create a vocabulary for the lda model and
# convert our corpus into document-term matrix for Lda
preprocessed_corpus = [doc.split() for doc in q22_df["pp_text"].tolist()]
dictionary = corpora.Dictionary(preprocessed_corpus)
# Optional
# dictionary.filter_extremes(no_below=3, no_above=0.3, keep_n=100000)
len(dictionary)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]
# print(doc_term_matrix[0])
d = [(dictionary[idx], freq) for (idx, freq) in doc_term_matrix[0]]
pd.DataFrame(d, columns=["word", "frequency"])


from gensim.models import CoherenceModel

K = [3, 4, 5, 6, 7, 8]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=preprocessed_corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())
    
coherence_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
coherence_df

Unnamed: 0,Coherence score
3,0.550439
4,0.690112
5,0.708824
6,0.676077
7,0.573878
8,0.612358


In [97]:
lda5 = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=4,
    random_state=42,
    passes=10,
)
pd.DataFrame(lda5.print_topics(), columns=["Topic id", "Topic words"])

Unnamed: 0,Topic id,Topic words
0,0,"0.078*""camaraderie"" + 0.078*""sense"" + 0.078*""c..."
1,1,"0.123*""thank"" + 0.070*""pleasure"" + 0.070*""week..."
2,2,"0.042*""workshop"" + 0.042*""project"" + 0.042*""fa..."
3,3,"0.064*""project"" + 0.064*""workshop"" + 0.064*""ex..."


In [98]:
lda5.print_topics()

[(0,
  '0.078*"camaraderie" + 0.078*"sense" + 0.078*"create" + 0.078*"enjoy" + 0.078*"workshop" + 0.016*"thank" + 0.016*"yes" + 0.016*"consent" + 0.016*"know" + 0.016*"week"'),
 (1,
  '0.123*"thank" + 0.070*"pleasure" + 0.070*"week" + 0.070*"know" + 0.070*"yes" + 0.070*"consent" + 0.014*"workshop" + 0.014*"enjoy" + 0.014*"create" + 0.014*"project"'),
 (2,
  '0.042*"workshop" + 0.042*"project" + 0.042*"facilitator" + 0.042*"toye" + 0.042*"enjoy" + 0.042*"different" + 0.042*"perspective" + 0.042*"sector" + 0.042*"participate" + 0.042*"use"'),
 (3,
  '0.064*"project" + 0.064*"workshop" + 0.064*"experience" + 0.064*"pmv" + 0.037*"thank" + 0.036*"facilitator" + 0.036*"interesting" + 0.036*"material" + 0.036*"new" + 0.036*"like"')]

In [96]:
post_df.general_comments.unique()

array(['Thank you and it was a pleasure knowing you all since last few weeks ',
       nan, 'thanks again!',
       'I enjoyed the sense of camaraderie that was created in the workshop...we are all in this together.',
       'Yes I give my consent',
       'Toye - I really enjoyed participating in this workshop. I was able to get a different perspective on project management use in the non-profit sector. The facilitators were well prepared and engaging during the sessions too. I recommend it for sure.',
       'The workshop was very interesting and useful, although I had some experience in project manager I learned new things and tips, besides I think I will be using the material that PMV gave me during this workshop for many of my projects. Finally, I would like to say that the facilitators were very nice and I can see they have more than enough experience and are knowledgeable in this regards.  \nThank you PMV!  '],
      dtype=object)

In [108]:
table_22 = pd.DataFrame()
table_22['Topics'] = ["enjoyed participating in the workshop", 'Got a different perspective on project management in NPO sector', 
                      'facilitators were well prepared and knowledgeable', 'will be using this material for future projects.']
table_22.to_csv('../results/tables/LDA_22.csv', index=False)