In [10]:
import pandas as pd
import numpy as np
import altair as alt
import spacy
# !python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [2]:
# load pre survey data for 2021 
pre_df = pd.read_csv("../data/processed/pre_survey.csv")

### Dataset exploration

In [3]:
pre_df.head()

Unnamed: 0,ID,Email,Name,question_1,question_2,question_3,question_4,question_5,question_6,question_7,question_8
0,1,anonymous,,Project Manager;Subject Matter Expert (SME);,Yes I have attended Project Management Trainin...,Multiple Projects,I have been involved in various Projects and I...,"Migration of CRM. In this Project, we are tryi...",5,"Strategic, Analytical and Collaborative",Maintaining Project on Track with respect to r...
1,2,anonymous,,Project Manager;,No,Multiple Projects,"I manage multiple program schedules and teams,...",Phase 2 Renovations. Our organization is prepa...,45,"Commitment to the project and a unified goal, ...",Reduced efficiency and timeline drag due from ...
2,3,anonymous,,Not sure;non profit doing projects? ;,Nope!,Multiple Projects,Multiple community based projects (event plann...,"We are currently doing a community wide ""Needs...",10-May,Not sure if TEAM is an acronym for something h...,- taking the ideas and translating them into t...
3,4,anonymous,,Project Manager;,I had some training during my master's degree ...,Multiple Projects,Personal & professional,I am looking after Immunization Uptake Project...,3,commitment,Working with multiple stakeholders & communities
4,5,anonymous,,Project Team Member;,No\n,Multiple Projects,Funded/fund-raising/awareness events and year ...,Funded/fund-raising/awareness events and year ...,04-Feb,Vision and intend.,Understanding communication styles.


In [4]:
pre_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          6 non-null      int64  
 1   Email       6 non-null      object 
 2   Name        0 non-null      float64
 3   question_1  6 non-null      object 
 4   question_2  6 non-null      object 
 5   question_3  6 non-null      object 
 6   question_4  6 non-null      object 
 7   question_5  6 non-null      object 
 8   question_6  6 non-null      object 
 9   question_7  6 non-null      object 
 10  question_8  6 non-null      object 
dtypes: float64(1), int64(1), object(9)
memory usage: 656.0+ bytes


In [5]:
pre_df = pre_df.rename(columns={"question_1": "attendee_roles", 
                       "question_2": "previous_attendenance", 
                       "question_3": "work_project_types", 
                       "question_4": "desc_mp",
                       "question_5": "proj_descp",
                      "question_6": "team_num",
                      "question_7": "team_strength",
                      "question_8": "proj_challenges"})

In [6]:
pre_df.shape

(6, 11)

There are 6 rows and 11 columns in the original dataset.

In [None]:
pre_df

In [None]:
### data cleaning 
pre_df = pre_df.replace(to_replace ="10-May",
                 value ="5-10")
pre_df = pre_df.replace(to_replace ="04-Feb",
                 value ="2-4")

pre_df['mp_team_cat'] = ['below 5', 'above 20', '6-10', 'below 5', 'below 5', 'NA']
pre_df['prev_att_cat'] = ['Yes', 'No', 'No', 'Yes', 'No', 'Yes']

### Question 1: Do you consider yourself to be one or more of the following roles (please select all that apply

In [None]:
pre_df.attendee_roles.unique()

In [None]:
q1_df = pre_df[["ID", "attendee_roles"]]
q1_df_new = q1_df.copy()

In [None]:
q1_df_new = q1_df_new.assign(attendee_roles=q1_df_new.attendee_roles.str.split(";")).explode('attendee_roles')
q1_df_new['attendee_roles'].replace('', np.nan, inplace=True)
q1_df_new.dropna(subset=['attendee_roles'], inplace=True)

In [None]:
q1_df_new

In [None]:
bar = alt.Chart(q1_df_new, title="Current roles of workshop participants, (n=6)"
               ).transform_joinaggregate(
    total='count()',
    groupby=['attendee_roles']
).transform_calculate(
    percent=alt.datum.total/6
).mark_bar().encode(
    x= alt.X('percent:Q', title = "Response percerntage (%)", axis=alt.Axis(format='%')),
    y=alt.Y('attendee_roles', sort='x', title="")
    ).properties(
    width=400,
    height=200
    )

text = alt.Chart(q1_df_new, title="Current roles of workshop participants, (n=6)"
               ).transform_joinaggregate(
    total='count()',
    groupby=['attendee_roles']
).transform_calculate(
    percent=alt.datum.total/6
).mark_text(
        align='right',
    ).encode(
        y = alt.Y('attendee_roles', sort = 'x'),
        x = alt.X('percent:Q'),
        text=alt.Text('percent:Q',format='.1%')
).properties(
    width=400,
    height=200
    )

q1_plot = alt.layer(bar, text, data=q1_df_new)

# Show the plot|
q1_plot

### Question 2: Have you ever attended a course or viewed an educational webinar about project management?  If so please describe this training.

In [None]:
pre_df.previous_attendenance.unique()

In [None]:
yes_bool = pre_df["previous_attendenance"].str.contains('Yes|some')

In [None]:
yes_bool.value_counts(normalize=True)

In [None]:
bar = alt.Chart(pre_df, title="Previous experience with project manangement content, (n=6)"
               ).transform_joinaggregate(
    total='count()',
    groupby=['prev_att_cat']
).transform_calculate(
    percent=alt.datum.total/6
).mark_bar().encode(
    x= alt.X('percent:Q', title = "Response percerntage (%)", axis=alt.Axis(format='%')),
    y=alt.Y('prev_att_cat', sort='x', title="")
    ).properties(
    width=400,
    height=100
    )

text = alt.Chart(pre_df, title="Previous experience with project manangement content, (n=6)"
               ).transform_joinaggregate(
    total='count()',
    groupby=['prev_att_cat']
).transform_calculate(
    percent=alt.datum.total/6
).mark_text(
        align='right',
    ).encode(
        y = alt.Y('prev_att_cat', sort = 'x'),
        x = alt.X('percent:Q'),
        text=alt.Text('percent:Q',format='.1%')
).properties(
    width=400,
    height=100
    )

q2_plot = alt.layer(bar, text, data=pre_df)

# Show the plot|
q2_plot

### Question 3: At work are you typically focused on one project at a time or multiple projects?

In [None]:
pre_df.work_project_types.value_counts()

### Question 4: If multiple projects, please describe or type N/A for not applicable?

In [7]:
pre_df.desc_mp.value_counts()

I have been involved in various Projects and I have to be agile in doing the Project Management.                                                                                                                                                                                                              1
I manage multiple program schedules and teams, lead group projects with several committees, coordinate daily volunteers, direct and coordinate administrative teams and am the primary driving force for our annual action plans/strategic plan. I have one staff member and everyone else are volunteers.    1
Multiple community based projects (event planning, community projects (placemaking), long term and short term projects)                                                                                                                                                                                       1
Personal & professional                                                                 

In [8]:
from preprocessing import *

In [83]:
q4_df = pd.DataFrame()
q4_df = pre_df[["desc_mp"]]
q4_df["pp_text"] = q4_df["desc_mp"].apply(preprocess)
q4_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q4_df["pp_text"])
]
q4_df.dropna()


personal_project_desc = q4_df.pp_text.str.cat(sep = ' ')

doc = nlp(personal_project_desc, disable = ['ner'])

from collections import Counter
words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct] 

word_freq = Counter(words) 
word_freq = word_freq.most_common(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q4_df["pp_text"] = q4_df["desc_mp"].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q4_df["pp_text"] = [


In [84]:
df_q4 = pd.DataFrame(word_freq, columns =['Term', 'Frequency'])
df_q4.set_index('Term')
df_q4.to_csv('../results/tables/pre_survey_q4.csv', index=False)

In [85]:
# EDA Initial word cloud 
from wordcloud import WordCloud
words = " ".join(q4_df["pp_text"].tolist())
wordcloud = WordCloud(width=800, height=400, random_state=591)
wc = wordcloud.generate(words).to_image()
wc.save('../results/images/q4_wc.png')

### Question 5: Participant personal project description

In [86]:
pre_df.proj_descp.value_counts()

Migration of CRM. In this Project, we are trying to migrate an old Access CRM to a new CRM-Which is more sophisticated and easy to use.                                                                                                                                                                                                                           1
Phase 2 Renovations. Our organization is preparing for the final phase of renovations to our new facility and my role will involve coordination of facility oversight committees, contractors, grant and fundraising teams, consultants and volunteers while working alongside Island Health and other regulatory bodies.                                         1
We are currently doing a community wide "Needs Assessment" with a UVic researcher. This needs assessment will then be carried into a longer term community wide visioning process in Fall 2022. This project will include community engagement, another partnership with UVic, collaboration wit

In [87]:
q5_df = pd.DataFrame()
q5_df = pre_df[["proj_descp"]]
q5_df["pp_text"] = q5_df["proj_descp"].apply(preprocess)
q5_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q5_df["pp_text"])
]
q5_df.dropna()


project_desc_det = q5_df.pp_text.str.cat(sep = ' ')

doc = nlp(project_desc_det, disable = ['ner'])

from collections import Counter
words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct] 

word_freq = Counter(words) 
word_freq_1 = word_freq.most_common(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q5_df["pp_text"] = q5_df["proj_descp"].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q5_df["pp_text"] = [


In [88]:
df_q5 = pd.DataFrame(word_freq_1, columns =['Term', 'Frequency'])
df_q5.set_index('Term')
df_q5.to_csv('../results/tables/pre_survey_q5.csv', index=False)

In [89]:
# EDA Initial word cloud 
from wordcloud import WordCloud
words = " ".join(q5_df["pp_text"].tolist())
wordcloud = WordCloud(width=800, height=400, random_state=591)
wc = wordcloud.generate(words).to_image()
wc.save('../results/images/q5_wc.png')

### Question 6: Approximately how many team members, including you, will be active on that particular project?

In [None]:
pre_df.team_num.value_counts()

Note: make these integers or category options in the survey for better analysis

In [None]:
pre_df.mp_team_cat.value_counts()

In [None]:
bar = alt.Chart(pre_df, title="No: of team members involved in current projects, (n=6)"
               ).transform_joinaggregate(
    total='count()',
    groupby=['mp_team_cat']
).transform_calculate(
    percent=alt.datum.total/6
).mark_bar().encode(
    x= alt.X('percent:Q', title = "Response percerntage (%)", axis=alt.Axis(format='%')),
    y=alt.Y('mp_team_cat', sort='x', title="")
    ).properties(
    width=400,
    height=150
    )

text = alt.Chart(pre_df, title="No: of team members involved in current projects, (n=6)"
               ).transform_joinaggregate(
    total='count()',
    groupby=['mp_team_cat']
).transform_calculate(
    percent=alt.datum.total/6
).mark_text(
        align='right',
    ).encode(
        y = alt.Y('mp_team_cat', sort = 'x'),
        x = alt.X('percent:Q'),
        text=alt.Text('percent:Q',format='.1%')
).properties(
    width=400,
    height=150
    )

q7_plot = alt.layer(bar, text, data=pre_df)

# Show the plot|
q7_plot

### Question 7: Please describe any areas of TEAM strength you have identified.

In [90]:
pre_df.team_strength.value_counts()

Strategic, Analytical and Collaborative                                                                                                                                          1
Commitment to the project and a unified goal, expertise in various related fields.                                                                                               1
Not sure if TEAM is an acronym for something here? \nBut strengths are - a motivated, involved, and ambitious community, we have lots of pre-existing research to build on.      1
commitment                                                                                                                                                                       1
Vision and intend.                                                                                                                                                               1
N.A.                                                                                                     

In [91]:
q7_df = pd.DataFrame()
q7_df = pre_df[["team_strength"]]
q7_df["pp_text"] = q7_df["team_strength"].apply(preprocess)
q7_df.dropna()

q7_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q7_df["pp_text"])
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q7_df["pp_text"] = q7_df["team_strength"].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q7_df["pp_text"] = [


In [92]:
# EDA Initial word cloud 
from wordcloud import WordCloud
words = " ".join(q7_df["pp_text"].tolist())
wordcloud = WordCloud(width=800, height=400, random_state=591)
wc = wordcloud.generate(words).to_image()
wc.save('../results/images/q7_wc.png')

In [93]:
team_str_text = q7_df.pp_text.str.cat(sep = ' ')

doc = nlp(team_str_text, disable = ['ner'])


from collections import Counter
words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct] 

word_freq = Counter(words) 
word_freq_2 = word_freq.most_common(20)

In [94]:
df_q7 = pd.DataFrame(word_freq_2, columns =['Term', 'Frequency'])
df_q7.set_index('Term')
df_q7.to_csv('../results/tables/pre_survey_q7.csv', index=False)

In [38]:
import spacy
# !python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [39]:
nlp.max_length = 1850000

In [40]:
pre_df.proj_descp.unique()

array(['Migration of CRM. In this Project, we are trying to migrate an old Access CRM to a new CRM-Which is more sophisticated and easy to use.',
       'Phase 2 Renovations. Our organization is preparing for the final phase of renovations to our new facility and my role will involve coordination of facility oversight committees, contractors, grant and fundraising teams, consultants and volunteers while working alongside Island Health and other regulatory bodies. ',
       'We are currently doing a community wide "Needs Assessment" with a UVic researcher. This needs assessment will then be carried into a longer term community wide visioning process in Fall 2022. This project will include community engagement, another partnership with UVic, collaboration with the City, and reporting out to community and grant funders. ',
       'I am looking after Immunization Uptake Project with MOSAIC ',
       'Funded/fund-raising/awareness events and year long projects.',
       'I would love to par

In [41]:
proj_descp_text = pre_df.proj_descp.str.cat(sep = ' ')

In [42]:
proj_descp_text

'Migration of CRM. In this Project, we are trying to migrate an old Access CRM to a new CRM-Which is more sophisticated and easy to use. Phase 2 Renovations. Our organization is preparing for the final phase of renovations to our new facility and my role will involve coordination of facility oversight committees, contractors, grant and fundraising teams, consultants and volunteers while working alongside Island Health and other regulatory bodies.  We are currently doing a community wide "Needs Assessment" with a UVic researcher. This needs assessment will then be carried into a longer term community wide visioning process in Fall 2022. This project will include community engagement, another partnership with UVic, collaboration with the City, and reporting out to community and grant funders.  I am looking after Immunization Uptake Project with MOSAIC  Funded/fund-raising/awareness events and year long projects. I would love to participate as a trainer volunteer  '

In [43]:
doc = nlp(proj_descp_text, disable = ['ner'])

In [44]:
from collections import Counter
words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct] 

word_freq = Counter(words) 
word_freq.most_common(20)

[(' ', 4),
 ('community', 4),
 ('CRM', 3),
 ('Project', 2),
 ('new', 2),
 ('phase', 2),
 ('renovation', 2),
 ('facility', 2),
 ('grant', 2),
 ('volunteer', 2),
 ('wide', 2),
 ('need', 2),
 ('assessment', 2),
 ('long', 2),
 ('project', 2),
 ('Migration', 1),
 ('try', 1),
 ('migrate', 1),
 ('old', 1),
 ('Access', 1)]

In [46]:
from spacy.matcher import Matcher 
matcher = Matcher(nlp.vocab) 
pattern = [{'POS':'ADJ'}, {'POS':'NOUN'}] 
matcher.add('ADJ_PHRASE', [pattern]) 
matches = matcher(doc, as_spans=True) 
phrases = [] 
for span in matches:
    phrases.append(span.text.lower())
    phrase_freq = Counter(phrases)
phrase_freq

Counter({'final phase': 1,
         'new facility': 1,
         'regulatory bodies': 1,
         'uvic researcher': 1,
         'longer term': 1,
         'wide visioning': 1,
         'long projects': 1})

In [47]:
# import necessary modules
import spacy
from spacy.matcher import PhraseMatcher #import PhraseMatcher class
# Language class with the English model 'en_core_web_sm' is loaded
nlp = spacy.load('en_core_web_sm')
# create the PhraseMatcher object
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
# the list containing the pharses to be matched
terminology_list = ["project", "company",                     
                           "assessment"]
# convert the phrases into document object using nlp.make_doc to #speed up.
patterns = [nlp.make_doc(text) for text in terminology_list]
# add the patterns to the matcher object without any callbacks
matcher.add("Phrase Matching", None, *patterns)


In [48]:
#call the matcher object the document object and it will return #match_id, start and stop indexes of the matched words
matches = matcher(doc)
#print the matched results and extract out the results
for match_id, start, end in matches:
    # Get the string representation 
    string_id = nlp.vocab.strings[match_id]  
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

11356100181062323261 Phrase Matching 6 7 Project
11356100181062323261 Phrase Matching 90 91 Assessment
11356100181062323261 Phrase Matching 99 100 assessment
11356100181062323261 Phrase Matching 117 118 project
11356100181062323261 Phrase Matching 149 150 Project


### Question 8: What common and/or recurring issues and challenges do you encounter while being part of a project? (list and briefly describe TOP three if you can)

In [95]:
pre_df.proj_challenges.unique()

array(['Maintaining Project on Track with respect to resources, timelines and Budget. Mitigating Risks. Adapting agile mindset in the team',
       "Reduced efficiency and timeline drag due from under-structured or unwritten input and approval processes. Chronic games of 'telephone' i.e. the same information being shared with different individuals over and over again with details being lost along the way.\nProject Management availability/overload -I will wind up playing the central role in this project, but I am also expected to keep operations and all other smaller projects moving forward. ",
       "- taking the ideas and translating them into tangible actions/requests. \n- Capacity issues (I'm the only full time staff, everyone else is volunteer / very limited hours contractor) \n- reluctant partners (the City of Victoria) in collaborating on potential actions/requests/community vision ",
       'Working with multiple stakeholders & communities ',
       'Understanding communication

In [96]:
q8_df = pd.DataFrame()
q8_df = pre_df[["proj_challenges"]]
q8_df["pp_text"] = q8_df["proj_challenges"].apply(preprocess)
q8_df.dropna()

q8_df["pp_text"] = [
    preprocess_spacy(spacy_text) for spacy_text in nlp.pipe(q8_df["pp_text"])
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q8_df["pp_text"] = q8_df["proj_challenges"].apply(preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q8_df["pp_text"] = [


In [97]:
words = " ".join(q8_df["pp_text"].tolist())
wordcloud = WordCloud(width=800, height=400, random_state=591)
wc = wordcloud.generate(words).to_image()
wc.save('../results/images/q8_wc.png')

In [98]:
proj_challenges_text = q8_df.pp_text.str.cat(sep = ' ')

doc = nlp(proj_challenges_text, disable = ['ner'])

from collections import Counter
words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct] 

word_freq = Counter(words) 
word_freq_3 = word_freq.most_common(20)

In [99]:
df_q8 = pd.DataFrame(word_freq_3, columns =['Term', 'Frequency'])
df_q8.set_index('Term')
df_q8.to_csv('../results/tables/pre_survey_q8.csv', index=False)

### LDA topic modelling 

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [None]:
# Create a vocabulary for the lda model and
# convert our corpus into document-term matrix for Lda
preprocessed_corpus = [doc.split() for doc in q8_df["pp_text"].tolist()]
dictionary = corpora.Dictionary(preprocessed_corpus)
# Optional
# dictionary.filter_extremes(no_below=3, no_above=0.3, keep_n=100000)
len(dictionary)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]
# print(doc_term_matrix[0])
d = [(dictionary[idx], freq) for (idx, freq) in doc_term_matrix[0]]
pd.DataFrame(d, columns=["word", "frequency"])


from gensim.models import CoherenceModel

K = [3, 4, 5, 6, 7, 8]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=preprocessed_corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())
    
coherence_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
coherence_df

lda4 = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=4,
    random_state=42,
    passes=10,
)
pd.DataFrame(lda4.print_topics(), columns=["Topic id", "Topic words"])


lda4.print_topics()

In [None]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]
# print(doc_term_matrix[0])
d = [(dictionary[idx], freq) for (idx, freq) in doc_term_matrix[0]]
pd.DataFrame(d, columns=["word", "frequency"])

In [None]:
from gensim.models import CoherenceModel

K = [3, 4, 5, 6, 7, 8]

coherence_scores = []

for num_topics in K:
    lda = LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=10,
    )
    coherence_model_lda = CoherenceModel(
        model=lda, texts=preprocessed_corpus, dictionary=dictionary, coherence="c_v"
    )
    coherence_scores.append(coherence_model_lda.get_coherence())

In [None]:
coherence_df = pd.DataFrame(coherence_scores, index=K, columns=["Coherence score"])
coherence_df

In [None]:
lda4 = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=4,
    random_state=42,
    passes=10,
)
pd.DataFrame(lda4.print_topics(), columns=["Topic id", "Topic words"])

In [None]:
lda4.print_topics()

In [None]:
### creating bi-grams

In [None]:
import nltk
from nltk import bigrams
import networkx as nx

In [None]:
# Create list of lists containing bigrams in tweets
terms_bigram = [list(bigrams(text)) for text in nlp.pipe(q8_df['pp_text'])]

# View bigrams for the first tweet
terms_bigram[0]

In [None]:
import itertools
import collections
# Flatten list of bigrams in clean tweets
bigrams = list(itertools.chain(*terms_bigram))

# Create counter of words in clean bigrams
bigram_counts = collections.Counter(bigrams)

bigram_counts.most_common(20)

In [None]:
bigram_df = pd.DataFrame(bigram_counts.most_common(20),
                             columns=['bigram', 'count'])

bigram_df

In [None]:
### visualize network 
# Create dictionary of bigrams and their counts
d = bigram_df.set_index('bigram').T.to_dict('records')

In [None]:
# Create network plot 
G = nx.Graph()

# Create connections between nodes
for k, v in d[0].items():
    G.add_edge(k[0], k[1], weight=(v * 10))


In [None]:
# import matplotlib.pyplot as plt
# fig, ax = plt.subplots(figsize=(10, 8))

# pos = nx.spring_layout(G, k=2)

# # Plot networks
# nx.draw_networkx(G, pos,
#                  font_size=16,
#                  width=3,
#                  edge_color='grey',
#                  node_color='purple',
#                  with_labels = False,
#                  ax=ax)

# # Create offset labels
# for key, value in pos.items():
#     x, y = value[0]+.135, value[1]+.045
#     ax.text(x, y,
#             s=key,
#             bbox=dict(facecolor='red', alpha=0.25),
#             horizontalalignment='center', fontsize=13)
# plt.show()