In [None]:
# Load the exit-slip responses from the CSV file into a DataFrame
import pandas as pd
df = pd.read_csv(
    "Exit_slip.csv",
    engine='python',
    encoding='cp1252',
    na_filter=False,  # no NaN detection: blank cells stay as '' strings
)

In [None]:
# Keep only the rows whose 'Confused about' answer is not blank.
# To generate topics from the text in the 'learned' column instead, replace
# 'Confused about' with 'learned' in this cell and in all subsequent cells.
df = df.loc[df['Confused about'].str.strip().ne('')]

In [None]:
import string
# Function to check if a string carries nothing but punctuation (and whitespace)
def is_only_punctuation(text):
    """Return True when ``text`` contains no character other than punctuation or whitespace.

    Punctuation means the ASCII characters in ``string.punctuation``.
    Whitespace is ignored so that answers such as ``"? ?"`` or ``"... "`` are
    still recognised as content-free. An empty string returns True, because
    ``all()`` over an empty iterable is True.
    """
    return all(char in string.punctuation or char.isspace() for char in text)
# Discard every row whose 'Confused about' answer is flagged by is_only_punctuation
df = df.loc[~df['Confused about'].apply(is_only_punctuation)]

In [None]:
import regex
# Strip every character that is neither a word character nor whitespace and
# store the result in a new 'Confused_about' column.
# - regex=True is explicit: without it, newer pandas treats the pattern as a
#   literal string and removes nothing.
# - The pattern is a raw string so that \w and \s are not read as (invalid)
#   Python string escapes.
# - .assign builds a new frame instead of writing a column into the filtered
#   slice, which avoids pandas' SettingWithCopyWarning.
df = df.assign(
    Confused_about=df['Confused about'].str.replace(r'[^\w\s]', '', regex=True)
)

In [None]:
# Remove punctuations, stopwords from 'Confused about' and lemmatize text
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
# Plain Python list holding the cleaned 'Confused_about' text of every row
data = df['Confused_about'].values.tolist()
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` before tokenizing, and
    ``deacc=True`` asks the tokenizer to strip accent marks from the tokens.
    Yields one list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
# Materialize the tokenized responses and preview the first one
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])
import nltk 
# Fetch the NLTK corpora used in this cell. The English stop-word list lives in
# the 'stopwords' corpus; without downloading it, stopwords.words('english')
# raises LookupError on a machine where the corpus is not already installed.
nltk.download('words')
nltk.download('stopwords')
words = set(nltk.corpus.words.words())
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Also filter out survey-specific terms (variants of 'vape' and 'confuse')
stop_words.extend(['vape','vaping','Vape','Vapes','Vaping','vaped','vapes','vapeing','confused','confuse'])
import spacy 
def remove_stopwords(texts):
    """Return one token list per item of ``texts`` with stop words removed.

    Each item is passed through ``simple_preprocess(str(doc))`` and every
    resulting token found in the module-level ``stop_words`` is dropped.
    The output has the same length and order as ``texts``.
    """
    # Snapshot the stop-word list as a set so each membership test is a hash
    # lookup instead of a scan over the whole list; the result is identical.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

# spaCy pipeline used by lemmatization() for part-of-speech tags and lemmas
nlp = spacy.load('en_core_web_sm')
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each token list in ``texts`` is joined with spaces and run through ``nlp``.
    A token's lemma is kept when its ``pos_`` is in ``allowed_postags`` and the
    token is longer than three characters. Returns one list of lemmas per
    input document. Tag reference: https://spacy.io/api/annotation
    """
    def keep(token):
        return token.pos_ in allowed_postags and len(token) > 3

    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if keep(token)]
        for sent in texts
    ]
# Run the cleaning pipeline: stop-word removal first, then lemmatization
data_words_nostops = remove_stopwords(data_words)
data_lemmatized = lemmatization(
    data_words_nostops,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)
print(data_lemmatized)

In [None]:
# Build the gensim dictionary from the lemmatized documents, then encode
# every document as a bag of words
dictionary = gensim.corpora.Dictionary(data_lemmatized)
bow_corpus = list(map(dictionary.doc2bow, data_lemmatized))

In [None]:
# Find optimal no. of topics using Coherence score
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    """Train one LDA model per candidate topic count and score its c_v coherence.

    Parameters
    ----------
    dictionary : gensim dictionary, passed as ``id2word`` to the LDA model and
        as ``dictionary`` to the coherence model.
    corpus : bag-of-words corpus the LDA models are trained on.
    texts : tokenized documents handed to ``CoherenceModel``.
    limit : exclusive upper bound of the topic counts tried.
    start : first topic count tried.
    step : increment between successive topic counts.

    Returns
    -------
    (model_list, coherence_values) : two parallel lists with one entry per
        topic count in ``range(start, limit, step)``.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Use the arguments rather than notebook globals, so the function
        # scores whatever corpus/texts the caller actually passes in.
        model = gensim.models.LdaMulticore(
            corpus, num_topics=num_topics, id2word=dictionary, random_state=0
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Define the sweep bounds once so the training call and the x-axis cannot drift apart
limit=40; start=2; step=1;
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=bow_corpus, texts=data_lemmatized, start=start, limit=limit, step=step)
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Pass the label in a list: ("coherence_values") is just a parenthesised
# string, which legend() iterates character by character, labelling the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Report the coherence score obtained for each candidate topic count
for m, cv in zip(x, coherence_values):
    print(
        "Num Topics =",
        m,
        " has Coherence Value of",
        round(cv, 4),
    )

In [None]:
# Fit the final model with the chosen number of topics ('7' in this case)
# and list the word weights of every topic
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=7,
    id2word=dictionary,
    random_state=0,
)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [None]:
# Find dominant topic in each sentence of the text
def format_topics_sentences(ldamodel=lda_model, corpus=bow_corpus, texts=data_lemmatized):
    """Build a table with the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel : topic model; ``ldamodel[corpus]`` must yield, per document, a
        list of ``(topic_id, proportion)`` pairs, and ``show_topic(topic_id)``
        must return ``(word, weight)`` pairs.
    corpus : bag-of-words corpus to score.
    texts : the documents themselves, appended as the last column.

    Returns
    -------
    DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution`` and
    ``Topic_Keywords``, followed by one unnamed column holding ``texts``.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for row in ldamodel[corpus]:
        if not row:
            # No topic reported for this document: keep a placeholder so the
            # rows stay aligned with `texts` in the concat below.
            records.append((float('nan'), float('nan'), ''))
            continue
        # max() returns the first pair with the highest proportion, the same
        # pair a stable descending sort would put in first position.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))
    # Passing the column names here also keeps an empty corpus from failing.
    sent_topics_df = pd.DataFrame(records, columns=columns)
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df
# Label every document with its dominant topic, then turn the row index into
# an explicit document-number column and preview the first ten rows
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=lda_model,
    corpus=bow_corpus,
    texts=data_lemmatized,
)
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]
df_dominant_topic.head(10)

In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

In [None]:
# Word cloud of the documents whose dominant topic is the first topic (id 0)
df_1 = df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'] == 0]
text_1 = df_1['Text']
text_1_st = text_1.to_string()
wordcloud = WordCloud(background_color='#FFFFFF', max_words=50)
wordcloud.generate(text_1_st)
# Render the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the documents whose dominant topic is the second topic (id 1)
df_2 = df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'] == 1]
text_2 = df_2['Text']
text_2_st = text_2.to_string()
wordcloud = WordCloud(background_color='#FFFFFF', max_words=50)
wordcloud.generate(text_2_st)
# Render the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the documents whose dominant topic is the third topic (id 2)
df_3 = df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'] == 2]
text_3 = df_3['Text']
text_3_st = text_3.to_string()
wordcloud = WordCloud(background_color='#FFFFFF', max_words=50)
wordcloud.generate(text_3_st)
# Render the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the documents whose dominant topic is the fourth topic (id 3)
df_4 = df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'] == 3]
text_4 = df_4['Text']
text_4_st = text_4.to_string()
wordcloud = WordCloud(background_color='#FFFFFF', max_words=50)
wordcloud.generate(text_4_st)
# Render the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the documents whose dominant topic is the fifth topic (id 4)
df_5 = df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'] == 4]
text_5 = df_5['Text']
text_5_st = text_5.to_string()
wordcloud = WordCloud(background_color='#FFFFFF', max_words=50)
wordcloud.generate(text_5_st)
# Render the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Word cloud of the documents whose dominant topic is the sixth topic (id 5)
df_6 = df_dominant_topic.loc[df_dominant_topic['Dominant_Topic'] == 5]
text_6 = df_6['Text']
text_6_st = text_6.to_string()
wordcloud = WordCloud(background_color='#FFFFFF', max_words=50)
wordcloud.generate(text_6_st)
# Render the cloud without axes
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Generate word cloud for documents belonging to seventh topic
df_7 = df_dominant_topic[df_dominant_topic['Dominant_Topic'] ==6] 
# Take the text from df_7: reading it from df_1 would redraw the first
# topic's cloud instead of the seventh's
text_7=df_7['Text']
text_7_st=text_7.to_string()
wordcloud = WordCloud(max_words=50,background_color='#FFFFFF').generate(text_7_st)
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()