## Importing Libraries ##

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

import re

from sklearn.feature_extraction.text import TfidfVectorizer

from collections import Counter

from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [5]:
# Load the survey export, keep only the two free-text answer columns, and
# drop every row in which either answer is missing (dropna's default "any").
data = (
    pd.read_csv("data/choiceboard_data.csv")
    .loc[:, ["CPD_Q1", "CPD_Q3"]]
    .dropna()
)
print('There are ' + str(len(data)) + ' rows.')
data.head()

There are 104 rows.


Unnamed: 0,CPD_Q1,CPD_Q3
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...


In [6]:
# Shared NLP resources used by preprocess().
# NOTE(review): these presumably need the NLTK "stopwords" and "wordnet" data
# packages to be installed locally - confirm before a fresh-kernel run.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus) to
# a plain set of English stop words, which is why the corpus reader is reached
# through the fully qualified `nltk.corpus.stopwords` here.
stopwords = set(nltk.corpus.stopwords.words('english'))

def preprocess(text):
    """Turn one free-text answer into a list of normalised tokens.

    Steps, in order: replace every non-letter character with a space,
    lowercase, tokenize with NLTK's ``word_tokenize``, drop tokens found in
    the module-level ``stopwords`` set, lemmatize, then Porter-stem.

    Args:
        text: The raw answer as a string.

    Returns:
        A list of stemmed tokens, in their original order.

    NOTE(review): the lemmatizer is called without a part-of-speech tag, and
    the notebook output shows "less" ending up as "le" - consider whether
    lemmatizing before stemming is actually wanted.
    """
    # Punctuation and digits become spaces, so "self-care" -> "self care".
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()

    processed = []
    for token in word_tokenize(letters_only):
        if token in stopwords:
            continue
        # Lemmatize first, then stem the lemma.
        processed.append(stemmer.stem(lemmatizer.lemmatize(token)))
    return processed

# Token lists produced by preprocess() for each question.
data['CPD_Q1 Clean'] = data['CPD_Q1'].map(preprocess)
data['CPD_Q3 Clean'] = data['CPD_Q3'].map(preprocess)

# Whitespace-delimited word counts of the raw answers.
data['CPD_Q1 Word Count'] = data['CPD_Q1'].map(str.split).map(len)
data['CPD_Q3 Word Count'] = data['CPD_Q3'].map(str.split).map(len)

# Number of tokens that survive preprocessing.
data['CPD_Q1 Clean Word Count'] = data['CPD_Q1 Clean'].map(len)
data['CPD_Q3 Clean Word Count'] = data['CPD_Q3 Clean'].map(len)
data.head()

Unnamed: 0,CPD_Q1,CPD_Q3,CPD_Q1 Clean,CPD_Q3 Clean,CPD_Q1 Word Count,CPD_Q3 Word Count,CPD_Q1 Clean Word Count,CPD_Q3 Clean Word Count
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...,"[class, time, devot, health, well, self, care,...","[believ, self, care, realli, import, live, hea...",88,37,42,20
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...,"[class, time, devot, health, well, self, care,...","[time, spent, choic, self, care, allow, relax,...",78,26,39,15
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...,"[class, time, remind, import, self, care, also...","[time, spent, choic, self, care, affect, posit...",91,63,39,30
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...,"[class, time, devot, health, well, self, care,...","[made, relax, le, stress, upcom, futur, exam, ...",38,34,16,13
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...,"[learnt, listen, peopl, without, interrupt, le...","[lifelin, alway, felt, like, wast, time, walk,...",58,50,25,23


## Topic-wizard for corpus 1 (Question 1) ##

In [7]:
# Topic modelling works on the raw answers, not on the "Clean" token columns,
# so keep just the two original text columns.
modelling_data = data[['CPD_Q1', 'CPD_Q3']]

# Corpus 1: one document (string) per row, taken from the CPD_Q1 answers.
corpus = modelling_data["CPD_Q1"].tolist()



In [8]:
# Preview the first few corpus 1 documents; displaying the whole list would
# dump every free-text response into the notebook output.
corpus[:5]

['The class time devoted to health and wellness and self-care has had a positive effect on me. While I am fairly good at finding pockets of time for self-care myself, class sessions such as the mentor team discussions provided a space for me to engage in something I normally wouldn’t think to do, such as introducing me to new games to play in a group setting. This in particular is enjoyable in that I get to socialize with classmates I don’t normally extensively converse or hang out with.',
 'The class time devoted to health and wellness and self-care really helped me to spend time doing what I enjoy and having class time devoted to this made me realize how important self-care is to ensure success in my personal and professional life. I really appreciate the mentor team discussions because it makes me realize that I’m not alone and that I have the support from my mentor and classmates to help me do well in the program. ',
 'The class time has reminded me on the importance of self care a

In [9]:
from sklearn.datasets import fetch_20newsgroups

#corpus = fetch_20newsgroups(subset="all").data

In [10]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Creating a bag-of-words vectorizer.
# Terms that occur in fewer than 2 documents (min_df) or in more than 85% of
# documents (max_df) are dropped, and English stopwords are filtered out.
vectorizer = CountVectorizer(min_df=2, max_df=0.85, stop_words="english")
# We create an NMF topic model with four topics. random_state is fixed so that
# re-running the notebook yields the same topics.
topic_model = NMF(n_components=4, random_state=42)
# Then we set up a pipeline: raw text -> term counts -> topic weights
topic_pipeline = Pipeline(
    [
        ("vectorizer", vectorizer),
        ("topic_model", topic_model),
    ]
)

In [11]:
# Fit the vectorizer and the NMF topic model on the Question 1 documents.
topic_pipeline.fit(corpus)

In [12]:
import topicwizard
# Hand the fitted pipeline and the Question 1 documents to topicwizard.
# NOTE(review): the "divide by zero encountered in log" warnings below are
# emitted during this call - presumably from zero-valued weights; confirm
# they are harmless.
topicwizard.visualize(pipeline=topic_pipeline, corpus=corpus)

Preprocessing


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.



divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log



### CORPUS 2 FOR QUESTION 3 ###

In [13]:
# Corpus 2: one document (string) per row, taken from the CPD_Q3 answers.
corpus2 = modelling_data["CPD_Q3"].tolist()

In [14]:
# Preview the first few corpus 2 documents; displaying the whole list would
# dump every free-text response into the notebook output.
corpus2[:5]

['I believe self-care to be really important in living a healthy and balanced lifestyle, so having time to spend on self-care built into my schedule helped to ease some of my stress for at least the day.',
 'The time spent on my choice of self-care allowed me to relax my mind and step away from work and school responsibilities for a short time.',
 'The time spent on my choice of self care affected me in a positive way because it would be something I enjoyed and part of me would want to continue doing it, but I knew I had to get work done as well. But it was also a nice break from thing, sort of like a reset button to get me energized again. ',
 'It made me more relaxed and less stressed about the upcoming future such as exams and work. For that moment, I don’t have to think about school or work and only think about myself.',
 'This has been a lifeline because I always felt like I was wasting time if I walked away from my computer. I have realized that every time I hang out with my frie

In [15]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Creating a bag-of-words vectorizer.
# Terms that occur in fewer than 2 documents (min_df) or in more than 85% of
# documents (max_df) are dropped, and English stopwords are filtered out.
vectorizer = CountVectorizer(min_df=2, max_df=0.85, stop_words="english")
# We create an NMF topic model with six topics. random_state is fixed so that
# re-running the notebook yields the same topics.
topic_model = NMF(n_components=6, random_state=42)
# Then we set up a pipeline: raw text -> term counts -> topic weights.
# NOTE: this rebinds `vectorizer`, `topic_model` and `topic_pipeline`, so the
# Question 1 pipeline is no longer reachable under those names after this cell.
topic_pipeline = Pipeline(
    [
        ("vectorizer", vectorizer),
        ("topic_model", topic_model),
    ]
)

In [16]:
# Fit the vectorizer and the NMF topic model on the Question 3 documents.
topic_pipeline.fit(corpus2)

In [17]:
import topicwizard
# Hand the fitted pipeline and the Question 3 documents to topicwizard.
# NOTE(review): the "divide by zero encountered in log" warnings below are
# emitted during this call - presumably from zero-valued weights; confirm
# they are harmless.
topicwizard.visualize(pipeline=topic_pipeline, corpus=corpus2)

Preprocessing



divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log

