In [None]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load choiceboard_data.csv; the relative path is resolved against the current working directory.
df = pd.read_csv("choiceboard_data.csv")
# Show the first rows as a quick check of how the columns were parsed.
df.head()

Unnamed: 0,Subj #,Unnamed: 1,Win 1_2,Win 3_4,Win 5_6,Win 7_8,Win 9_10,Unnamed: 7,Spr 1_2,Spr 3_4,Spr 5_6,Spr 7-8,Spr 9-10,Unnamed: 13,CPD_Q1,CPD_Q2,CPD_Q3
0,1,,Took the time to do some meal prep last night ...,I spent time before work to do an easy 20-minu...,Took a nap before work since my team this week...,Called a friend who happened to be on her lunc...,Took a walk outside to take in some of the rar...,,Had a very busy weekend with my sister visitin...,Spent some time on the phone catching up with ...,Gave myself time to lie down on the couch and ...,Took a nap after class and ended up sleeping t...,Set aside and spent some time at night after s...,,The class time devoted to health and wellness ...,During my gap years I gained a lot of insight ...,I believe self-care to be really important in ...
1,2,,1. Select one of the activities that resonates...,One of the activities that I chose from the Ch...,One of the activities that I chose for self-ca...,"For my self care, I went on a walk and listene...",One of the activities that I chose for health/...,,I decided to spend my self care time by going ...,"For my self care, I decided to find new music ...",I went on a run!,I watched TV with my dad!,I painted my nails! This is something I really...,,The class time devoted to health and wellness ...,The way I decided which self-care to choose wa...,The time spent on my choice of self-care allow...
2,3,,"Today, I cooked myself lunch and it was delici...",I took a walk around campus with my roommate t...,I listened to some of my favorite artists whil...,"Today, I spent my break eating good food and w...","Today, I spent some of my time watching a show...",,I took the time to finish building my baby Yod...,I took the time to reach out and talk to one o...,This week I spent at least 20 minutes reading ...,This week I spent about an hour getting dinner...,"This weekend, I went shopping with my friend a...",,The class time has reminded me on the importan...,To decide on what to do it would mostly depend...,The time spent on my choice of self care affec...
3,4,,I worked on my Paint by Diamond piece.,I worked on more of the Paint by Diamonds.,I worked on my Paint by Diamond while listenin...,I worked on my Paint by Diamond while listenin...,I grabbed lunch with a friend.,,I started a new Asian drama (tv show) called T...,I took a walk around my neighborhood while lis...,"I took a walk to my neighborhood park , sat on...",I went to go visit my high school teachers.,I went out and had lunch with my brother.,,The class time that was devoted to health and ...,"For me, it depends on the weather à if it was ...",It made me more relaxed and less stressed abou...
4,5,,I took a 20 minutes walk from the train statio...,I met up with my friends for coffee and it was...,I soaked in the bathtub for an hour with relax...,I walked around the neighborhood for thirty mi...,I tried to using peloton in our apartment bui...,,Yesterday was my birthday and my boyfriend pla...,I am increasing my cardio work out in the apar...,I went to watch movie at ipic theatre in redmo...,I took a 30 minutes walk in my new neighborhoo...,It was a nice day today and my friend and I to...,,I have learnt to listen to people without inte...,This is because I wanted to connect with mysel...,This has been a lifeline because I always felt...


In [None]:
# Narrow the frame to the CPD_Q1 and CPD_Q3 columns.
df_cpd = df[["CPD_Q1", "CPD_Q3"]]
df_cpd.head()

Unnamed: 0,CPD_Q1,CPD_Q3
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...


In [None]:
# CPD_Q1 responses with missing answers dropped and every remaining value cast to str.
corpus_1 = df_cpd["CPD_Q1"].dropna().astype(str)
corpus_1

0      The class time devoted to health and wellness ...
1      The class time devoted to health and wellness ...
2      The class time has reminded me on the importan...
3      The class time that was devoted to health and ...
4      I have learnt to listen to people without inte...
                             ...                        
103    It’s been a wonderful change of pace to have a...
104    The mentor team discussions with my team and p...
105    The class time devoted to health, wellness, an...
106    I personally think it was a nice break, but I ...
107    I actively disliked most of the content relate...
Name: CPD_Q1, Length: 105, dtype: object

In [None]:
#adding clearly neutral stopwords that are specific to this program or activity
#
# NOTE: this rebinds the name `stopwords` from the `nltk.corpus.stopwords`
# corpus reader imported at the top of the notebook to a plain list, so
# `stopwords.words(...)` is not available after this cell has run. The fully
# qualified `nltk.corpus.stopwords` is used here so the cell can be re-run.
#
# Entries are lowercase because the sampled text is lowercased before it is
# tokenized; an uppercase entry such as 'WIP' could never match a token.
# NOTE(review): 'self-care' presumably never matches either, because the
# tokenizer appears to split on the hyphen -- confirm whether 'self' and
# 'care' should be listed instead.
stopwords = nltk.corpus.stopwords.words('english') + [
    'pharmacy',
    'pharmacist',
    'pharmacists',
    'health',
    'healthcare',
    'practitioner',
    'mental',
    'wellness',
    'self-care',
    'mentor',
    'wip',
    'class',
    'school',
    'peers',
    'classmates',
    'quarter',
    'spring',
    'winter',
    'fall',
]

In [None]:
#using sample of words - ~71% of full corpus
SAMPLE_SIZE = 75
SAMPLE_SEED = 42  # fixed seed so a fresh Restart & Run All draws the same responses every time

sample = (
    corpus_1
    # never ask for more rows than exist, otherwise Series.sample raises
    .sample(n=min(SAMPLE_SIZE, len(corpus_1)), random_state=SAMPLE_SEED)
    .str.lower()
    .reset_index(drop=True)
    .to_frame(name='CPD_Q1')
)
sample.head()

Unnamed: 0,CPD_Q1
0,at times it felt somewhat didactic and inorgan...
1,it has been helpful in that the class has crea...
2,i found the time devoted to health and wellnes...
3,the most important thing i realized by having ...
4,i have found that the mentor team discussions ...


In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` into a list of word tokens.

    Every item is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (which strips
    accent marks from the tokens). One token list is yielded per input item,
    in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Pull the sampled responses out of the frame as a plain Python list.
data = sample["CPD_Q1"].tolist()
# Exhaust the tokenizer generator: one token list per response.
data_words = [*sent_to_words(data)]

print(data_words[0][:30]) #first 30 words of the first comment

['at', 'times', 'it', 'felt', 'somewhat', 'didactic', 'and', 'inorganic', 'think', 'next', 'year', 'it', 'would', 'help', 'to', 'have', 'more', 'thoughtful', 'exercises', 'that', 'require', 'our', 'active', 'involvement', 'in', 'class', 'team', 'building', 'exercises', 'would']


In [None]:
#bigrams
#phrases that go together, like "new york" (bigram)

#build the phrase model from the tokenized responses (bigrams only; no trigram model is built here)
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases

#wrap the trained model in a Phraser; make_bigrams() applies it to each token list via bigram_mod[doc]
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [None]:
# Load the spaCy "en_core_web_sm" pipeline; lemmatization() below reads lemma_ and pos_ from its tokens.
nlp = spacy.load("en_core_web_sm")

# Define functions for stopword removal, bigrams and lemmatization
def remove_stopwords(texts, stop_words=None):
    """Drop stop words from every tokenized document.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        stop_words: optional iterable of words to remove. When omitted, the
            notebook-level ``stopwords`` is used: if it is still the NLTK
            corpus reader its English list is taken; if it has been rebound
            to a plain list (an earlier cell does this to add
            program-specific terms) that list is used as-is.

    Returns:
        A list of token lists, in input order, with the stop words removed.
    """
    if stop_words is None:
        # `stopwords` is either the nltk corpus reader or a custom list,
        # depending on which cells have run; calling .words() on the list
        # would raise AttributeError.
        stop_words = stopwords.words('english') if hasattr(stopwords, 'words') else stopwords
    # Build the lookup set once instead of once per token.
    blocked = frozenset(stop_words)
    return [[word for word in doc if word not in blocked] for doc in texts]

def make_bigrams(texts):
    """Run every tokenized document through the notebook-level phrase model.

    Each document in ``texts`` is looked up via ``bigram_mod[...]`` and the
    results are collected into a list in the same order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document is re-joined with single spaces, run through the
    notebook-level spaCy pipeline ``nlp``, and reduced to the lemmas
    (``token.lemma_``) of the tokens whose ``token.pos_`` is in
    ``allowed_postags``.

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: collection of POS tags to keep. The default is an
            immutable tuple so it cannot be altered between calls; a list is
            accepted just the same.

    Returns:
        A list with one list of lemma strings per input document.
    """
    texts_out = []
    for tokens in texts:
        doc = nlp(" ".join(tokens))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams (applied after stop-word removal, so pairs are formed from the remaining tokens)
data_words_bigrams = make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show up to the first 30 lemmas of the first comment
print(data_lemmatized[:1][0][:30])

['time', 'feel', 'somewhat', 'didactic', 'inorganic', 'think', 'next', 'year', 'help', 'thoughtful', 'exercise', 'require', 'active', 'involvement', 'class', 'team', 'building', 'exercise', 'valuable']
