# An Exploration of UW Pharmacy Student Reflections

Author: Marlon Fu (marlonfu@berkeley.edu)

## Libraries and Dependencies

In [9]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

import spacy

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jergx\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jergx\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Import and Pre-process Data

In [10]:
# Read the choiceboard survey CSV and keep only the CPD_Q1 and CPD_Q3
# columns, in that order.
data = pd.read_csv("data/choiceboard_data.csv").loc[:, ["CPD_Q1", "CPD_Q3"]]
data.head()  # preview the first rows of the two selected columns

Unnamed: 0,CPD_Q1,CPD_Q3
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...


In [15]:
# Isolate the CPD_Q1 responses as a one-column frame. Missing answers are
# dropped *before* the string cast so that a NaN is never turned into the
# literal text "nan".
data_Q1 = (
    data[["CPD_Q1"]]
    .dropna()
    .astype(str)
)
data_Q1

Unnamed: 0,CPD_Q1
0,The class time devoted to health and wellness ...
1,The class time devoted to health and wellness ...
2,The class time has reminded me on the importan...
3,The class time that was devoted to health and ...
4,I have learnt to listen to people without inte...
...,...
103,It’s been a wonderful change of pace to have a...
104,The mentor team discussions with my team and p...
105,"The class time devoted to health, wellness, an..."
106,"I personally think it was a nice break, but I ..."


In [5]:
# Baseline stop-word list: NLTK's English stop words, copied into a plain list.
# The fully qualified `nltk.corpus.stopwords` path is used because this
# assignment rebinds the bare name `stopwords` (bound by the import cell to the
# object from nltk.corpus) to the list itself.
stopwords = list(nltk.corpus.stopwords.words('english'))

# Candidate stop words specific to this program/activity that look clearly
# neutral. They are currently disabled, so only the stock English list is in
# effect; uncomment the call below to add them.
# NOTE(review): if these are later matched against lowercased, word-level
# tokens, 'self-care' and 'WIP' would never match as written -- confirm and
# use 'self', 'care', 'wip' instead if so.
# stopwords.extend([
#     'pharmacy', 'pharmacist', 'pharmacists',
#     'health', 'healthcare', 'practitioner',
#     'mental', 'wellness', 'self-care',
#     'mentor', 'WIP',
#     'class', 'school', 'peers', 'classmates',
#     'quarter', 'spring', 'winter', 'fall',
# ])

In [16]:
# Normalise the CPD_Q1 text in two steps:
#   1. delete commas, periods, exclamation marks and question marks;
#   2. lowercase everything.
# The pattern is a raw string: the previous non-raw '[,\.!?]' contained the
# invalid escape sequence '\.', which triggers a warning on current Python.
# Inside a character class '.' is already literal, so no backslash is needed.
# The vectorised `.str` accessor replaces the per-element `map(lambda ...)`.
data_Q1['CPD_Q1'] = (
    data_Q1['CPD_Q1']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)
data_Q1.head()

Unnamed: 0,CPD_Q1
0,the class time devoted to health and wellness ...
1,the class time devoted to health and wellness ...
2,the class time has reminded me on the importan...
3,the class time that was devoted to health and ...
4,i have learnt to listen to people without inte...


In [17]:
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily tokenize each item of *sentences* into a list of words.

    Each item is coerced with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``; one token list is yielded per item,
    in input order. ``deacc=True`` asks gensim to strip accent marks from
    the tokens (it is not what removes punctuation -- the tokenizer itself
    only emits word tokens).
    """
    for text in sentences:
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens

# Pull the cleaned CPD_Q1 strings out of the frame as a plain Python list.
# NOTE(review): this rebinds `data`, which previously held the DataFrame read
# from the CSV; re-running the earlier cell that does `data[["CPD_Q1"]]` after
# this one would fail on a list -- confirm the name reuse is intentional.
data = data_Q1["CPD_Q1"].tolist()

# Tokenize every response; the generator is materialised so it can be indexed.
data_words = list(sent_to_words(data))

# Sanity check: show the first 30 tokens of the first response.
print(data_words[0][:30])

['the', 'class', 'time', 'devoted', 'to', 'health', 'and', 'wellness', 'and', 'self', 'care', 'has', 'had', 'positive', 'effect', 'on', 'me', 'while', 'am', 'fairly', 'good', 'at', 'finding', 'pockets', 'of', 'time', 'for', 'self', 'care', 'myself']


### Word and Character Count Distribution