# An Exploration of UW Pharmacy Student Reflections

Author: James Geronimo (jegeronimo@berkeley.edu)

## Libraries and Dependencies

In [131]:
import pandas as pd
import numpy as np
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

import re

from sklearn.feature_extraction.text import TfidfVectorizer

from collections import Counter

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

## Preprocess Data

In [137]:
# Survey export and the two free-text reflection columns analysed in this notebook.
DATA_PATH = "data/choiceboard_data.csv"
RESPONSE_COLUMNS = ["CPD_Q1", "CPD_Q3"]

data = pd.read_csv(DATA_PATH)[RESPONSE_COLUMNS]

# Drop rows with a missing response *before* casting to str: astype(str) turns NaN into the
# literal string "nan", after which dropna() finds nothing to remove and the placeholder
# text would be tokenised and counted like a real answer.
data = data.dropna().astype(str)

print(f"There are {data.shape[0]} rows.")
data.head()

There are 108 rows.


Unnamed: 0,CPD_Q1,CPD_Q3
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...


In [139]:
# Shared NLTK helpers read by the `preprocess` function.
# NOTE: rebinding `stopwords` shadows the `nltk.corpus.stopwords` module imported at the top
# of the notebook; the fully-qualified lookup below keeps this cell safe to re-run.
stopwords = {*nltk.corpus.stopwords.words('english')}
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Turn one raw response into a list of normalised word stems.

    Steps, in order: replace every non-letter character with a space, lowercase,
    tokenize with NLTK's ``word_tokenize``, drop tokens found in the module-level
    ``stopwords`` set, then lemmatize and stem each remaining token with the
    module-level ``lemmatizer`` and ``stemmer``.

    Args:
        text: The raw response string.

    Returns:
        A list of cleaned tokens, in the order they appeared in ``text``.
    """
    # Punctuation and digits become spaces, so "self-care" splits into two words.
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()

    cleaned = []
    for token in word_tokenize(letters_only):
        # Stopwords are matched on the surface form, before lemmatizing/stemming.
        if token in stopwords:
            continue
        cleaned.append(stemmer.stem(lemmatizer.lemmatize(token)))

    return cleaned

# Derived columns are added stage by stage (not question by question) so the column order
# stays: cleaned tokens, raw word counts, cleaned token counts.
question_cols = ('CPD_Q1', 'CPD_Q3')

# Cleaned token list for each response.
for col in question_cols:
    data[col + ' Clean'] = data[col].apply(preprocess)

# Whitespace-delimited word count of the raw response.
for col in question_cols:
    data[col + ' Word Count'] = data[col].apply(lambda response: len(response.split()))

# Number of tokens that survive preprocessing.
for col in question_cols:
    data[col + ' Clean Word Count'] = data[col + ' Clean'].apply(len)

data.head()

Unnamed: 0,CPD_Q1,CPD_Q3,CPD_Q1 Clean,CPD_Q3 Clean,CPD_Q1 Word Count,CPD_Q3 Word Count,CPD_Q1 Clean Word Count,CPD_Q3 Clean Word Count
0,The class time devoted to health and wellness ...,I believe self-care to be really important in ...,"[class, time, devot, health, well, self, care,...","[believ, self, care, realli, import, live, hea...",88,37,42,20
1,The class time devoted to health and wellness ...,The time spent on my choice of self-care allow...,"[class, time, devot, health, well, self, care,...","[time, spent, choic, self, care, allow, relax,...",78,26,39,15
2,The class time has reminded me on the importan...,The time spent on my choice of self care affec...,"[class, time, remind, import, self, care, also...","[time, spent, choic, self, care, affect, posit...",91,63,39,30
3,The class time that was devoted to health and ...,It made me more relaxed and less stressed abou...,"[class, time, devot, health, well, self, care,...","[made, relax, le, stress, upcom, futur, exam, ...",38,34,16,13
4,I have learnt to listen to people without inte...,This has been a lifeline because I always felt...,"[learnt, listen, peopl, without, interrupt, le...","[lifelin, alway, felt, like, wast, time, walk,...",58,50,25,23


## EDA

In [136]:
# Average response length per question, first on the raw text and then after preprocessing.
for label, suffix in (("raw", "Word Count"), ("clean", "Clean Word Count")):
    for question in ("CPD_Q1", "CPD_Q3"):
        mean_words = np.average(data[question + " " + suffix])
        print(question + " " + label + " word count: ", mean_words)

CPD_Q1 raw word count:  62.74074074074074
CPD_Q3 raw word count:  40.138888888888886
CPD_Q1 clean word count:  29.77777777777778
CPD_Q3 clean word count:  19.037037037037038


### Miscellaneous

In [None]:
"""
Potential stop words to include considering the context
of our project
"""

# stopwords.append('pharmacy')
# stopwords.append('pharmacist')
# stopwords.append('pharmacists')
# stopwords.append('health')
# stopwords.append('healthcare')
# stopwords.append('practitioner')
# stopwords.append('mental')
# stopwords.append('wellness')
# stopwords.append('self-care')
# stopwords.append('mentor')
# stopwords.append('WIP')
# stopwords.append('class')
# stopwords.append('school')
# stopwords.append('peers')
# stopwords.append('classmates')
# stopwords.append('quarter')
# stopwords.append('spring')
# stopwords.append('winter')
# stopwords.append('fall')

'\nPotential stop words to include considering the context\nof our project\n'

In [None]:
# Split Q1 and Q3 into different dfs
# NOTE: the code below is disabled by wrapping it in a string literal, so running this cell
# only echoes that string; neither `data_Q1` nor `data_Q3` is defined by it.
"""
data_Q1 = data[["CPD_Q1"]]
data_Q1 = data_Q1.astype(str)

data_Q3 = data[["CPD_Q3"]]
data_Q3 = data_Q3.astype(str)
"""

'\ndata_Q1 = data[["CPD_Q1"]]\ndata_Q1 = data_Q1.astype(str)\n\ndata_Q3 = data[["CPD_Q3"]]\ndata_Q3 = data_Q3.astype(str)\n'