In [1]:
# refer here: http://robertorocha.info/using-nlp-to-analyze-open-ended-responses-in-surveys/
import spacy
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 1850000

import pandas as pd
df = pd.read_csv('Data/CrisisLogger/crisislogger.csv')
new_df=df.drop(columns='upload_id').rename(columns={'transcriptions':'DATA_COLUMN'})
corpus = new_df['DATA_COLUMN']

2021-11-23 10:42:46.825532: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-23 10:42:46.825558: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:

all_text = corpus.str.cat(sep=' ')

In [4]:
doc = nlp(all_text, disable = ['ner'])

In [5]:
# overall word frequency analysis

from collections import Counter
words = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct] 
word_freq = Counter(words) 
word_freq.most_common(20)

[('know', 285),
 ('time', 185),
 ('like', 183),
 ('go', 173),
 ('school', 171),
 ('work', 164),
 ('think', 139),
 ('home', 131),
 ('thing', 122),
 ('feel', 105),
 ('child', 103),
 ('family', 100),
 ('day', 97),
 ('lot', 89),
 ('kid', 87),
 ('friend', 86),
 ('year', 84),
 ('get', 78),
 ('people', 77),
 ('need', 75)]

In [6]:
# what are the most common adjective-noun phrases?

from spacy.matcher import Matcher 
matcher = Matcher(nlp.vocab) 
pattern = [{'POS':'ADJ'}, {'POS':'NOUN'}] 
matcher.add('ADJ_PHRASE', [pattern]) 
matches = matcher(doc, as_spans=True) 
phrases = [] 
for span in matches:
    phrases.append(span.text.lower())
    phrase_freq = Counter(phrases)
phrase_freq.most_common(30)

[('little bit', 19),
 ('mental health', 18),
 ('high school', 14),
 ('next year', 11),
 ('other people', 7),
 ('5th grader', 6),
 ('same time', 6),
 ('biggest fear', 5),
 ('front lines', 5),
 ('more time', 5),
 ('last time', 5),
 ('much time', 5),
 ('fresh air', 4),
 ('difficult time', 4),
 ('last year', 4),
 ('immediate family', 4),
 ('older son', 4),
 ('social media', 4),
 ('biggest fears', 4),
 ('many people', 4),
 ('different ways', 4),
 ('other children', 3),
 ('8th grade', 3),
 ('second wave', 3),
 ('only child', 3),
 ('many ways', 3),
 ('single mother', 3),
 ('few months', 3),
 ('few people', 3),
 ('public school', 3)]

In [7]:
# The most common adjective that follow the phrase “I am” or “I feel”

feel_adj = []
matcher = Matcher(nlp.vocab)
pattern = [{'LOWER' : {'IN' : ['i', 'we']}}, {'OP': '?'}, 
  {'LOWER': {'IN' : ['feel', 'am', "'m", 'are', "'re"]}}, 
  {'OP': '?'}, {'OP': '?'}, {'POS':'ADJ'}]
matcher.add("FeelAdj", [pattern]) 
matches = matcher(doc, as_spans=True) 
for span in matches:
    feel_adj.extend([token.lemma_ for token in span if token.pos_ == 'ADJ'])
Counter(feel_adj).most_common(20)

[('worried', 16),
 ('lucky', 9),
 ('grateful', 9),
 ('sure', 9),
 ('scared', 8),
 ('little', 8),
 ('able', 7),
 ('okay', 6),
 ('healthy', 5),
 ('sad', 4),
 ('fortunate', 4),
 ('concerned', 4),
 ('glad', 4),
 ('thankful', 3),
 ('angry', 3),
 ('single', 3),
 ('mental', 3),
 ('frustrated', 3),
 ('happy', 3),
 ('good', 3)]

In [13]:
# looks for phrases that start with “I/we want/need”, followed by a noun, with optional filler words in between

want_adj = []
matcher = Matcher(nlp.vocab) 
pattern = [{'LOWER' : {'IN' : ['i', 'we']}}, {'IS_ALPHA':True, 'OP':'?'},
   {'LOWER': {'IN' : ['need', 'want']}}, {'IS_ALPHA':True, 'OP':'?'},
   {'IS_ALPHA':True, 'OP':'?'}, {'POS':'NOUN'}]
   
matcher.add("WantPhrase", [pattern]) 
matches = matcher(doc, as_spans=True)
matches



[I would want for our country,
 I need a break,
 I just want alone time,
 we need celebrate a reborn,
 we definitely need some time,
 we need strength,
 I need the biggest struggle,
 I want to my in,
 I want number,
 I want the schools]

In [15]:
# the words that most frequently occur near the phrase “mental health”:
from spacy.matcher import PhraseMatcher
mental_health_colloc = []
matcher = PhraseMatcher(nlp.vocab, attr = 'LOWER') 
# The attr above ensures all instances are converted to lower-case so the search is case-insensitive
pattern = [nlp.make_doc('mental health')]
matcher.add('mentalHealth', pattern) 
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start-10 : end+10]   
    mental_health_colloc.extend([token.lemma_.lower() for token in span if not token.is_stop and not token.is_punct]) 
Counter(mental_health_colloc).most_common(20)

[('worry', 26),
 ('time', 5),
 ('family', 4),
 ('know', 3),
 ('fear', 2),
 ('message', 2),
 ('lately', 2),
 ('worker', 2),
 ('money', 2),
 ('work', 2),
 (' ', 2),
 ('game', 2),
 ('like', 2),
 ('tool', 2),
 ('handle', 2),
 ('feel', 2),
 ('happen', 1),
 ('doubt', 1),
 ("thatwe'll", 1),
 ('come', 1)]