In [2]:
# Import dependencies 
import numpy as np
import pandas as pd
import random
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Widen text columns to 200 characters so long comments are not truncated when displayed.
# The fully qualified option key is used because pandas notes that abbreviated keys
# can break if a similarly named option is added in a later version.
pd.set_option('display.max_colwidth', 200)

In [3]:
# Read the customer feedback comments from the CSV file into a DataFrame.
csat_reviews_df = pd.read_csv('customer_comments_data.csv')
# Preview three hand-picked rows (positions 28, 48 and 1239) rather than the head of the file.
sample_positions = [28, 48, 1239]
csat_reviews_df.iloc[sample_positions]

Unnamed: 0,comment,label
28,Very clean.,positive
48,Everything fine,positive
1239,I'm appalled by the lack of basic hygiene practices. It's no wonder infections are rampant in this facility.,negative


In [4]:
# Summarize the DataFrame: index range, column names, non-null counts, and dtypes.
csat_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1242 entries, 0 to 1241
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  1242 non-null   object
 1   label    1242 non-null   object
dtypes: object(2)
memory usage: 19.5+ KB


In [5]:
# Keep only ASCII letters and whitespace. Digits, punctuation and apostrophes are dropped,
# so a contraction such as "I'm" becomes "Im".
non_letter_pattern = re.compile(r'[^a-zA-Z\s ]')
csat_reviews_df['comment'] = csat_reviews_df['comment'].map(
    lambda text: non_letter_pattern.sub('', str(text))
)
# Re-display the same three sample rows to see the effect of the cleaning.
csat_reviews_df.iloc[[28, 48, 1239]]

Unnamed: 0,comment,label
28,Very clean,positive
48,Everything fine,positive
1239,Im appalled by the lack of basic hygiene practices Its no wonder infections are rampant in this facility,negative


In [6]:
# Build a bag-of-words vectorizer that
#   - ignores terms found in more than 95% of the comments (max_df=0.95),
#   - ignores terms found in fewer than 10 comments (min_df=10),
#   - drops the built-in "english" stop words.
cv = CountVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=10,
)
cv

In [7]:
# Pull the cleaned comment text out as a Series; this is the corpus that gets vectorized.
comments = csat_reviews_df.loc[:, 'comment']

In [8]:
# Learn the vocabulary from the comments and build the document-term matrix (DTM).
dtm = cv.fit_transform(comments)
# Show the DTM dimensions as (number of comments, vocabulary size).
dtm_shape = dtm.shape
print(dtm_shape)

(1242, 267)


In [9]:
# Peek at the first 100 terms of the learned vocabulary (a leading slice, not a random sample).
vocabulary_terms = cv.get_feature_names_out()
print(vocabulary_terms[:100])

['abcabc' 'able' 'absolutely' 'admitted' 'allowed' 'amazing' 'ambulance'
 'answered' 'appointment' 'appointments' 'area' 'arrived' 'ask' 'asked'
 'assistant' 'attentive' 'available' 'away' 'awesome' 'baby' 'bad'
 'bathroom' 'bed' 'best' 'better' 'birth' 'blood' 'busy' 'button' 'called'
 'came' 'care' 'cared' 'caring' 'case' 'center' 'change' 'charge' 'check'
 'clean' 'clinic' 'cold' 'come' 'comfortable' 'coming' 'communication'
 'compassionate' 'concerns' 'condition' 'contacted' 'courteous' 'covid'
 'cpmc' 'day' 'days' 'delays' 'delivery' 'did' 'didnt' 'different'
 'difficult' 'discharge' 'discharged' 'doctor' 'doctors' 'doing' 'dont'
 'dr' 'early' 'efficient' 'emergency' 'entire' 'er' 'especially' 'exam'
 'excellent' 'exceptional' 'experience' 'explained' 'extremely' 'facility'
 'family' 'fantastic' 'far' 'feel' 'feeling' 'felt' 'floor' 'food'
 'friendly' 'gave' 'given' 'god' 'going' 'good' 'got' 'grateful' 'great'
 'hands' 'happy']


In [10]:
# Print the term counts for the first document (row 0 of the DTM).
# Only that one sparse row is converted to a dense array, instead of densifying the whole
# matrix just to look at a single row. The slice is capped at 500 entries; when the
# vocabulary is smaller than that, the entire row is printed.
print(dtm[0].toarray()[0][:500])

[0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]


In [11]:
# Get the feature names (words) from the CountVectorizer
feature_names = cv.get_feature_names_out()

# Dense term counts for the first document. Despite the variable name, this array holds
# the whole row (zeros included). Only that single sparse row is densified rather than
# the entire matrix.
non_zero_elements = dtm[0].toarray()[0]

# Positions within the row whose count is non-zero.
non_zero_indices = non_zero_elements.nonzero()[0]

# Print each word present in the first document with its vocabulary index and its count.
for idx in non_zero_indices:
    print(f"Word: {feature_names[idx]} | Word index {idx} | Count = {non_zero_elements[idx]}")

Word: arrived | Word index 11 | Count = 1
Word: ask | Word index 12 | Count = 1
Word: discharge | Word index 61 | Count = 2
Word: discharged | Word index 62 | Count = 1
Word: early | Word index 68 | Count = 1
Word: felt | Word index 86 | Count = 1
Word: help | Word index 105 | Count = 2
Word: time | Word index 233 | Count = 1
Word: told | Word index 236 | Count = 1
Word: use | Word index 246 | Count = 1


In [12]:
# Convert the DTM into a DataFrame with one column per vocabulary term.
term_columns = cv.get_feature_names_out()
dtm_df = pd.DataFrame(dtm.toarray(), columns=term_columns)

# Preview the first 10 documents for the 15 terms at column positions 180-194.
dtm_df.iloc[:10, 180:195]

Unnamed: 0,problem,procedure,process,professional,provided,provider,pt,questions,quickly,really,receive,received,recommend,recovery,remember
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Applying LDA

In [13]:
# Start with 5 topics (`n_components=5`); the fixed random_state makes the fit repeatable.
LDA = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
)
# Fit the model on the DTM. `fit` returns the estimator itself, so `LDA_data`
# is simply a second name for the fitted `LDA` object.
LDA_data = LDA.fit(dtm)

In [14]:
# Get the topic-word weights of the fitted model and print them.
# NOTE(review): scikit-learn documents `components_` as unnormalized pseudo-counts; each
# row only becomes a probability distribution after normalization — confirm before
# treating these values as probabilities.
topic_word_distributions = LDA.components_
print(topic_word_distributions)

[[ 0.44921094  5.45091431  0.20074058 ...  0.20018923  0.20014904
   0.20226923]
 [ 0.20184266  0.20501616  0.20345333 ...  2.31796653  0.20198058
   2.23192836]
 [54.22548115  0.20383285  6.16973625 ...  0.20529503  6.28520467
   0.20056306]
 [36.85053687  0.20054937  5.2260624  ...  0.51050314  0.20519834
   6.26136112]
 [29.27292838 17.9396873   0.20000744 ...  9.76604607  6.10746737
  13.10387823]]


In [15]:
# Print the length of each topic's weight array; each should equal the vocabulary size.
# Iterating over the topics directly avoids the redundant index lookup into `components_`.
for topic in LDA.components_:
    print(len(topic))

267
267
267
267
267


In [16]:
# Get the weight array of the first topic (topic index 0).
first_topic = LDA.components_[0]
# Each value is the weight of one vocabulary word within this topic (a weight, not a rank).
# Lower values have less impact than higher values.
print(first_topic)

[  0.44921094   5.45091431   0.20074058   8.32999253   6.00031345
   0.20087186   0.20199582   0.20001409  13.04602386   0.20193661
   4.30644209   3.99657025  21.99907124  10.87816211   0.20143882
   1.79176988   5.40384733   7.86460034   2.0531374    0.20265053
  18.24619842   0.20458481   0.20565173   0.20031964   4.16129066
   0.20117048   0.20175515   2.50951725   1.53475845   3.98344283
   7.34417209  13.5044052    0.2005076    0.64579698   2.113084
   2.25660772   0.20247652   0.20110895   7.88681201   2.0993546
   0.20895579   0.2018889    4.52113882   0.20023097   2.85553373
   9.94240893   0.20096462   0.20002917   0.20048538   2.83770181
  10.65926195  12.18125747   2.60921974  18.59320071  12.41389968
   0.2003943    0.20219222   0.20891454  22.30734391   6.02649274
   2.48762291  14.33895749   7.15833791  17.92531659   5.11009264
   0.20124495   0.20100256   0.2008546    0.20084111  12.58585645
  18.35262958   0.25962776  34.95407684   0.20146      4.18825872
   0.2008076 

In [17]:
# Order the word positions of the first topic from heaviest to lightest weight.
sorted_first_topic_indices = np.argsort(-first_topic)

# Pull the weights out in that order and print them one per line.
sorted_first_topic_values = np.take(first_topic, sorted_first_topic_indices)
for weight in sorted_first_topic_values:
    print(weight)

100.56256946942689
78.078362681168
63.195305718583036
49.88003594313552
40.3605585448495
37.766905886255614
35.51119708000918
34.95407683533763
33.05578117301026
33.03656085359027
29.194503315929236
23.771861546661864
23.394042439066745
22.514242609604025
22.30734390590135
22.193633584833282
22.15546404677068
21.999071238731034
21.387900832763442
20.576510702809088
18.59320070970238
18.352629580852504
18.246198416085267
18.189461201046072
17.92531659441272
16.078752882128807
15.517125029043209
15.473579980635671
15.217510176644822
14.873183358380977
14.70861240623702
14.403432840880297
14.338957493539358
14.037526813227956
13.609028431227049
13.504405196875839
13.046023861128356
12.737785115872711
12.725556060668596
12.5858564467259
12.413899682253318
12.221917347327262
12.215421514410504
12.181257467573957
11.96410060898595
11.710640490643627
11.411140324566789
11.203645254700941
10.933063362493051
10.878162112702924
10.734103908753134
10.706827657156785
10.659261950387034
10.61822438

In [18]:
# Small demonstration of np.argsort: the array holds 10 at index 0, 200 at index 1 and 1 at index 2.
arr = np.array([10, 200, 1])
# argsort returns the positions that would sort the array in ascending order (values 1, 10, 200).
ascending_indices = np.argsort(arr)
# Sorting the negated values yields the positions in descending order instead.
descending_indices = np.argsort(-arr)
print(f"The indices the the array, '10, 200, 1' from least to greatest: {ascending_indices}")
print(f"The indices the the array, '10, 200, 1' from greatest to least: {descending_indices}")

The indices the the array, '10, 200, 1' from least to greatest: [2 0 1]
The indices the the array, '10, 200, 1' from greatest to least: [1 0 2]


In [19]:
# Get the word indices of the first topic ordered from lowest to highest weight
# (argsort returns positions, not the sorted values themselves).
first_topic.argsort()

array([175,   7, 103,  47,  92, 265,  80, 264, 241,  43, 167, 118,  23,
       254, 123, 140,  55, 223,  96, 230,  83,  48, 262, 188,  32,  98,
       117,  76, 122,   2, 228,  75,  68,  67, 203,   5, 207,  85, 165,
        46, 119, 229, 189,  66,  37,  25,  82,  65, 131,  14,  73, 190,
       137, 206,  97, 150, 209, 126, 210, 244, 208, 170, 234, 184,  26,
       197, 192,  94, 138,  41,  95,   9, 211, 148,   6, 248, 187, 180,
        56, 266, 178, 227,  36, 161, 149, 242, 260,  19, 146, 226, 240,
       257, 159, 135, 145, 114, 258, 120,  77,  86, 263, 174, 199,  21,
       224,  22, 128, 252, 142, 261,  57, 183,  40,  71,   0,  33, 177,
       107, 127, 194, 141, 129, 132, 116, 182,  28, 156,  15, 193, 106,
        18,  39, 102,  34,  35,  78, 158, 100, 166,  60, 185,  27, 202,
        52, 212, 191,  89,  49,  44, 218, 232, 121,  90, 147, 259,  79,
        29,  11, 221,  24,  74, 214, 213, 179,  10, 217, 171, 157, 196,
        42, 195, 169, 139, 144, 247,  64, 204, 108,  16,   1, 13

In [20]:
# Locate the extreme weights programmatically instead of hard-coding word indices copied
# from an earlier output, so the cell stays correct if the vocabulary or the model changes.
least_representative_index = first_topic.argmin()
most_representative_index = first_topic.argmax()
# Get the value of the word that is least representative of this topic
print(f"The value of the word that is least representative of this topic is: {first_topic[least_representative_index]}")
# Get the value of the word that is most representative of this topic
print(f"The value of the word that is most representative of this topic is: {first_topic[most_representative_index]}")

The value of the word that is least representative of this topic is: 0.20001153011074146
The value of the word that is most representative of this topic is: 100.56256946942689


In [21]:
# Rank every word position of the first topic by descending weight, then keep the ten
# heaviest (i.e., the top 10 words for topic 0).
descending_word_indices = first_topic.argsort()[::-1]
top_word_indices = descending_word_indices[:10]
print(top_word_indices)

[198 233 249 168 111 220 236  72 154 143]


In [22]:
# Print the word for each of the top ten indices.
# The vocabulary array is fetched once up front instead of being rebuilt on every iteration.
vocabulary = cv.get_feature_names_out()
for index in top_word_indices:
    print(vocabulary[index])

room
time
wait
pain
hours
surgery
told
er
minutes
long


In [23]:
# Print the ten lowest-weighted words of the first topic, ending with the least
# representative one. The vocabulary array is fetched once instead of once per iteration.
vocabulary = cv.get_feature_names_out()
bottom_word_indices = first_topic.argsort()[:10][::-1]
for index in bottom_word_indices:
    print(vocabulary[index])

comfortable
truly
xxxx
facility
years
god
concerns
healthcare
answered
pleasant


In [24]:
# Print the top 20 words for each topic, most representative word first.
# The vocabulary array is fetched once instead of once per word lookup.
vocabulary = cv.get_feature_names_out()
for index, topic in enumerate(LDA.components_):
    # Positions of the 20 largest weights, reversed into descending order.
    top_indices = topic.argsort()[-20:][::-1]
    print(f"The Top 20 Words For Topic #{index+1}")
    print([vocabulary[i] for i in top_indices])
    print('\n')

The Top 20 Words For Topic #1
['room', 'time', 'wait', 'pain', 'hours', 'surgery', 'told', 'er', 'minutes', 'long', 'waiting', 'took', 'times', 'family', 'didnt', 'medication', 'pm', 'ask', 'wasnt', 'meds']


The Top 20 Words For Topic #2
['medical', 'staff', 'appointment', 'patients', 'early', 'dr', 'delays', 'taken', 'treatment', 'right', 'room', 'seen', 'arrived', 'appointments', 'clinic', 'away', 'covid', 'good', 'felt', 'short']


The Top 20 Words For Topic #3
['great', 'staff', 'hospital', 'abcabc', 'experience', 'best', 'stay', 'need', 'time', 'good', 'nursing', 'care', 'nurses', 'team', 'professional', 'thank', 'doctors', 'kind', 'visit', 'felt']


The Top 20 Words For Topic #4
['care', 'nurses', 'excellent', 'thank', 'staff', 'good', 'like', 'nurse', 'hospital', 'doctors', 'abcabc', 'received', 'hands', 'know', 'job', 'caring', 'helpful', 'better', 'dont', 'shift']


The Top 20 Words For Topic #5
['nurse', 'hospital', 'did', 'asked', 'day', 'called', 'told', 'doctor', 'nurses'

### Taking our best guess at the topics.
---
- TOPIC 1: **Quality of Care**
- TOPIC 2: **Clinic/Hospital Operations**
- TOPIC 3: **Care and Courtesy**
- TOPIC 4: **Doctors and Nurses**
- TOPIC 5: **Follow Up and Care**

In [25]:
# Transform the DTM into a document-topic array of shape (number_of_documents, number_of_topics).
topic_results = LDA.transform(dtm)

# Display the shape of the topic results to confirm those dimensions.
topic_results.shape

(1242, 5)

In [26]:
# Show the first comment's topic probability distribution, rounded to 6 decimal places.
first_comment_topics = topic_results[0]
print(first_comment_topics.round(6))

[0.704913 0.248115 0.015625 0.015524 0.015822]


In [27]:
# Order the topic positions of the first comment from most to least probable.
sorted_indices = np.argsort(-topic_results[0])
# Print the topics by rank; both ranks and topic numbers are shown 1-based.
print("Ranking of topics for the first headline:")
for rank, topic_index in enumerate(sorted_indices, start=1):
    topic_probability = topic_results[0, topic_index]
    print(f"   Rank {rank}: Topic {topic_index+1}, Probability: {topic_probability:.6f}")

Ranking of topics for the first headline:
   Rank 1: Topic 1, Probability: 0.704913
   Rank 2: Topic 2, Probability: 0.248115
   Rank 3: Topic 5, Probability: 0.015822
   Rank 4: Topic 3, Probability: 0.015625
   Rank 5: Topic 4, Probability: 0.015524


In [28]:
# Identify the most probable topic for the first comment (+1 converts to 1-based numbering).
np.argmax(topic_results[0]) + 1

np.int64(1)

In [42]:
# Reload the untouched feedback and attach each comment's dominant topic:
# the 1-based position of the largest value in its row of `topic_results`.
customer_feedback = pd.read_csv('customer_comments_data.csv').assign(
    topic=topic_results.argmax(axis=1) + 1
)

In [47]:
# Show the original comment and its assigned topic for the single row at position 1239.
# Columns are selected by name rather than by position so the lookup does not silently
# pick the wrong fields if the column order of the CSV changes.
customer_feedback[['comment', 'topic']].iloc[1239]

comment    I'm appalled by the lack of basic hygiene practices. It's no wonder infections are rampant in this facility.
topic                                                                                                                 2
Name: 1239, dtype: object

### The comment about hygiene and infections was assigned to Topic 2, which we labeled "Clinic/Hospital Operations" above — a sensible match!

## Applying NMF

In [77]:
# Import the dependencies 
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Set the column width to 200 so long comments are not truncated when displayed.
# The fully qualified option key is used because pandas notes that abbreviated keys
# can break if a similarly named option is added in a later version.
pd.set_option('display.max_colwidth', 200)

In [78]:
# Read the customer feedback comments from the CSV file into a fresh DataFrame.
csat_reviews_df = pd.read_csv('customer_comments_data.csv')
# Preview three hand-picked rows (positions 28, 48 and 1239) rather than the head of the file.
sample_positions = [28, 48, 1239]
csat_reviews_df.iloc[sample_positions]

Unnamed: 0,comment,label
28,Very clean.,positive
48,Everything fine,positive
1239,I'm appalled by the lack of basic hygiene practices. It's no wonder infections are rampant in this facility.,negative


In [80]:
# Keep only ASCII letters and whitespace. Digits, punctuation and apostrophes are dropped,
# so a contraction such as "I'm" becomes "Im".
non_letter_pattern = re.compile(r'[^a-zA-Z\s ]')
csat_reviews_df['comment'] = csat_reviews_df['comment'].map(
    lambda text: non_letter_pattern.sub('', str(text))
)
# Re-display the same three sample rows to see the effect of the cleaning.
csat_reviews_df.iloc[[28, 48, 1239]]

Unnamed: 0,comment,label
28,Very clean,positive
48,Everything fine,positive
1239,Im appalled by the lack of basic hygiene practices Its no wonder infections are rampant in this facility,negative


In [81]:
# Build a TF-IDF vectorizer that
#   - ignores terms found in more than 95% of the comments (max_df=0.95),
#   - ignores terms found in fewer than 10 comments (min_df=10),
#   - drops the built-in "english" stop words.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=10,
)
tfidf

In [82]:
# Learn the vocabulary from the cleaned comments and build the TF-IDF weighted DTM.
dtm = tfidf.fit_transform(csat_reviews_df["comment"])
# Show the DTM dimensions as (number of comments, vocabulary size).
dtm_shape = dtm.shape
print(dtm_shape)

(1242, 267)


In [83]:
# Get the feature names (words) from the TfidfVectorizer
feature_names = tfidf.get_feature_names_out()

# Dense TF-IDF values for the first document. Despite the variable name, this array holds
# the whole row (zeros included). Only that single sparse row is densified rather than
# the entire matrix.
non_zero_elements = dtm[0].toarray()[0]

# Positions within the row whose value is non-zero.
non_zero_indices = non_zero_elements.nonzero()[0]

# Print each word present in the first document with its vocabulary index and its
# TF-IDF value (a weight, not an occurrence count).
for idx in non_zero_indices:
    print(f"Word: {feature_names[idx]} | Word index {idx} | Value = {non_zero_elements[idx]}")

Word: arrived | Word index 11 | Value = 0.27714745119362444
Word: ask | Word index 12 | Value = 0.27235725877120304
Word: discharge | Word index 61 | Value = 0.5165349816735995
Word: discharged | Word index 62 | Value = 0.2946791200681169
Word: early | Word index 68 | Value = 0.2701099065790597
Word: felt | Word index 86 | Value = 0.2228384517045112
Word: help | Word index 105 | Value = 0.47071365461802966
Word: time | Word index 233 | Value = 0.18657752742305045
Word: told | Word index 236 | Value = 0.21111656689995012
Word: use | Word index 246 | Value = 0.26795070890343614


In [84]:
# Create an NMF model with 5 topics; the fixed random_state keeps the result repeatable.
nmf_model = NMF(
    n_components=5,
    random_state=42,
)
# Fit the model on the TF-IDF matrix.
nmf_model.fit(dtm)

In [85]:
# Print the length of each topic's weight array; each should equal the vocabulary size.
# Iterating over the topics directly avoids the redundant index lookup into `components_`.
for topic in nmf_model.components_:
    print(len(topic))

267
267
267
267
267


In [86]:
# Get the weight array of the first NMF topic (topic index 0).
first_topic = nmf_model.components_[0]
# Each value is the weight of one vocabulary word within this topic (a weight, not a rank).
# Lower values have less impact than higher values.
print(first_topic)

[0.48233705 0.09033344 0.05462598 0.1247104  0.06741313 0.1316408
 0.08862304 0.02990224 0.         0.         0.10207327 0.07705369
 0.11376133 0.26970156 0.05939011 0.07185104 0.10065423 0.06383091
 0.09901451 0.07850853 0.1761303  0.14191666 0.19435606 0.36295605
 0.0724326  0.05258181 0.19025232 0.02944436 0.09707217 0.18744058
 0.19260972 0.01470131 0.13039386 0.13833834 0.05907804 0.0848446
 0.06472282 0.08798536 0.09999666 0.08088385 0.03435543 0.07477596
 0.17745388 0.09257704 0.07543793 0.07316694 0.04140386 0.03064081
 0.08449179 0.06001162 0.05657981 0.27102243 0.05542686 0.31221131
 0.19874564 0.         0.08983186 0.51991107 0.26304394 0.04061708
 0.05919657 0.20456417 0.12401989 0.33776199 0.32803539 0.04326669
 0.15731585 0.11750146 0.08435521 0.01521484 0.19417087 0.09901197
 0.3132612  0.07882136 0.05095896 0.         0.01777457 0.274551
 0.09899772 0.09004717 0.05163351 0.19496192 0.0262286  0.03619437
 0.15620782 0.06150429 0.31479504 0.16066848 0.16452563 0.09235432

In [87]:
# Rank every word position of the first topic by descending weight, then keep the ten
# heaviest (i.e., the top 10 words for topic 0).
descending_word_indices = first_topic.argsort()[::-1]
top_word_indices = descending_word_indices[:10]
print(top_word_indices)

[215 109 163 164 198  57   0 216 236 139]


In [88]:
# Print the word for each of the top ten indices.
# The vocabulary array is fetched once up front instead of being rebuilt on every iteration.
vocabulary = tfidf.get_feature_names_out()
for index in top_word_indices:
    print(vocabulary[index])

staff
hospital
nurse
nurses
room
did
abcabc
stay
told
like


In [89]:
# Print the top 30 words for each topic.
# Each list is in ascending order of weight, so the most representative word comes last.
# The vocabulary array is fetched once instead of once per word lookup.
vocabulary = tfidf.get_feature_names_out()
for index, topic in enumerate(nmf_model.components_):
    # Positions of the 30 largest weights, lowest of them first.
    top_indices = topic.argsort()[-30:]
    print(f'The top 30 words for topic #{index+1}')
    print([vocabulary[i] for i in top_indices])
    print('\n')

The top 30 words for topic #1
['treated', 'home', 'help', 'thank', 'left', 'didnt', 'surgery', 'asked', 'covid', 'need', 'experience', 'pain', 'day', 'er', 'felt', 'doctors', 'night', 'doctor', 'patient', 'best', 'like', 'told', 'stay', 'abcabc', 'did', 'room', 'nurses', 'nurse', 'hospital', 'staff']


The top 30 words for topic #2
['hands', 'home', 'abcabc', 'center', 'er', 'love', 'making', 'arrived', 'make', 'work', 'best', 'nurse', 'people', 'god', 'help', 'short', 'ive', 'medical', 'timely', 'thank', 'results', 'staff', 'doctors', 'delays', 'team', 'really', 'job', 'service', 'experience', 'good']


The top 30 words for topic #3
['professional', 'manner', 'team', 'felt', 'amazing', 'timely', 'really', 'experience', 'especially', 'wonderful', 'given', 'recommend', 'help', 'floor', 'knowledgeable', 'nurses', 'nursing', 'rn', 'pleasant', 'communication', 'person', 'people', 'work', 'helped', 'abcabc', 'service', 'thank', 'care', 'nurse', 'great']


The top 30 words for topic #4
['jus

### Taking our best guess at the topics.
---
- TOPIC 1: **Clinic/Hospital Operations**
- TOPIC 2: **Care and Courtesy**
- TOPIC 3: **Quality of Care**
- TOPIC 4: **Doctors and Nurses**
- TOPIC 5: **Follow Up and Care**

In [90]:
# Transform the DTM into a document-topic array of shape (number_of_documents, number_of_topics).
topic_results = nmf_model.transform(dtm)

# Display the shape of the topic results to confirm those dimensions.
topic_results.shape

(1242, 5)

In [91]:
# Order the topic positions of the first comment from largest to smallest NMF weight.
sorted_indices = np.argsort(-topic_results[0])
# Print the topics by rank; both ranks and topic numbers are shown 1-based.
# The values are labeled "Weight" because NMF produces non-negative weights that do not
# sum to 1 across topics, so they are not probabilities.
print("Ranking of topics for the first headline:")
for rank, topic_index in enumerate(sorted_indices):
    print(f"   Rank {rank+1}: Topic {topic_index+1}, Weight: {topic_results[0, topic_index]:.6f}")

Ranking of topics for the first headline:
   Rank 1: Topic 4, Probability: 0.058633
   Rank 2: Topic 1, Probability: 0.052212
   Rank 3: Topic 2, Probability: 0.001671
   Rank 4: Topic 3, Probability: 0.000425
   Rank 5: Topic 5, Probability: 0.000000


In [92]:
# Reload the untouched feedback and attach each comment's dominant topic:
# the 1-based position of the largest value in its row of `topic_results`.
customer_feedback = pd.read_csv('customer_comments_data.csv').assign(
    topic=topic_results.argmax(axis=1) + 1
)

In [93]:
# Show the original comment and its assigned topic for the single row at position 1239.
# Columns are selected by name rather than by position so the lookup does not silently
# pick the wrong fields if the column order of the CSV changes.
customer_feedback[['comment', 'topic']].iloc[1239]

comment    I'm appalled by the lack of basic hygiene practices. It's no wonder infections are rampant in this facility.
topic                                                                                                                 1
Name: 1239, dtype: object

### The comment about hygiene and infections was assigned to Topic 1, which we labeled "Clinic/Hospital Operations" above — a sensible match!