In [5]:
import os
import time
import re
import os
import slack 
from slack_sdk import WebClient
from slack_sdk import rtm # Real Time Messaging Client

from slack_sdk.errors import SlackApiError
import pandas as pd
import numpy as np

#NLTK
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import wordnet
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
stopwords = stopwords.words('english')
from nltk import word_tokenize, sent_tokenize

#SKLEARN
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf

#TENSORFLOW
import tensorflow as tf

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### CONNECT THE SLACKBOT BY THIS SECRET TOKEN 

In [6]:
SLACK_BOT_TOKEN = '<bot-token>'

slack_clients = WebClient(token=SLACK_BOT_TOKEN)
rtmclient = slack.RTMClient(token=SLACK_BOT_TOKEN)


In [8]:
rtmclient

<slack_sdk.rtm.RTMClient at 0x7f8ab9032610>

### DATA: QUESTIONS AND RESPONSES

In [9]:
pd.set_option('display.max_colwidth', -1)
data = pd.read_csv('./coffee.csv')
data.ffill(axis=0, inplace= True)
context = data['Context'].values
context_string = data['Context'].str.cat(sep='\n')
data.head()

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Context,Text Response
0,What is the best temperature to brew coffee?,"According to chemical studies, the optimal water temperature for drip coffee is 95-98C. According to my notes, colder water doesn't extract enough caffeine/essential oils from the beans, and above such temperature the acidity increases wildly."
1,Quality of coffee,"The quality of a brew depends on the following factors (in no particular order):\nTime since grinding the beans.\nTime since roasting.\nCleanliness with brewing equipment.\nBean quality (what crop, etc.).\nWater quality.\n"
2,What is the difference between arabica and robusta?,"Arabica beans and robusta beans are two different species of coffee. They are the primary species of coffee that find their way into the American cup. The general differences are those of taste, and the conditions under which the two species differ in production."
3,Just how much ground coffee do I need for x amount of coffee?,a. Whatever seems right to you. b. It may change slightly from coffee to coffee and according to freshness.
4,What are the different between Preparation Methods\n,Drip\nFrench Press\nEspresso\nPercolator


### Text Preprocessing

In [10]:
def nltk_cleaning(text):
  token_text = word_tokenize(text)
  clean_text = ["unk"]
  lemma = wordnet.WordNetLemmatizer()
  tag_list = pos_tag(token_text, tagset=None)
  for token, pos_token in tag_list:
   if token not in '\n\n \n\n\n!"-#$%&()--.*''+,-/:;``<=>[``?@[\\]^_`''{|}~\t\n`\'\'' and (token not in stopwords):
     if pos_token.startswith('V'):  # Verb
         pos_val='v'
     elif pos_token.startswith('J'): # Adjective
         pos_val='a'
     elif pos_token.startswith('R'): # Adverb
         pos_val='r'
     else:
         pos_val='n' # Noun
     lemma_token= lemma.lemmatize(token,pos_val)
     clean_text.append(lemma_token.lower())
   else:
      continue 
  return " ".join(clean_text)
data['nltk_cleaning']= data['Context'].apply(nltk_cleaning)
data['nltk_cleaning'].head()

0    unk what best temperature brew coffee             
1    unk quality coffee                                
2    unk what difference arabica robusta               
3    unk just much ground coffee i need x amount coffee
4    unk what different preparation methods            
Name: nltk_cleaning, dtype: object

#### EMBEDDING DIMENSIONS

In [11]:
####word embeddings 
EMBEDDING_DIM = 300

# load in pre-trained word vectors
print('Loading word vectors...')
word2vec = {}
with open(os.path.join('./glove.6B.%sd.txt' % EMBEDDING_DIM)) as f:
  # is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


#### TOKENIZATION

In [12]:
sentences = data['nltk_cleaning'].fillna("DUMMY_VALUE").values
tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
print("length of sequences",len(sequences))
max_len = [len(s) for s in sentences]
#max_index = np.argmax(max_len)
#print(max_len[max_index])
print(sequences)
print(len(tokenizer.index_word))
token_words = [[tokenizer.index_word[i] for i in j] for j in sequences]
print(token_words)
data['question_tokens']=token_words

length of sequences 27
[[1, 3, 17, 18, 9, 2], [1, 19, 2], [1, 3, 10, 20, 21], [1, 22, 5, 23, 2, 24, 25, 26, 27, 2], [1, 3, 6, 28, 29], [1, 30, 5, 2], [1, 31, 2], [1, 3, 32, 2, 11], [1, 3, 6, 11, 33], [1, 4, 5, 7, 8, 2], [1, 4, 12, 34, 8, 2], [1, 3, 35, 36, 2], [1, 37, 2, 13], [1, 38, 14, 2, 39], [1, 4, 40, 7, 41, 2], [1, 42, 2, 43], [1, 4, 2, 44], [1, 3, 10, 45, 46, 2], [1, 47, 2, 48, 49], [1, 3, 2, 7], [1, 4, 50, 2, 51], [1, 4, 12, 8, 2, 52, 14, 53], [1, 3, 54], [1, 3, 55, 56, 9, 6, 57, 2], [1, 15, 2, 58, 16, 13], [1, 15, 2, 59, 16, 60], [1, 61, 62, 2, 63]]
63
[['unk', 'what', 'best', 'temperature', 'brew', 'coffee'], ['unk', 'quality', 'coffee'], ['unk', 'what', 'difference', 'arabica', 'robusta'], ['unk', 'just', 'much', 'ground', 'coffee', 'i', 'need', 'x', 'amount', 'coffee'], ['unk', 'what', 'different', 'preparation', 'methods'], ['unk', 'effects', 'much', 'coffee'], ['unk', 'varieties', 'coffee'], ['unk', 'what', 'mean', 'coffee', 'roast'], ['unk', 'what', 'different', 'roast',

In [13]:
data['question_tokens']

0     [unk, what, best, temperature, brew, coffee]                 
1     [unk, quality, coffee]                                       
2     [unk, what, difference, arabica, robusta]                    
3     [unk, just, much, ground, coffee, i, need, x, amount, coffee]
4     [unk, what, different, preparation, methods]                 
5     [unk, effects, much, coffee]                                 
6     [unk, varieties, coffee]                                     
7     [unk, what, mean, coffee, roast]                             
8     [unk, what, different, roast, degree]                        
9     [unk, how, much, caffeine, cup, coffee]                      
10    [unk, how, many, calorie, cup, coffee]                       
11    [unk, what, fair, trade, coffee]                             
12    [unk, is, coffee, bad]                                       
13    [unk, can, drink, coffee, pregnant]                          
14    [unk, how, long, caffeine, last, coffee]  

### DEFINING SOME SENTENCE EMBEDDING FUNCTIONS

In [14]:
#COMPUTING VECTORS FOR THE QUESTIONS
def question_embedding(ss1):
  v1=np.mean([word2vec[word] if word in word2vec else word2vec['unk'] for word in ss1], axis=0)
  #print(v1)
  return v1

def cosine_similarity_sentence(ss1,ss2):
  v1=np.mean([word2vec[word] for word in ss1], axis=0)
  v2=np.mean([word2vec[word] for word in ss2], axis=0)
  cosine_sent = 1- pairwise_distances([v1],[v2], metric = 'cosine' )
  return cosine_sent[0]*100

In [15]:
que_list = [question_embedding(x) for x in data['question_tokens']]
que_array = np.array(que_list)
que_array.shape

(27, 300)

In [16]:
#assert('UNK' in word2idx)
word2vec['unk'][0:10]

array([ 0.30071 , -0.46867 , -0.20617 , -0.80978 , -0.23889 ,  0.24329 ,
        0.016538, -0.035687, -0.22306 ,  0.95189 ], dtype=float32)

In [17]:
X= np.array([str(i)+"_vector" for i in range(100)])
X = X.reshape(1,100)
print(len(que_array))
match_frame = pd.DataFrame(que_array)
match_frame.shape

27


(27, 300)

In [18]:
match_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.102502,0.17515,0.065827,-0.148238,-0.327283,0.180668,0.005131,0.100618,0.100321,-0.716597,...,-0.104339,0.157619,-0.077268,0.09015,0.083713,-0.128634,-0.027135,-0.116385,-0.288812,0.405632
1,0.076347,0.214803,0.02259,-0.203693,-0.361413,0.288328,0.208896,0.083021,0.009539,-0.646027,...,-0.060357,0.244567,-0.331037,-0.016966,-0.047183,0.125363,0.192513,-0.123237,-0.193994,-0.00981
2,-0.28402,-0.108906,0.137059,-0.066694,-0.187683,0.359326,0.116956,-0.238365,-0.136817,-0.536491,...,-0.11433,0.20922,-0.041963,0.091886,-0.205666,0.181544,0.111194,-0.081522,-0.220932,0.055634
3,-0.240735,0.099508,0.026817,-0.070115,-0.247514,0.227813,0.126356,-0.065146,0.114613,-1.37209,...,-0.019242,-0.002697,-0.07246,0.007849,0.170958,-0.092773,0.038488,-0.185478,-0.069213,0.151109
4,-0.165602,0.063018,0.225127,-0.284632,-0.012935,0.018933,-0.177189,0.064056,-0.061336,-1.288962,...,-0.067929,-0.188518,0.001878,0.220693,0.009107,0.054296,-0.013229,-0.056367,-0.084657,-0.061982


### Text Vectorization 

pairwise distance provide distance between two array.so more pairwise distance means less similarity.while cosine similarity is 1-pairwise_distance so more cosine similarity means more similarity between two arrays.

In [19]:
sent = 'how many cups can i have in a day'
Question = nltk_cleaning(sent).split(' ')
q_array = np.array(Question)
Question_tf = question_embedding(q_array)
Question_tf=Question_tf.reshape(-1,EMBEDDING_DIM)

In [20]:
cosine_value_tf = 1- pairwise_distances(match_frame, Question_tf, metric = 'cosine' )
index_value = cosine_value_tf.argmax() 
answer = data['Text Response'].iloc[index_value]
answer

'On average, the regular person is safe drinking 3-5 cups of coffee a day. Ultimately, its up to you to understand how your body responds to coffee, and to judge when you shouldn’t have any more. If you respond well to large amounts of caffeine, and don’t become jittery or get stomach pain, then you can drink what you want, but if you know that drinking lots of coffee keeps you up at night and leads to headaches in the morning, cut down to 3 cups a day. Drinking more than 5 cups of coffee a day has been linked to a raise in cholesterol, so if you’re a heavy coffee drinker sticking to 5 is the recommended amount.'

### Parsing the Incoming Commands from Slack

#### How to interpret the incoming message from Slack and send appropriate responses

In [21]:
import slack
RTM_READ_DELAY=30

@slack.RTMClient.run_on(event='message')
def coffee_helper_bot(**payload):
    user_data = payload['data']
    #print(data)
    command =  user_data['text']
    channel_id =  user_data['channel']
    thread_ts =  user_data['ts']
    user =  user_data['user']
    event_subtype = user_data.get('subtype')
    print(event_subtype)
    

    # Return data to only authorised users
    accepted_users = ['UHYFFGK7S']
    ending_text = ['bye','done', 'thanks', 'exit', 'ok', 'x']
    
    command = user_data['text']
    #print(command)
    
    #Check if the user is authorised to use the bot
    if user not in accepted_users:
        response = "Not an authorised user"
    elif command in ending_text:
             response = "Bye! Thanks for chatting"
    else:
        #preprocess the command and clean it
        Question = nltk_cleaning(command).split(' ')
        #convert the question to array
        q_array = np.array(Question)
        #convert the question to embedding
        Question_tf = question_embedding(q_array)
        #reshape the question to 1,300
        Question_tf=Question_tf.reshape(-1,EMBEDDING_DIM)
        #calculate the cosine similarity
        cosine_value_tf = 1- pairwise_distances(match_frame, Question_tf, metric = 'cosine' )
        #get the highest value for the similarity
        value = cosine_value_tf.max()
        print(f' cosine value {value}')
        if value > 0.3:
            #get the index of the highest value
            index_value = cosine_value_tf.argmax() 
        #   get the answer from the dataframe
            response = data['Text Response'].iloc[index_value]
        else:
            response = "Sorry, I don't understand the question"
        #print(response)

    
    webclient = payload['web_client']
    # Send the response back to the channel where the message was posted
    webclient.chat_postMessage(
            channel=channel_id,
            thread_ts=thread_ts, 
            text=response,
            icon_emoji=':coffee:'
        )
    time.sleep(RTM_READ_DELAY)



In [23]:
# Start the RTM client and call the 
#rtmclient.start()


In [24]:
#rtmclient.stop()