In [54]:
import os
import time
import re
import slackclient
import pandas as pd
import numpy as np

#NLTK
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import wordnet
nltk.download('stopwords')
from nltk import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
stopwords = stopwords.words('english')
from nltk import word_tokenize, sent_tokenize

#SKLEARN
import tensorflow_hub as hub
from sklearn.metrics import pairwise_distances # to perfrom cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer # to perform tfidf

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ankitkothari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### CONNECT THE SLACKBOT BY THIS SECRET TOKEN STARTS WITH 'x...-....')

In [55]:
slack_clients = slackclient.SlackClient('SLACK APP TOKEN')

#### DATA: QUESTIONS AND RESPONSES

In [66]:
pd.set_option('display.max_colwidth', -1)

data = pd.read_csv('./coffee.csv')
data.ffill(axis=0, inplace= True)
context = data['Context'].values
context_string = data['Context'].str.cat(sep='\n')
data.head()

Unnamed: 0,Context,Text Response
0,What is the best temperature to brew coffee?,"According to chemical studies, the optimal water temperature for drip coffee is 95-98C. According to my notes, colder water doesn't extract enough caffeine/essential oils from the beans, and above such temperature the acidity increases wildly."
1,Quality of coffee,"The quality of a brew depends on the following factors (in no particular order):\nTime since grinding the beans.\nTime since roasting.\nCleanliness with brewing equipment.\nBean quality (what crop, etc.).\nWater quality.\n"
2,What is the difference between arabica and robusta?,"Arabica beans and robusta beans are two different species of coffee. They are the primary species of coffee that find their way into the American cup. The general differences are those of taste, and the conditions under which the two species differ in production."
3,Just how much ground coffee do I need for x amount of coffee?,a. Whatever seems right to you. b. It may change slightly from coffee to coffee and according to freshness.
4,What are the different between Preparation Methods\n,Drip\nFrench Press\nEspresso\nPercolator


#### Text Preprocessing

In [67]:
def nltk_cleaning(text):
  token_text = word_tokenize(text)
  clean_text = []
  lemma = wordnet.WordNetLemmatizer()
  tag_list = pos_tag(token_text, tagset=None)
  for token, pos_token in tag_list:
   if token not in '\n\n \n\n\n!"-#$%&()--.*''+,-/:;``<=>[``?@[\\]^_`''{|}~\t\n`\'\'':
     if pos_token.startswith('V'):  # Verb
         pos_val='v'
     elif pos_token.startswith('J'): # Adjective
         pos_val='a'
     elif pos_token.startswith('R'): # Adverb
         pos_val='r'
     else:
         pos_val='n' # Noun
     lemma_token= lemma.lemmatize(token,pos_val)
     clean_text.append(lemma_token.lower())
   else:
      continue 
  return " ".join(clean_text)
data['nltk_cleaning']= data['Context'].apply(nltk_cleaning)
data['nltk_cleaning'].head()

0    what be the best temperature to brew coffee                 
1    quality of coffee                                           
2    what be the difference between arabica and robusta          
3    just how much ground coffee do i need for x amount of coffee
4    what be the different between preparation methods           
Name: nltk_cleaning, dtype: object

#### LOADING IN THE TF SENTENCE ENCODER EMBEDDINGS 

In [68]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])

print(embeddings.shape)

(2, 512)


#### ENCODING THE QUESTIONS WITH ENCODER DIMENSION 512

In [69]:
que_list = [embed([x]) for x in data['nltk_cleaning']]
que_array = np.array(que_list)
que_array=que_array.reshape(27,512)

#### CONVERTING IT INTO A DATAFRAME WITH QUESTIONS X DIMENSION MATRIX

In [70]:
X= np.array([str(i)+"_vector" for i in range(512)])
X = X.reshape(1,512)
print(len(que_array))
match_frame = pd.DataFrame(que_array)
match_frame.shape

27


(27, 512)

In [71]:
sent = 'how many cups can i have in a day'
Question = nltk_cleaning(sent)
print(type(Question))
print(Question)
Question_tf = embed([Question])
Question_tf = np.array(Question_tf)
Question_tf= Question_tf.reshape(-1,512)
print(Question_tf.shape)
print(Question_tf)

<class 'str'>
how many cup can i have in a day
(1, 512)
[[-3.88510115e-02 -6.62912577e-02 -2.90930793e-02  9.06389579e-03
   4.16994728e-02  5.13799228e-02 -2.38981564e-02  2.01780125e-02
  -3.96104939e-02  1.70970932e-02 -2.47557666e-02 -1.89401098e-02
  -1.99111030e-02 -2.19903663e-02 -8.72539729e-02  2.55828705e-02
  -1.35452384e-02 -2.31218990e-02 -9.72076203e-04 -2.82054357e-02
   3.23695391e-02 -1.68545209e-02 -1.97919700e-02 -4.15924983e-03
  -1.99679583e-02 -1.17185013e-02 -3.50795723e-02 -5.35474122e-02
   4.93184291e-03  4.71282639e-02  1.68616567e-02  5.76052666e-02
   2.89916079e-02  2.07012128e-02  4.47171852e-02  3.26159187e-02
   6.54935315e-02  4.01345156e-02  2.22950093e-02 -7.53672868e-02
  -2.10039839e-02  5.81175461e-02 -3.08844093e-02  8.19734111e-02
  -3.66232656e-02  2.67254841e-02 -6.38269186e-02 -4.84988689e-02
   5.53586967e-02 -5.82688898e-02  6.14239909e-02  1.16166130e-01
  -2.74743792e-03 -7.01569393e-02  1.03371078e-02 -2.10806597e-02
  -1.00392610e-01  2

#### SAMPLE ANSWER

In [72]:
cosine_value_tf = 1- pairwise_distances(match_frame, Question_tf, metric = 'cosine' )
index_value = cosine_value_tf.argmax() 
answer = data['Text Response'].iloc[index_value]
answer

'On average, the regular person is safe drinking 3-5 cups of coffee a day. Ultimately, its up to you to understand how your body responds to coffee, and to judge when you shouldn’t have any more. If you respond well to large amounts of caffeine, and don’t become jittery or get stomach pain, then you can drink what you want, but if you know that drinking lots of coffee keeps you up at night and leads to headaches in the morning, cut down to 3 cups a day. Drinking more than 5 cups of coffee a day has been linked to a raise in cholesterol, so if you’re a heavy coffee drinker sticking to 5 is the recommended amount.'

#### Parsing the Incoming Commands from Slack

In [73]:
def parse_bot_commands(slack_events):
    for event in slack_events:
        if event["type"] == "message" and not "subtype" in event:
            message = event["text"]
            return message, event["channel"], event['user']
    return None, None, None


#### How to interpret the incoming message from Slack and send appropriate responses
####  Replace SLACK USER ID with the actual id's of all the user's who can have access to this cahtbot

In [74]:
def handle_command(command, channel,user):
    # Default response is help text for the user
    accepted_users = ['<SLACK USER ID>']
    ending_text = ['bye','done', 'thanks', 'exit', 'ok', 'x']
    if user not in accepted_users:
          response = "Not an authorised user"
    elif command in ending_text:
          response = "Bye! Thanks for chatting"
    else:
      try:
        Question = nltk_cleaning(command)
        print(type(Question))
        print(Question)
        Question_tf = embed([Question])
        Question_tf = np.array(Question_tf)
        Question_tf= Question_tf.reshape(-1,512) 
        cosine_value_tf = 1- pairwise_distances(match_frame, Question_tf, metric = 'cosine' )
        index_value = cosine_value_tf.argmax() 
        response = data['Text Response'].iloc[index_value]
      except:
          response='Sorry, Not sure what you mean'
        
    

    # Sends the response back to the channel
    slack_clients.api_call(
        "chat.postMessage",
        type="divider",
        channel=channel,
        #text=':coffee:'+':coffee:'+"*"+response+"*" + ':coffee:'+':coffee:',
        attachments=[{
        "blocks":[
		{
			"type": "section",
			"block_id": "section567",
			"text": {
				"type": "mrkdwn",
				"text": ':coffee:'+':coffee:'+"*"+response+"*" + ':coffee:'+':coffee:'
			},
			"accessory": {
				"type": "image",
				"image_url": "https://png.pngtree.com/png-clipart/20190903/original/pngtree-yellow-woven-bag-of-coffee-beans-png-image_4418658.jpg",
				"alt_text": "Haunted hotel image"
			}}
		]}]
    )

##Running the application
print("Done")

Done


In [None]:
# constants
RTM_READ_DELAY = 1 # 1 second delay between reading from RTM
if __name__ == "__main__":
    if slack_clients.rtm_connect(with_team_state=False):
        print("Starter Bot connected and running!")
        # Read bot's user ID by calling Web API method `auth.test`
        starterbot_id = slack_clients.api_call("auth.test")["user_id"]
        while True:
            command, channel, user = parse_bot_commands(slack_clients.rtm_read())
            if command == 'shutdownthebot':
                break
            else:
               if command is None:
                  continue
               else:
                  handle_command(command, channel, user)
            time.sleep(RTM_READ_DELAY)
    else:
        print("Connection failed. Exception traceback printed above.")

Starter Bot connected and running!
