### Runtime info

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed May 25 21:19:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


## Load packages and mount Google drive

In [3]:
import nltk
import string
import json
import pandas as pd

#Download only once
nltk.download('punkt')  #pre-trained tokenizer for English
nltk.download('wordnet') #lexical database for the English language

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Load json

In [5]:
f = open('/content/gdrive/My Drive/MSDS 453/Final/sample.json','r',errors = 'ignore')
text = f.read()
text

'{\n    "Food & Wine": "Food & Wine is an American monthly magazine published by Dotdash Meredith. It was founded in 1978 by Ariane and Michael Batterberry.  It features recipes, cooking tips, travel information, restaurant reviews, chefs, wine pairings and seasonal/holiday content and has been credited by The New York Times with introducing the dining public to \\"Perrier, the purple Peruvian potato and Patagonian toothfish\\".\\nThe premier event for the magazine is the Food & Wine Classic in Aspen, Colorado. The Classic features wine tasting, cooking demonstrations, featured speakers, as well as a cooking competition. Held annually in June, the event is considered the kickoff to the Aspen summer season and celebrates its 38th anniversary in 2022.\\nThe winner of Top Chef, the reality television cooking competition, is featured in a spread in this magazine.",\n    "Wine and food pairing": "Wine and food matching is the process of pairing food dishes with wine to enhance the dining ex

# Create dataframe and EDA

In [6]:
df = pd.read_json('/content/gdrive/My Drive/MSDS 453/Final/sample.json', orient = 'index')
df.reset_index(inplace = True)

In [7]:
df.head()

Unnamed: 0,index,0
0,Food & Wine,Food & Wine is an American monthly magazine pu...
1,Wine and food pairing,Wine and food matching is the process of pairi...
2,Prue Leith,"Dame Prudence Margaret Leith, (born 18 Februa..."
3,Wine,Wine is an alcoholic drink typically made from...
4,Sarma Melngailis,"Sarma Melngailis (born September 10, 1972) is ..."


In [8]:
# create copy and rename columns
df = df.copy()
df = df.rename(columns = {'index':'Page', 0 : 'Text' })

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Page    25 non-null     object
 1   Text    25 non-null     object
dtypes: object(2)
memory usage: 528.0+ bytes


# Preprocessing
Source: https://analyticsindiamag.com/how-does-a-simple-chatbot-with-nltk-work/

In [10]:
# Change all text to lowercase
# Create lists of tokenized stences and words
text = text.lower()
sentences = nltk.sent_tokenize(text)
tokens = nltk.word_tokenize(text)

In [11]:
lemmer = nltk.stem.WordNetLemmatizer()
from nltk.corpus import stopwords

# Lemmatizing words or tokens
def LemTokens(tokens):
   return [lemmer.lemmatize(token,'v') for token in tokens if token not in set(stopwords.words('english')) ]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

# Dictionary of integer codes
print(remove_punct_dict)

{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [12]:
# Clean and tokenize  text
def LemNormalize(text):
   return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [13]:
# Bag of words
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Q+A
Source: https://analyticsindiamag.com/how-does-a-simple-chatbot-with-nltk-work/

In [14]:
# Function to identify similarity in sentences 
def response(user_response):
       sentences.append(user_response) # Add user reponse
       cv = CountVectorizer(max_features = 50, tokenizer = LemNormalize, analyzer = 'word') # Create matrix (col = words, rows = sent, value = word count)
       X = cv.fit_transform(sentences)
       vals_cv = cosine_similarity(X[-1], X)
       indx_of_most_similar_sentence = vals_cv.argsort()[0][-2] # Sorting the indexes based on increasing similarity
       flat_vals_cv = vals_cv.flatten()
       flat_vals_cv.sort()
       highest_similarity = flat_vals_cv[-2] # TFIDF

       if(highest_similarity == 0):
              robo_response = "I am sorry! I don't understand you"
              return robo_response
       else:
              robo_response = sentences[indx_of_most_similar_sentence]
              return robo_response

In [15]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Maintain session
exit_codes = ['bye', 'see you', 'c ya', 'exit']
flag = True
print("Hello, I am Anna's Chatbot Assistant, and I will try to answer your questions about wine!")

while(flag==True):
  user_response = input("User: ")
  if user_response.lower() not in exit_codes:
    user_response = user_response.lower()
    print("Chatbot Assistant:", response(user_response))
    sentences.remove(user_response)
    print('\nWould you like to continue asking questions? (yes or no)')
    user_response = input("User: ")

  if user_response.lower() == 'no' or user_response.lower() == 'NO' or user_response.lower() in exit_codes :
    print('Have a great day!')
    flag=False

  else :
    print ("Please ask another question.")


Hello, I am Anna's Chatbot Assistant, and I will try to answer your questions about wine!
User: how is wine made?
Chatbot Assistant: ",
    "wine": "wine is an alcoholic drink typically made from fermented grapes.

Would you like to continue asking questions? (yes or no)
