<a href="https://colab.research.google.com/github/bouazzaayyoub/angular-dynamic-form/blob/master/chatbot_v2_ipynp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [None]:
%%capture
!python3 -m spacy download fr_core_news_md
!python3 -m spacy download en_core_web_sm
!pip install fuzzywuzzy

In [None]:
import nltk
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import wordnet
import fr_core_news_md,en_core_web_sm
import re,json,random
from urllib.request import urlopen

In [None]:
%%capture
nltk.download('punkt')
nltk.download('wordnet')

# Useful functions

### remove duplication

In [None]:
def remove_duplication(word):
  """
    Collapse doubled letters until the word is known to WordNet.

    The current candidate is returned as soon as ``wordnet.synsets`` finds an
    entry for it. Otherwise a doubled character is collapsed and the lookup is
    retried, until either a known word is reached or there is nothing left to
    collapse.
  """
  doubled_char = re.compile(r'(\w*)(\w)\2(\w*)')
  candidate = word
  while not wordnet.synsets(candidate):
    collapsed = doubled_char.sub(r'\1\2\3', candidate)
    if collapsed == candidate:
      # no doubled character left: give up and keep what we have
      break
    candidate = collapsed
  return candidate
remove_duplication("hello"),remove_duplication("blaaaaaabla")

('hello', 'blabla')

### Lemmatization & Stemming

In [None]:
# Load the spaCy pipelines used to reduce words to their base (lemma) form
spacy_fr = fr_core_news_md.load()
spacy_en = en_core_web_sm.load()

def fr_lemmatizer(w):
  """Return the lemma of the first token the French pipeline produces for ``w``."""
  return spacy_fr(w)[0].lemma_

def eng_lemmatizer(w):
  """Return the lemma of the first token the English pipeline produces for ``w``."""
  return spacy_en(w)[0].lemma_

# Arabic words are reduced with NLTK's ISRI stemmer
ar_lemmatizer = ISRIStemmer().stem

def lemmatizer(word):
  """Normalize ``word``: collapse doubled letters, then apply the English, French and Arabic reducers in that order."""
  return ar_lemmatizer(fr_lemmatizer(eng_lemmatizer(remove_duplication(word))))

In [None]:
# Spot-check the combined lemmatizer on three sample words
lemmatizer("السلااام"),lemmatizer("donne"),lemmatizer("yeeux")

('سلم', 'donne', 'oeil')

# Importing dataset

In [None]:
# Download the intents dataset. The context manager closes the HTTP response
# as soon as its body has been read.
with urlopen('https://raw.githubusercontent.com/DadiAnas/AI-Chatbot-FlaskServer/master/datasets/intents.json') as response:
    data_file = response.read()  # raw JSON payload (bytes)
intents = json.loads(data_file)  # parsed dataset

# Preparing Dataset




## Organize data in lists

In [None]:
# Containers filled while walking the dataset
words = []          # every token of every pattern (duplicates included)
classes = []        # one tag per pattern (duplicates included)
documents = []      # (tokens, tag) pairs
ignore_words = ['?', '!', ';', '.', ',']  # punctuation left out of the vocabulary

# For each pattern: tokenize it, collect its tokens, and remember which tag it belongs to
for intent in intents['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:
        tokens = nltk.word_tokenize(pattern)
        words += tokens
        documents.append((tokens, tag))
        classes.append(tag)

## Lemmatizing & stemming the word list

In [None]:
# Lowercase and lemmatize every token, leaving out the punctuation listed in ignore_words
words = list(map(lemmatizer, (w.lower() for w in words if w not in ignore_words)))

## remove duplication & sort

In [None]:
# Drop duplicates, then sort so each word / class gets a deterministic index
words = sorted(set(words))
classes = sorted(set(classes))

In [1]:
# Report the size and content of each collection built above
print(f"{len(documents)} documents {documents}")

print(f"{len(classes)} classes {classes}")

print(f"{len(words)} unique lemmatized words {words}")

NameError: ignored

### Make Training dataset

In [None]:
# Build one [bag-of-words, one-hot tag] pair per training pattern
training = []
output_empty = [0] * len(classes)
for doc in documents:
    # doc is a (tokens, tag) pair. Lemmatize the tokens the same way the
    # vocabulary was built; a set keeps the membership test below O(1).
    pattern_words = {lemmatizer(word.lower()) for word in doc[0]}
    # 1 where the vocabulary word occurs in this pattern, 0 elsewhere
    bag = [1 if w in pattern_words else 0 for w in words]

    # output is a '0' for each tag and '1' for current tag (for each pattern)
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])
# shuffle the samples before splitting them into features and targets
random.shuffle(training)

# create train and test lists. X - patterns, Y - intents
# Taken straight from the shuffled Python list: the bag and the one-hot row
# generally have different lengths, and turning such ragged pairs into a plain
# ndarray raises ValueError on recent NumPy releases.
X = [pair[0] for pair in training]
y = [pair[1] for pair in training]
# Keep `training` available as an array; dtype=object is required for ragged rows.
training = np.array(training, dtype=object)
print("Training data created")
print(X[0])
print(y[0])

Training data created
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,



In [None]:
# Number of training samples (one per pattern)
len(X)

442

## Split Dataset

In [None]:
# Hold out 25% of the samples for testing; random_state=42 is passed so the split is seeded
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Display the size of each split
len(X_train),len(X_test)

(331, 111)

# Build ANN model

### Initialize the model

In [None]:
# Empty Sequential model; layers are appended in the following cells
model = Sequential()

### Add input layer: 128 neurons, ReLU activation | Adding Dropout to avoid overfitting

In [None]:
# First Dense layer: 128 units, ReLU; the input width is the length of one training vector
model.add(Dense(128, input_shape=(len(X_train[0]),), activation='relu'))
model.add(Dropout(0.5))  # dropout rate 0.5, intended to limit overfitting

### Add hidden layer: 64 neurons, ReLU activation | Adding Dropout to avoid overfitting

In [None]:
# Hidden Dense layer: 64 units, ReLU, followed by dropout with rate 0.5
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

### Add output layer: one neuron per intent, softmax activation

In [None]:
# Output layer: one softmax unit per position of the one-hot target vector
model.add(Dense(len(y_train[0]), activation='softmax'))

### Compile the model: stochastic gradient descent with Nesterov accelerated gradient

In [None]:
# SGD with Nesterov momentum. `learning_rate` is the current keyword; `lr` is a
# deprecated alias.
# NOTE(review): `decay` is also a legacy argument in recent Keras releases —
# confirm it is still accepted by the installed version.
sgd = SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# Categorical cross-entropy matches the one-hot targets and softmax output
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

### Build the model

In [None]:
# Build the model explicitly; the input shape was already given on the first Dense layer
model.build()

In [None]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               74624     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 25)                1625      
Total params: 84,505
Trainable params: 84,505
Non-trainable params: 0
_________________________________________________________________


# Training model

In [None]:
# Fit the model on the training split: 200 epochs, batches of 10 samples.
# The Python lists are converted to NumPy arrays before being handed to Keras.
model.fit(np.array(X_train), np.array(y_train), epochs=200, batch_size=10,  verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fc3b4561208>

# Evaluate the model

In [None]:
def find_best_threshold():
  """
    Search for the probability cut-off that maximizes accuracy on the test split.

    Candidate thresholds 0.00, 0.01, ..., 0.99 are tried in increasing order.
    The model's predictions for ``X_test`` are binarized with
    ``probabilities > threshold`` and scored against ``y_test`` with
    ``metrics.accuracy_score``. On ties the lowest threshold wins; if no
    candidate scores above 0, the initial value 0.001 is returned.

    Reads the globals ``model``, ``X_test`` and ``y_test``.
  """
  ERROR_THRESHOLD = 0.001
  best_accuracy = 0
  # The predictions do not depend on the threshold, so compute them once.
  # An ndarray is passed, as in the model.fit call, rather than a list of lists.
  probabilities = model.predict(np.array(X_test))
  # Derive each candidate from an integer counter: repeatedly adding 0.01
  # accumulates float error (0.20000000000000004 instead of 0.2).
  for step in range(100):
    threshold = step / 100
    accuracy = metrics.accuracy_score(y_test, probabilities > threshold)
    if accuracy > best_accuracy:
      best_accuracy = accuracy
      ERROR_THRESHOLD = threshold
  return ERROR_THRESHOLD
ERROR_THRESHOLD = find_best_threshold()
print(ERROR_THRESHOLD)

0.20000000000000004


In [None]:
# Binarize the predicted probabilities with the selected threshold.
# X_test is converted to an ndarray, as done for model.fit, because Keras may
# interpret a plain Python list as a collection of separate inputs.
y_pred = model.predict(np.array(X_test)) > ERROR_THRESHOLD
# Accuracy of the thresholded predictions on the held-out split
metrics.accuracy_score(y_test,y_pred)

0.5855855855855856

# Use the model

In [None]:
def clean_up_sentence(sentence):
    """Tokenize ``sentence`` with NLTK and return its lowercased, lemmatized tokens as a list."""
    return [lemmatizer(token.lower()) for token in nltk.word_tokenize(sentence)]

def bow(sentence, words, show_details=True):
    """
    Encode ``sentence`` as a bag-of-words vector over the vocabulary ``words``.

    The sentence is normalized with ``clean_up_sentence``; position ``i`` of the
    bag is 1 when ``words[i]`` is exactly equal to one of the normalized tokens,
    and 0 otherwise (the comparison is an exact match, not a fuzzy one).

    Parameters:
        sentence: raw text to encode.
        words: vocabulary, one entry per bag position.
        show_details: when true, print a line for every match found.

    Returns:
        A NumPy array of shape (1, len(words)) — a batch holding this single bag.
    """
    sentence_words = clean_up_sentence(sentence)
    bag = [0] * len(words)
    # Map each vocabulary entry to its position(s) once, so that every token is
    # resolved with a lookup instead of a scan over the whole vocabulary.
    positions = {}
    for i, w in enumerate(words):
        positions.setdefault(w, []).append(i)
    for s in sentence_words:
        for i in positions.get(s, ()):
            # assign 1 if current word is in the vocabulary position
            bag[i] = 1
            if show_details:
                print ("found in bag: %s" % words[i])
    return np.array([np.array(bag)])

def predict_class(sentence, model):
    """
      Rank the intents predicted for ``sentence``.

      The sentence is encoded with ``bow`` against the global ``words``
      vocabulary and passed to ``model.predict``. Every class whose score is
      strictly greater than the global ``ERROR_THRESHOLD`` is returned, highest
      score first, as a dict with the keys "intent" (the name taken from the
      global ``classes``) and "probability" (the score converted to a string).
    """
    scores = model.predict(bow(sentence, words, show_details=False))[0]
    ranked = sorted(
        ((index, score) for index, score in enumerate(scores) if score > ERROR_THRESHOLD),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [{"intent": classes[index], "probability": str(score)} for index, score in ranked]

def getResponse(ints, intents_json, fallback=("Sorry, I did not understand that.", None)):
    """
      Pick a reply for the best predicted intent.

      Parameters:
          ints: predicted intents, best first; each item is a dict with an
              'intent' key. May be empty when nothing was predicted.
          intents_json: dataset dict whose 'intents' list holds dicts with
              'tag' and 'responses' keys.
          fallback: (response, tag) tuple returned when ``ints`` is empty, when
              the predicted tag is absent from the dataset, or when the matching
              intent has no responses.

      Returns:
          A (response, tag) tuple; the response is chosen at random among the
          responses of the first dataset intent whose tag matches.
    """
    # Nothing predicted: previously ints[0] raised IndexError here
    if not ints:
        return fallback
    tag = ints[0]['intent']
    for intent in intents_json['intents']:
        if intent['tag'] == tag:
            if not intent['responses']:
                # random.choice would raise on an empty list
                return fallback
            return (random.choice(intent['responses']), tag)
    # Unknown tag: previously the unassigned `result` raised UnboundLocalError
    return fallback

def chatbot_response(msg):
    """Classify ``msg`` with the global ``model`` and return what ``getResponse`` picks from the global ``intents``."""
    return getResponse(predict_class(msg, model), intents)


In [None]:
# Send a sample message and keep only element 0 of the returned tuple (the reply text)
resp = chatbot_response("slaaam")[0]
resp

'سلااام، كي داير بيخير؟'

In [None]:
def make_conversation():
  """
    Run an interactive chat loop on standard input.

    Each turn reads a message after the 'Me:' prompt, prints the bot's reply,
    and stops once the reply's tag (element 1 of the response) is 'good_bye'.
  """
  while True:
    answer = chatbot_response(str(input('Me:')))
    print(f'chatbot:{answer[0]}')
    if answer[1] == 'good_bye':
      break
make_conversation()


Me:السلام و عليكم
chatbot:عليكم السلام كيف يمكنني مساعدتك
Me:عفاك بيت نسولك
chatbot:تقدر تسولني على أي حاجة متعلقة بفيروس كورونا :الاحصائيات ,مستجدات,و معلومات عامة عن فيروس كورونا
Me:ok, chno hya corona ?
chatbot:مرض كوفيد-19 هو مرض معد يسببه آخر فيروس تم اكتشافه من سلالة فيروسات كورونا. ولم يكن هناك أي علم بوجود هذا الفيروس الجديد ومرضه قبل بدء تفشيه في مدينة ووهان الصينية في كانون الأول/ ديسمبر 2019. وقد تحوّل كوفيد-19 الآن إلى جائحة تؤثر على العديد من بلدان العالم.
Me:chokran
chatbot:مرحبا فأي وقت
Me:chkun nta b3da ?
chatbot:أنا شاطبوت،تم التصنيع ديالي من طرف طلبة مهندسين و الدور ديالي نجاوب على أي تساؤل عندو علاقة بكرونا، ويلا معرفتش دبا الجواب غنعرف من بعد و اجي سولني 
Me:هههههه اوكي
chatbot:شكرا ليك
Me:الله اعاونك
chatbot:مرحبا فأي وقت
Me:yalah bye db
chatbot:بسلامة
