In [None]:
import os, random, gzip, json, re
import tensorflow as tf
import tensorflow.keras as keras
import encoder_client
import numpy as np

In [None]:
# Gzip-compressed dialogue corpus; default input of Chatbot.__init__, read by
# Chatbot._extract_pairs.
DIALOGUE_FILE = "en-comedy.txt.gz"
# JSON file of first names, loaded by Chatbot._extract_pairs to filter utterances.
FIRST_NAMES = "first_names.json"
# Address of the pre-trained encoder archive (the same URL that
# Chatbot.__init__ hands to encoder_client.EncoderClient).
MODEL_URI = "https://home.nr.no/~plison/data/model.tar.gz"

In [None]:
class Chatbot:
    """A dual encoder model for a Retrieval chatbot"""
    
    def __init__(self, dialogue_data=DIALOGUE_FILE, model_uri=MODEL_URI):
        """Initialise a retrieval chatbot based on a dual encoder architecture.

        Utterances are encoded using the pre-trained ConveRT model
        (https://arxiv.org/abs/1911.03688).

        Args:
            dialogue_data: path of the gzip-compressed dialogue file from which
                the (context, response) pairs are extracted.
            model_uri: location of the encoder model archive given to
                ``encoder_client.EncoderClient``. Defaults to the module
                constant ``MODEL_URI`` instead of a duplicated literal.
        """
        # Extracts the (context, response) pairs
        self.pairs = self._extract_pairs(dialogue_data)

        # Loads the ConveRT utterance encoder
        self.client = encoder_client.EncoderClient(model_uri)

        # Compute the embeddings for the responses (takes some time to compute!)
        responses = [response for _, response in self.pairs]
        self.response_embeddings = self.client.encode_responses(responses)
        
    '''
    def _cond_satisfied(self, column, names, outputs):
        self.outputs = outputs
        if not column.isupper():
            if column[0] not in ["♪"]:
                if not any(i in "-¶,()[]:;" for i in column):
                    if "..." not in column:
                        if column[0:3] not in ["###"]:
                            if len(column.split()) < 10:
                                if len(column.split()) > 1 and not any(i in names for i in self.outputs):
                                    return True
        else:
            return False

        '''

    def _extract_pairs(self, dialogue_data, max_nb_pairs=25000):
        """Extract clean (context, response) pairs from a gzipped dialogue file.

        The file is read two lines at a time: lines 1-2 form the first
        candidate pair, lines 3-4 the second, and so on (a trailing unpaired
        line is ignored). The 'context' is the first utterance of a candidate
        (such as a question) and the 'response' the utterance that follows it.

        A candidate is kept only when BOTH of its utterances:
        - contain none of the characters ``- ¶ , ( ) [ ] : ; "``
        - do not contain ``...``
        - do not start with ``♪`` or ``###``
        - are not entirely in uppercase
        - have more than 1 and fewer than 10 whitespace-separated tokens
          (slightly stricter than the required "at most 10 words")
        - contain no word listed in the FIRST_NAMES json file.

        Every accepted pair is also written, one per line, to the file
        ``chatbot_output_file`` in the current working directory.

        Args:
            dialogue_data: path of the gzip-compressed, UTF-8 encoded file with
                one utterance per line.
            max_nb_pairs: extraction stops as soon as this many pairs have
                been collected.

        Returns:
            A list of at most ``max_nb_pairs`` (context, response) tuples of
            strings. The same list is also stored in ``self.outputs``.
        """
        # Characters that disqualify an utterance (the double quote included).
        forbidden_chars = set('-¶,()[]:;"')

        with open(FIRST_NAMES, encoding="utf-8") as json_file:
            # A set gives constant-time lookups. Assumes the JSON holds a flat
            # collection of name strings spelled as they appear in the
            # dialogues (e.g. capitalised) -- TODO confirm against the file.
            first_names = set(json.load(json_file))

        def _is_clean(utterance):
            """Tell whether a single utterance passes every filter."""
            if not 1 < len(utterance.split()) < 10:
                return False
            if utterance.isupper():
                return False
            # '###' lines are presumably separators between movies/series;
            # verify against the corpus.
            if utterance.startswith(("♪", "###")) or "..." in utterance:
                return False
            if not forbidden_chars.isdisjoint(utterance):
                return False
            # Names are matched on whole words, so "Johnny" is not rejected
            # merely because it shares letters with a listed name.
            words = re.findall(r"\w+", utterance)
            return not any(word in first_names for word in words)

        self.outputs = []

        # The corpus is streamed rather than loaded in full: extraction
        # usually stops long before the end of the file.
        with gzip.open(dialogue_data, "rb") as lines:
            with open("chatbot_output_file", "w", encoding="utf-8") as results:
                # zip() over the same iterator twice yields (line 1, line 2),
                # (line 3, line 4), ... i.e. non-overlapping consecutive pairs.
                for raw_context, raw_response in zip(lines, lines):
                    if len(self.outputs) >= max_nb_pairs:
                        break

                    context = raw_context.decode("utf-8").strip()
                    response = raw_response.decode("utf-8").strip()

                    if _is_clean(context) and _is_clean(response):
                        self.outputs.append((context, response))
                        results.write(f"{(context, response)}\n")

        return self.outputs
    
    def get_response(self, user_utterance):
        """Extracts the context embedding for the user utterance, and then computes
        the dot product of this embeddings with all the response embeddings (already
        computed in self.response_embeddings). The response with the highest dot 
        product is then selected. 
        
        To get the context embedding for the user utterance, simply use the method
        client.encode_contexts(...).
        
        The method returns a string with the response of the chatbot."""
        
        #raise NotImplementedError()

        encoded_utterance = self.client.encode_contexts([user_utterance])

        dot_product = []
        for i in self.response_embeddings:
            dot_product.append(np.dot(encoded_utterance, i))
        ioptimal = dot_product.index(max(dot_product))

        return self.pairs[ioptimal][1]
        #return None
    
    def fine_tune(self):
        """Fine-tune the dual encoder on the extracted (context, response) pairs.

        A dense layer without activation (kernel initialised to the identity,
        bias to zero) is applied to the response embeddings. It is trained so
        that the sigmoid of its dot product with the context embedding (passed
        through dropout) predicts whether the two embeddings form an actual
        pair. Once trained, ``self.response_embeddings`` is replaced by the
        output of that layer applied to the current response embeddings.
        """
        # Training set containing both positive and negative examples
        train_contexts, train_responses, labels = self._get_training_data()
        context_dim = train_contexts.shape[1]
        response_dim = train_responses.shape[1]

        layers = tf.keras.layers

        # One input layer per embedding
        context_input = layers.Input((context_dim,))
        response_input = layers.Input((response_dim,))

        # Linear transformation of the response embeddings; starting from the
        # identity means the untrained layer leaves the embeddings unchanged
        response_transform = layers.Dense(
            response_dim,
            kernel_initializer="identity",
            bias_initializer="zeros",
        )

        # Dropout, used for regularisation
        context_dropout = layers.Dropout(0.5)

        # Dot product followed by a sigmoid, to obtain a probability
        dot = layers.Dot(axes=1)
        to_probability = layers.Activation(tf.keras.activations.sigmoid)

        # Wires the layers together
        regularised_context = context_dropout(context_input)
        transformed_response = response_transform(response_input)
        match_probability = to_probability(
            dot([regularised_context, transformed_response])
        )

        # Builds the model from its two inputs and its single output
        model = tf.keras.Model([context_input, response_input], match_probability)
        model.summary()

        # "Adam" optimiser with a cross-entropy loss
        model.compile(loss="binary_crossentropy", optimizer="adam")

        # Three training epochs, keeping 10% of the examples for validation
        model.fit(
            [train_contexts, train_responses],
            labels,
            batch_size=8,
            epochs=3,
            validation_split=0.1,
        )

        # Applies the learned transformation to the stored response embeddings
        transformed_embeddings = response_transform(self.response_embeddings)
        self.response_embeddings = tf.keras.backend.eval(transformed_embeddings)
        
        
    def _get_training_data(self):
        """Constructs a dataset to fine-tune the dual encoder. The dataset should
        contain both positive examples (that is, pairs of context and response embeddings
        that do correspond to actual response pairs) and negative examples (pairs of context
        and response embeddings that are selected at random and are not related).
        
        More precisely, the method should return 3 outputs:
        - one matrix of shape (2*len(self.pairs), 512) with context embeddings from the pairs 
        - one matrix of shape (2*len(self.pairs), 512) with response embeddings from the pairs
        - one array of shape 2*len(self.pairs) with binary values
        
        Half of the training examples should be positive (actual pair of embeddings) and half 
        should be negative (pair of embeddings selected at random), which is why the total length
        of the training data is twice the number of (context, response) pairs. For positive examples,
        the corresponding value in the output array should be 1, and 0 for negative examples.
        
        The response embeddings have already been computed (in self.response_embeddings) so you
        don't need to compute them again. But you need to compute the context embeddings for your
        pairs using the method client.encode_contexts(contexts). 
        
        Note that the positive and negative examples should be shuffled (to avoid confusing the
        machine learning model by first starting with only positive examples, then having on/uio/hume/student-u84/aleksda/Documents/aleksda_mandatory_3/ly 
        negative examples).
        """

        #raise NotImplementedError()

        context_embeddings  = self.client.encode_contexts([c for context, d in self.pairs])
        response_embeddings = self.response_embeddings

        temp_outputs = [1 for _ in range(len(context_embeddings))] + [0 for _ in range(len(context_embeddings))]

        temp_context_train  = []
        temp_response_train = []

        for i, j in zip(range(len(context_embeddings)), range(len(response_embeddings))):
            temp_context_train .append(response_embeddings[random.randint(0, len(context_embeddings) - 1)])
            temp_response_train.append(context_embeddings[random.randint(0, len(response_embeddings) - 1)])

        temp_context_train  = context_embeddings  + temp_context_train
        temp_response_train = response_embeddings + temp_response_train

        meshed_list = list(zip(temp_context_train, temp_response_train, temp_outputs))
        meshed_list = random.shuffle(meshed_list)

        context_train, response_train, outputs = zip(meshed_list[0], meshed_list[1], meshed_list[2])

        return np.asarray(context_train),np.asarray(response_train), np.asarray(outputs)

In [None]:
# Builds the chatbot with the default dialogue file. The constructor extracts
# the pairs, loads the encoder and encodes every response, so this can take
# some time.
cb = Chatbot()

In [None]:
# Sample questions used to probe the chatbot.
your_inputs = [
    "Who are you?",
    "How old are you?",
    "Where are you?",
    "Are you stupid?",
    "Did you kill him?",
]

for your_input in your_inputs:
    print(f'You: {your_input}')
    print(f'Chatbot: {cb.get_response(your_input)}')
    # Separates the exchanges (the string plus print's own line ending).
    print('\n')

In [None]:
# Fine-tunes the response embeddings, then asks the sample questions so the
# answers can be inspected after fine-tuning.
cb.fine_tune()

your_inputs = [
    "Who are you?",
    "How old are you?",
    "Where are you?",
    "Are you stupid?",
    "Did you kill him?",
]

for your_input in your_inputs:
    print(f'You: {your_input}')
    print(f'Chatbot: {cb.get_response(your_input)}')
    # Separates the exchanges (the string plus print's own line ending).
    print('\n')