In [111]:
import os
import pandas as pd

import string
from tqdm import tqdm
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words

import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE, ADASYN
import numpy as np

BASEDIR = 'NLP_coursework'

In [112]:
class Chatbot:
    def __init__(self, prompt_vectorizer, symptoms_discerner, trials_discerner,
                 symptoms_model, trials_model, talk_model):
        """
        Construct a chatbot with pre-trained models.

        :param prompt_vectorizer: a vectorizer that should be able to handle any prompt string reasonably well
        :param symptoms_discerner: a model that can predict if the prompt is a description of symptoms
        :param trials_discerner: a model that can predict if the prompt is a request for a clinical trial        
        :param symptoms_model: a model that returns a disease by symptoms. It's predict() method should return a string
        :param trials_model: a model that returns several relevant clinical trials from a database. Should return a formatted output string
        :param talk_model: a model that tries to hold a conversation.
        """
        
        self.prompt_vectorizer = prompt_vectorizer
        self.symptoms_discerner = symptoms_discerner
        self.trials_discerner = trials_discerner
        self.symptoms_model = symptoms_model
        self.trials_model = trials_model
        self.talk_model = talk_model
        
        self.morpher = MorphAnalyzer()
        self.sw = set(get_stop_words("en"))
        self.exclude = set(string.punctuation)
        
        
    def preprocess_txt(self, text):
        spls = "".join(i for i in text.strip() if i not in self.exclude).split()
        spls = [self.morpher.parse(i.lower())[0].normal_form for i in spls]
        spls = [i for i in spls if i not in self.sw and i != ""]        
        return spls
    
    def classify_prompt(self, prompt):
        input_txt = self.preprocess_txt(prompt)
        vect = self.prompt_vectorizer.transform([" ".join(input_txt)])
        if self.symptoms_discerner.predict(vect)[0] == 1:
            return "symptoms"
    
        vect = self.prompt_vectorizer.transform([" ".join(input_txt)])
        if self.trials_discerner.predict(vect)[0] == 1:
            return "trials"
    
        return "talk"
    
    def process_symptoms(self, prompt):
        label = self.symptoms_model.predict(prompt)
        return f"I think you might have {label}, please consult a real doctor."
    
    def process_trials(self, prompt):
        return self.trials_model.get_trials(prompt)

    def process_talk(self, prompt):
        return self.talk_model.predict(prompt)
    
    def process_message(self, message):
        print(f"Got message: {message}")
        cls = self.classify_prompt(message)
        if cls == "symptoms":
            response = self.process_symptoms(message)
        elif cls == "trials":
            response = self.process_trials(message)            
        else:
            response = self.process_talk(message)

        print(f"Response: {response}")
        return response

In [113]:
import tensorflow as tf

def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

class TalkModel:
    def __init__(self):
        # Загружаем сохранённую модель
        # Предобученную модель можно скачать по ссылке https://drive.google.com/drive/folders/1JBp4T9ZLVDkdC19GbY574S6oFPL9665u?usp=sharing 
        # или обучить заново при помощи ноутбука: text gen for bot.ipynb
        training_model = tf.keras.models.load_model(f'{BASEDIR}/chatbot_model', custom_objects={'loss':loss})
        
        # Подменяем ей входной слой, чтобы батч был из 1 примера
        model_config = training_model.get_config()
        model_config['layers'][0] = {
                      'name': 'new_input',
                      'class_name': 'InputLayer',
                      'config': {
                          'batch_input_shape': (1, None),
                          'dtype': 'float32',
                          'sparse': False,
                          "ragged": False,
                          'name': 'modified_input'
                      },
                      'inbound_nodes': []
                  }
        model_config['layers'][1]['inbound_nodes'] = [[['new_input', 0, 0, {}]]]

        # Создаём рабочую модель из изменённой конфигурации и копируем веса
        self.model = training_model.__class__.from_config(model_config, custom_objects={'loss':loss})  # change custom objects if necessary
        self.model.set_weights(training_model.get_weights())        
        
        # Загружаем преобразование предсказаний (чисел) в символы и обратно
        self.stoi = {}
        self.itos = []
        with open(f'{BASEDIR}/chatbot_vocab.txt', 'r') as f:
            for i, ch in enumerate(f):
                self.stoi[ch[0]] = i
                self.itos.append(ch[0])        
    
    def convert_text(self, text):            
        dix = [self.stoi[s] for s in text]

        return tf.convert_to_tensor(dix, dtype=tf.int32)
    
    def predict(self, message):
        input_eval = self.convert_text(message)
        input_eval = tf.expand_dims(input_eval, 0)

        # Empty string to store our results
        text_generated = []

        # Low temperature results in more predictable text.
        # Higher temperature results in more surprising text.
        # Experiment to find the best setting.
        temperature = 0.5

        self.model.reset_states()
        last_char = ''
        while last_char not in ['!', '.', '?']:
            predictions = self.model(input_eval)
            predictions = tf.squeeze(predictions, 0)
            # using a categorical distribution to predict the character returned by the model
            predictions = predictions / temperature
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

            # Pass the predicted character as the next input to the model
            # along with the previous hidden state
            input_eval = tf.expand_dims([predicted_id], 0)
            last_char = self.itos[predicted_id]
            text_generated.append(last_char)
            
        return ''.join(text_generated)

In [114]:
morpher = MorphAnalyzer()
sw = set(get_stop_words("en"))
exclude = set(string.punctuation)

def preprocess_txt(line):
    spls = "".join(i for i in line.strip() if i not in exclude).split()
    spls = [morpher.parse(i.lower())[0].normal_form for i in spls]
    spls = [i for i in spls if i not in sw and i != ""]
    return spls

def create_talkbot_lines():
    all_transcripts = []
    
    for filename in os.listdir(f'{BASEDIR}/House MD'):
        all_transcripts.append(pd.read_csv(f'{BASEDIR}/House MD/{filename}', encoding= 'unicode_escape'))
    
    talkbot_sentences = pd.concat(all_transcripts)    
    
    talkbot_lines = []
    for line in tqdm(talkbot_sentences.line):
        if isinstance(line, str):
            talkbot_lines.append(preprocess_txt(line))
        
    return talkbot_lines

def create_symptom_lines():
    df_symptoms = pd.read_csv(f'{BASEDIR}/Symptom2Disease.csv')

    symptom_lines = []
    for line in tqdm(df_symptoms.text):    
        if isinstance(line, str):
            sentences = line.split(".")
            for s in sentences:
                symptom_lines.append(preprocess_txt(s))
                
    return symptom_lines

def create_trials_lines():
    df_trials = pd.read_csv(f'{BASEDIR}/studies.csv')
    
    trials_lines = []
    for (index, row) in tqdm(df_trials.iterrows()):    
        if isinstance(row.official_title, str) and len(row.official_title) > 0:
            line = row.official_title        
        else:
            line = row.brief_title
    
        trials_lines.append(preprocess_txt(line))
        
    return trials_lines

def train_lr_model(positive_texts, list_of_negative_texts, vectorizer):    
    negative_texts = []
    for nt_list in list_of_negative_texts:
        for nt in nt_list:
            negative_texts.append(nt)    
    
    negative_texts = [" ".join(nt) for nt in negative_texts]
    positive_texts = [" ".join(pt) for pt in positive_texts]    
    
    dataset = negative_texts + positive_texts
    labels = np.zeros(len(dataset))
    labels[len(negative_texts):] = np.ones(len(positive_texts))    
    
    dataset = vectorizer.transform(dataset)
    
    dataset, labels = SMOTE().fit_resample(dataset, labels)
        
    X_train, X_test, y_train, y_test = train_test_split(dataset, labels, test_size=0.2, stratify=labels,
                                                    random_state=13)
    
    lr = LogisticRegression(max_iter=200).fit(X_train, y_train)
        
    score = accuracy_score(y_true=y_test, y_pred=lr.predict(X_test))
    
    return (lr, score)

def build_vectorizer(list_of_list_of_text):
    all_texts = []
    for l in list_of_list_of_text:
        for t in l:
            all_texts.append(" ".join(t))
            
    vectorizer = CountVectorizer(ngram_range=(1, 2))    
    vectorizer.fit(all_texts)
    
    return vectorizer

def create_chatbot_from_scratch():
    print("Loading lines for talk")    
    talkbot_lines = create_talkbot_lines()
    print("Loading lines for symptoms")
    symptom_lines = create_symptom_lines()
    print("Loading lines for trials")
    trials_lines = create_trials_lines()
    
    print("Training vectorizer")
    vectorizer = build_vectorizer([symptom_lines, trials_lines, talkbot_lines])
    
    print("Training symptoms discerning model")
    symptoms_lr, acc1 = train_lr_model(symptom_lines, [trials_lines, talkbot_lines], vectorizer)
    print(f"Finished, accuracy={acc1}")
    print("Training symptoms trials model")
    trials_lr, acc2 = train_lr_model(trials_lines, [symptom_lines, talkbot_lines], vectorizer)
    print(f"Finished, accuracy={acc2}")

    # Сохранить модель целиком и переносимо не получилось, будем тренировать заново
    %run NLP_coursework/symptoms_2_diseases.ipynb
    symptoms_model = SymptomsModel()
    symptoms_model.train(f'{BASEDIR}/Symptom2Disease.csv')
    
    %run NLP_coursework/clinical_trials.ipynb
    trials_model = TrialsModel()
    trials_model.train(f'{BASEDIR}/studies_with_keywords.pickle')
    
    print("Creating chatbot")
    chatbot = Chatbot(vectorizer, symptoms_lr, trials_lr, symptoms_model, trials_model, TalkModel())
    print("Done!")
    return chatbot

In [115]:
chatbot = create_chatbot_from_scratch()

100%|██████████| 75312/75312 [00:17<00:00, 4374.13it/s]
100%|██████████| 1200/1200 [00:00<00:00, 1798.79it/s]
27726it [00:11, 2368.27it/s]


Loading lines for talk
Loading lines for symptoms
Loading lines for trials
Training vectorizer
Training symptoms discerning model
Finished, accuracy=0.9635569574184156
Training symptoms trials model
Finished, accuracy=0.9633165829145729
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Reading dataset
Preprocessing brief titles
Updating keywords
Creating FT model
Creating index
Building index
Creating chatbot
Done!


In [119]:
from telegram import Update
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters, CallbackContext
from telegram import ParseMode

updater = Updater("6068772572:AAHzY182akc1aR2h2SYpfW8mS7P3cLYMLag", use_context=True) # Токен API к Telegram
dispatcher = updater.dispatcher

def startCommand(update, context):
    context.bot.send_message(chat_id=update.message.chat_id, text='Hi. I\'m a bot trained in all things medical. Let\'s talk?\n' + 
                             'You can ask me to show you some clinical trials from a database (e.g. Show me some clinical trials about chronic pain)\n' +
                             'Or you can describe me some symptoms, and I\'ll try to guess a disease (e.g. I have a headache)\n' + 
                             'Or you can just chat with me (I\'m trained on House MD script lines!)')
    
def textMessage(update, context):
    response = chatbot.process_message(update.message.text)
    context.bot.send_message(chat_id=update.message.chat_id, text=response, parse_mode=ParseMode.HTML)
        
start_command_handler = CommandHandler('start', startCommand)
text_message_handler = MessageHandler(Filters.text, textMessage)
dispatcher.add_handler(start_command_handler)
dispatcher.add_handler(text_message_handler)
updater.start_polling(clean=True)
updater.idle()

  updater.start_polling(clean=True)


Got message: Show me some clinical trials about chronic paint
Response: I have found some clinical trials that might be related to this:

<a href="https://clinicaltrials.gov/ct2/show/NCT00195949">Laparoscopic Versus Open Pyloromyotomy for Infants With Idiopathic Hypertrophic Pyloric Stenosis</a>
<a href="https://clinicaltrials.gov/ct2/show/NCT00156403">A Pilot Study of Use of Calcium Channel Blocker to Decrease Inflammation and Pain in Hereditary Pancreatitis</a>
<a href="https://clinicaltrials.gov/ct2/show/NCT00080223">Safety Study of Oral Pirfenidone in Patients With Pulmonary Fibrosis/Idiopathic Pulmonary Fibrosis</a>
<a href="https://clinicaltrials.gov/ct2/show/NCT00144196">12 Week Efficacy of Tiotropium Versus Placebo in Patients With Mild COPD According to Swedish Guidelines (SPIRIMILD)</a>
<a href="https://clinicaltrials.gov/ct2/show/NCT00268762">Argatroban Stroke Treatment - A Pilot Safety Study</a>



In [118]:
#%pip install python-telegram-bot==13.8