In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import layers
from sklearn.preprocessing import OneHotEncoder

In [15]:
class SymptomsModel:
    """Text classifier that maps a free-text description to a label.

    ``train`` fits a small Keras model (text vectorization -> embedding ->
    global average pooling -> dense softmax) on a CSV file, and ``predict``
    returns the predicted label for one piece of text.

    After ``train`` has run, the instance exposes:

    * ``model`` -- the fitted ``tf.keras.Model``.
    * ``label_encoder`` -- the fitted ``LabelEncoder``; it defines the mapping
      between class indices produced by the model and the original labels.
    """

    def train(self, filename, epochs=50, max_vocab_length=10000,
              max_length=45, embedding_dim=128):
        """Fit the classifier on the CSV file at ``filename``.

        The CSV must contain a ``text`` column (model input) and a ``label``
        column (target). The data is split 80/20 into a training part and a
        held-out part (stratified by label, fixed ``random_state``); the
        held-out part is used as Keras validation data during fitting.

        Args:
            filename: Path (or anything ``pd.read_csv`` accepts) of the CSV.
            epochs: Number of training epochs.
            max_vocab_length: Maximum vocabulary size of the text vectorizer,
                also used as the embedding's ``input_dim``.
            max_length: Number of tokens every text is padded/truncated to.
            embedding_dim: Size of each embedding vector.

        Returns:
            None. The fitted model and label encoder are stored on ``self``.
        """
        diseases = pd.read_csv(filename)

        label_encoder = LabelEncoder()
        label_encoder.fit(diseases.label)
        # Size the output layer from the encoder itself so the model's class
        # indices always line up with ``label_encoder.inverse_transform``.
        num_classes = len(label_encoder.classes_)

        X, y = diseases.text, diseases.label
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=21,
            shuffle=True,
            stratify=y)

        # Integer class ids from the single, already-fitted encoder. Using
        # these directly (with a sparse loss) avoids fitting a second encoder
        # separately on the train and test labels.
        train_labels = label_encoder.transform(y_train)
        test_labels = label_encoder.transform(y_test)

        train_text = np.array(X_train)
        test_text = np.array(X_test)

        text_vectorizer = TextVectorization(
            max_tokens=max_vocab_length,
            output_mode='int',
            output_sequence_length=max_length)
        # Build the vocabulary from the training split only, so the held-out
        # split does not leak into the model and skew validation metrics.
        text_vectorizer.adapt(train_text)

        embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=embedding_dim,
                                     input_length=max_length,
                                     embeddings_initializer='uniform')

        inputs = layers.Input(shape=(1,), dtype=tf.string)
        x = text_vectorizer(inputs)
        x = embedding(x)
        x = layers.GlobalAveragePooling1D()(x)
        # Softmax: exactly one label per text, so the outputs should form a
        # probability distribution over the classes.
        outputs = layers.Dense(num_classes, activation='softmax')(x)
        model_1 = tf.keras.Model(inputs, outputs, name='dense_model')

        model_1.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                        optimizer=tf.keras.optimizers.Adam(),
                        metrics=["accuracy"])

        model_1.fit(train_text,
                    train_labels,
                    epochs=epochs,
                    validation_data=(test_text, test_labels))

        self.model = model_1
        self.label_encoder = label_encoder

    def predict(self, text):
        """Return the predicted label for a single piece of ``text``.

        Args:
            text: One input string.

        Returns:
            The original (decoded) label with the highest predicted score.

        Raises:
            AttributeError: If called before ``train``.
        """
        if (getattr(self, "model", None) is None
                or getattr(self, "label_encoder", None) is None):
            raise AttributeError(
                "SymptomsModel.predict() called before train(): "
                "no fitted model is available")

        # Feed a one-element array, matching how the training data is passed.
        scores = self.model.predict(np.array([text]))
        # argmax along the class axis, then take the single sample's result.
        label_index = int(np.argmax(scores, axis=-1)[0])
        return self.label_encoder.inverse_transform([label_index])[0]