In [None]:
from Tools.Tokenizer.tokenizer import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [None]:
# Stop-word lookup set consulted by remove_stopwords().
# NOTE(review): requires the installed NLTK stop-word data to provide an
# 'armenian' list — verify this in the target environment.
stop_words = set(stopwords.words('armenian')) 

# Helpers
def tokenize(text):
    """Split ``text`` into a flat list of lower-cased word tokens.

    Runs the project Tokenizer's segmentation and tokenization passes over
    ``text``, then walks every segment's ``'tokens'`` entry and keeps the
    element at index 1 of each token, lower-cased.

    Returns:
        list[str]: tokens in segment order, then token order.
    """
    tokenizer = Tokenizer(text)
    tokenizer.segmentation().tokenization()
    return [
        token[1].lower()
        for segment in tokenizer.segments
        for token in segment['tokens']
    ]

def remove_stopwords(word_tokens):
    """Return the tokens from ``word_tokens`` that are not in ``stop_words``.

    Order and duplicates of the surviving tokens are preserved.
    """
    return [token for token in word_tokens if token not in stop_words]

def stemming(word_tokens):
    """Return a list holding the Snowball stem of every token, in order.

    A new ``SnowballStemmer("armenian")`` is created on each call.
    """
    armenian_stemmer = SnowballStemmer("armenian")
    return list(map(armenian_stemmer.stem, word_tokens))

In [None]:
import pandas as pd
import nltk

# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

# BinaryRelevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression

In [None]:
# Load Training data.
# The CSV is read from disk once; `initial_df` keeps the untouched rows (used
# later to show the raw review text) and `df` is an independent working copy.
initial_df = pd.read_csv('data/clean/Trainning_reviews.csv')
df = initial_df.copy()

# Cleaning up text: append an Armenian full stop to each description before
# tokenizing (presumably so the tokenizer closes the final sentence — confirm),
# then tokenize, drop stop words, stem and re-join into one space-separated
# string per row.
df['Description'] = (
    df['Description']
    .apply(lambda row: row + '։')
    .apply(tokenize)
    .apply(remove_stopwords)
    .apply(stemming)
    .apply(lambda row: ' '.join(row))
)

# Keep the raw comma-separated label string under a new name so 'Symptoms'
# can later hold the parsed list. rename() returns a new frame (no inplace).
df = df.rename(columns={'Symptoms': 'symptom_list'})

In [None]:
# Parse each row's ', '-separated symptom string into a list of tags
symptoms = [tag_string.split(', ') for tag_string in df['symptom_list']]
# Attach the parsed lists to the dataframe
df['Symptoms'] = symptoms

In [None]:
# Get all symptom tags in one flat list.
# A nested comprehension flattens in linear time; sum(lists, []) re-copies the
# growing accumulator on every addition and is quadratic in the number of rows.
all_symptoms = [tag for row_tags in symptoms for tag in row_tags]

In [None]:
# Count how many times each symptom tag occurs
all_symptoms = nltk.FreqDist(all_symptoms)
# Tabulate the counts: one row per distinct tag
all_symptoms_df = pd.DataFrame({
    'Symtom': list(all_symptoms),
    'Count': list(all_symptoms.values()),
})

In [None]:
# Vectorization
from sklearn.preprocessing import MultiLabelBinarizer
# Learn the label vocabulary from the per-row symptom lists (fit returns self)
multilabel_binarizer = MultiLabelBinarizer().fit(df['Symptoms'])
# Encode the target variable as a binary indicator matrix, one column per label
y = multilabel_binarizer.transform(df['Symptoms'])

In [None]:
# TF-IDF configuration: ignore terms present in more than 80% of documents
# and cap the vocabulary at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=10000,
)

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical across runs.
xtrain, xval, ytrain, yval = train_test_split(
    df['Description'],
    y,
    test_size=0.2,
    random_state=9,
)

In [None]:
# Create TF-IDF features: the vocabulary and IDF weights are fitted on the
# training split only, then the same fitted vectorizer encodes the
# validation split.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [None]:
# Binary Relevance wrapper around a class-balanced logistic regression
base_estimator = LogisticRegression(C=40, class_weight='balanced')
br_classifier = BinaryRelevance(base_estimator)
br_classifier.fit(xtrain_tfidf, ytrain)
br_predictions = br_classifier.predict(xval_tfidf)

# Validation metrics on the held-out split
print(
    "Accuracy = ",
    accuracy_score(yval, br_predictions.toarray()),
)
print(
    "F1 score = ",
    f1_score(yval, br_predictions, average="micro"),
)
print(
    "Hamming loss = ",
    hamming_loss(yval, br_predictions),
)

In [None]:
def infer_tags(q):
    """Predict symptom tags for a single description string.

    The text is vectorized with the fitted ``tfidf_vectorizer``, classified by
    ``br_classifier`` and decoded back to label names with
    ``multilabel_binarizer``.

    Args:
        q: description text; callers in this notebook pass text that has
            already been tokenized, stop-word filtered and stemmed.

    Returns:
        The result of ``multilabel_binarizer.inverse_transform`` for the one
        input sample (index 0 holds that sample's labels).
    """
    # No debug print of `q` here: callers already print what they need, and
    # the later definitions of this function are silent as well.
    q_vec = tfidf_vectorizer.transform([q])
    q_pred = br_classifier.predict(q_vec)
    return multilabel_binarizer.inverse_transform(q_pred)

In [None]:
# Show five randomly drawn validation rows: raw review, predicted tags, true tags
for draw in range(5):
    row_label = xval.sample(1).index[0]
    raw_review = initial_df.loc[row_label, 'Description']
    true_tags = df.loc[row_label, 'Symptoms']
    print("Կարծիք: ", raw_review)
    print("Կանխատեսում: ", infer_tags(xval.loc[row_label]))
    print("Իրականություն: ", true_tags, "\n")

In [None]:
# Manual walk through the preprocessing pipeline on one hand-written example,
# followed by a prediction on the cleaned text.
text = "մաշկիս վրա պզուկներ են առաջացել։ դուրս է թափվել։ հետ տալ։"
tokenized = tokenize(text)
stopwords_removed = remove_stopwords(tokenized)
stemmed = stemming(stopwords_removed)
# Re-join the stems into the space-separated form the vectorizer was fitted on
cleanedText = ' '.join(stemmed)
print(cleanedText)
infer_tags(cleanedText)

In [None]:
import model
# Same prediction through the standalone `model` module.
# NOTE(review): importing `model` presumably executes its whole training
# script as a side effect — confirm before relying on this cell.
q = model.infer_tags(cleanedText)
# Display the first entry of the result
q[0]

In [None]:
def clean_up(text):
    """Preprocess raw ``text`` into a space-separated string of stems.

    An Armenian full stop is appended to ``text`` before it is tokenized;
    stop words are then removed and the remaining tokens are stemmed.
    """
    tokens = tokenize(text + "։")
    content_tokens = remove_stopwords(tokens)
    return ' '.join(stemming(content_tokens))

In [None]:
# Smoke check of clean_up(); the input has no trailing full stop because
# clean_up() appends one itself.
clean_up("մաշկիս վրա պզուկներ են առաջացել։ դուրս է թափվել։ հետ տալ")

In [None]:
# save the model to disk
import pickle

filename = 'finalized_model.sav'
# Persist the fitted classifier. The previous `pickle.dump(model.py, ...)`
# looked up an attribute `py` on the imported `model` module, which fails with
# AttributeError. The context manager also guarantees the file is flushed and
# closed.
with open(filename, 'wb') as model_file:
    pickle.dump(br_classifier, model_file)

In [None]:
import pandas as pd
import nltk
import pickle

# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

# BinaryRelevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression

import TextCleaningHelper


# Load Training data.
# Read the CSV once; `initial_df` keeps the untouched rows and `df` is an
# independent working copy (previously the same file was parsed twice).
initial_df = pd.read_csv('data/clean/Trainning_reviews.csv')
df = initial_df.copy()

# cleaning up datasource
df['Description'] = df['Description'].apply(TextCleaningHelper.clean_up)

# Keep the raw label string under a new name so 'Symptoms' can hold the parsed
# list; rename() returns a new frame instead of mutating in place.
df = df.rename(columns={'Symptoms': 'symptom_list'})

# extract symptoms: split each ', '-separated string into a list of tags
symptoms = [tag_string.split(', ') for tag_string in df['symptom_list']]
# add to dataframe
df['Symptoms'] = symptoms

# get all symptom tags in a flat list (linear-time flatten; sum(lists, [])
# re-copies the accumulator on every step and is quadratic)
all_symptoms = [tag for row_tags in symptoms for tag in row_tags]

# count occurrences of each tag
all_symptoms = nltk.FreqDist(all_symptoms)
# create dataframe with one row per distinct tag
all_symptoms_df = pd.DataFrame({
    'Symtom': list(all_symptoms.keys()),
    'Count': list(all_symptoms.values()),
})

# Vectorization of the labels
from sklearn.preprocessing import MultiLabelBinarizer
# fit() returns the fitted binarizer itself
multilabel_binarizer = MultiLabelBinarizer().fit(df['Symptoms'])
# binary indicator matrix for the target variable
y = multilabel_binarizer.transform(df['Symptoms'])

# TF-IDF configuration: skip terms in >80% of documents, at most 10,000 features
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=10000,
)

# 80/20 train/validation split with a fixed seed
xtrain, xval, ytrain, yval = train_test_split(
    df['Description'],
    y,
    test_size=0.2,
    random_state=9,
)

# TF-IDF features: fit on the training split, reuse for validation
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

# Binary Relevance wrapper around a class-balanced logistic regression
base_estimator = LogisticRegression(C=40, class_weight='balanced')
br_classifier = BinaryRelevance(base_estimator)
br_classifier.fit(xtrain_tfidf, ytrain)
br_predictions = br_classifier.predict(xval_tfidf)

# Validation metrics on the held-out split
print(
    "Accuracy = ",
    accuracy_score(yval, br_predictions.toarray()),
)
print(
    "F1 score = ",
    f1_score(yval, br_predictions, average="micro"),
)
print(
    "Hamming loss = ",
    hamming_loss(yval, br_predictions),
)

# public methods
def infer_tags(q):
    """Predict symptom tags for the single description string ``q``.

    Vectorizes ``q`` with ``tfidf_vectorizer``, classifies it with
    ``br_classifier`` and decodes the prediction into label names through
    ``multilabel_binarizer.inverse_transform``.
    """
    features = tfidf_vectorizer.transform([q])
    label_matrix = br_classifier.predict(features)
    return multilabel_binarizer.inverse_transform(label_matrix)

# save the model to disk
filename = 'finalized_model.sav'
# The context manager flushes and closes the file even if pickling fails;
# the bare open() previously left the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(br_classifier, model_file)

In [None]:
from flask import Flask, render_template, url_for, request, jsonify

import numpy as np
import pickle


app = Flask(__name__)

# Load the pickled classifier, closing the file as soon as it has been read.
# SECURITY: pickle.load can execute arbitrary code contained in the file —
# only load model files produced by this project.
with open("finalized_model.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
def infer_tags(q):
    """Predict symptom tags for ``q`` using the unpickled ``loaded_model``.

    NOTE(review): ``tfidf_vectorizer`` and ``multilabel_binarizer`` are not
    created in this cell; they must already exist in the kernel — confirm.
    """
    features = tfidf_vectorizer.transform([q])
    label_matrix = loaded_model.predict(features)
    return multilabel_binarizer.inverse_transform(label_matrix)

In [None]:
# Quick check of the reloaded model on a short phrase.
# NOTE(review): the phrase is passed without the clean-up step applied to the
# training text — confirm this is intended.
infer_tags("հետ տալ")

In [None]:
import numpy as np
import pandas as pd
import nltk
import pickle

# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import TextCleaningHelper

# Load Training data.
# Read the CSV once; `initial_df` keeps the untouched rows and `df` is an
# independent working copy (previously the same file was parsed twice).
initial_df = pd.read_csv('data/clean/Trainning_reviews.csv')
df = initial_df.copy()

# cleaning up datasource
df['Description'] = df['Description'].apply(TextCleaningHelper.clean_up)

# Keep the raw label string under a new name so 'Symptoms' can hold the parsed
# list; rename() returns a new frame instead of mutating in place.
df = df.rename(columns={'Symptoms': 'symptom_list'})

# extract symptoms: split each ', '-separated string into a list of tags
symptoms = [tag_string.split(', ') for tag_string in df['symptom_list']]
# add to dataframe
df['Symptoms'] = symptoms

# get all symptom tags in a flat list (linear-time flatten; sum(lists, [])
# re-copies the accumulator on every step and is quadratic)
all_symptoms = [tag for row_tags in symptoms for tag in row_tags]

# count occurrences of each tag
all_symptoms = nltk.FreqDist(all_symptoms)
# create dataframe with one row per distinct tag
all_symptoms_df = pd.DataFrame({
    'Symtom': list(all_symptoms.keys()),
    'Count': list(all_symptoms.values()),
})

# Vectorization of the labels
from sklearn.preprocessing import MultiLabelBinarizer
# fit() returns the fitted binarizer itself
multilabel_binarizer = MultiLabelBinarizer().fit(df['Symptoms'])
# binary indicator matrix for the target variable
y = multilabel_binarizer.transform(df['Symptoms'])

# TF-IDF configuration: skip terms in >80% of documents, at most 10,000 features
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=10000,
)

# 80/20 train/validation split with a fixed seed
xtrain, xval, ytrain, yval = train_test_split(
    df['Description'],
    y,
    test_size=0.2,
    random_state=9,
)

# TF-IDF features: fit on the training split, reuse for validation
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)


# save the fitted TF-IDF vectorizer to disk
filename = 'tfidf_vectorizer_model.sav'
# The context manager flushes and closes the file even if pickling fails;
# the bare open() previously left the handle to the garbage collector.
with open(filename, 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [None]:
import numpy as np
import pandas as pd
import nltk
import pickle

# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import TextCleaningHelper

# Load Training data.
# Read the CSV once; `initial_df` keeps the untouched rows and `df` is an
# independent working copy (previously the same file was parsed twice).
initial_df = pd.read_csv('data/clean/Trainning_reviews.csv')
df = initial_df.copy()

# cleaning up datasource
df['Description'] = df['Description'].apply(TextCleaningHelper.clean_up)

# Keep the raw label string under a new name so 'Symptoms' can hold the parsed
# list; rename() returns a new frame instead of mutating in place.
df = df.rename(columns={'Symptoms': 'symptom_list'})

# extract symptoms: split each ', '-separated string into a list of tags
symptoms = [tag_string.split(', ') for tag_string in df['symptom_list']]
# add to dataframe
df['Symptoms'] = symptoms

# get all symptom tags in a flat list (linear-time flatten; sum(lists, [])
# re-copies the accumulator on every step and is quadratic)
all_symptoms = [tag for row_tags in symptoms for tag in row_tags]

# count occurrences of each tag
all_symptoms = nltk.FreqDist(all_symptoms)
# create dataframe with one row per distinct tag
all_symptoms_df = pd.DataFrame({
    'Symtom': list(all_symptoms.keys()),
    'Count': list(all_symptoms.values()),
})


from sklearn.preprocessing import MultiLabelBinarizer
# Learn the label vocabulary from the per-row symptom lists (fit returns self)
multilabel_binarizer = MultiLabelBinarizer().fit(df['Symptoms'])

# save the fitted label binarizer to disk
filename = 'multilabel_binarizer_model.sav'
# The context manager flushes and closes the file even if pickling fails;
# the bare open() previously left the handle to the garbage collector.
with open(filename, 'wb') as binarizer_file:
    pickle.dump(multilabel_binarizer, binarizer_file)

In [3]:
import tablib

# Load the symptom-description CSV into a tablib dataset. The file is opened
# with an explicit UTF-8 encoding (the values are Armenian text, and the
# platform default encoding is not UTF-8 everywhere) and closed right after
# reading.
ds = tablib.Dataset()
with open("data/clean/disease_description.csv", encoding="utf-8") as description_file:
    ds.csv = description_file.read()

# Mapping built from the dataset's rows: symptom key -> display name
dictionary = dict(ds)

# One {"label", "name"} record per symptom; spaces are stripped from the key
# to form the label.
models = [
    {
        "label": symptom_key.replace(" ", ""),
        "name": dictionary[symptom_key],
    }
    for symptom_key in dictionary
]

print(models)

[{'label': 'itching', 'name': 'Քոր'}, {'label': 'skin_rash', 'name': 'մաշկի ցան'}, {'label': 'nodal_skin_eruptions', 'name': 'Բշտիկներ'}, {'label': 'continuous_sneezing', 'name': 'Շարունակական  փռշտոց'}, {'label': 'shivering', 'name': 'դողէրոցք'}, {'label': 'chills', 'name': 'սարսուռ'}, {'label': 'joint_pain', 'name': 'հոդերի ցավ'}, {'label': 'stomach_pain', 'name': 'ստամոքսի ցավ '}, {'label': 'acidity', 'name': 'թթվայնություն'}, {'label': 'ulcers_on_tongue', 'name': 'բշտիկներ լեզվի վրա'}, {'label': 'muscle_wasting', 'name': 'մկանների թուլացում'}, {'label': 'vomiting', 'name': 'փսխում'}, {'label': 'burning_micturition', 'name': 'այրվող միզել'}, {'label': 'spotting_urination', 'name': 'միզել'}, {'label': 'fatigue', 'name': 'հոգնածություն'}, {'label': 'weight_gain', 'name': 'քաշի ավելացում'}, {'label': 'anxiety', 'name': 'անհանգստություն'}, {'label': 'cold_hands_and_feets', 'name': 'սառը ձեռքեր եւ ոտքեր'}, {'label': 'mood_swings', 'name': 'տրամադրության տատանումներ'}, {'label': 'weight_l

In [None]:

import pickle
import tablib

# load models
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file.

    SECURITY: pickle can execute arbitrary code from the file; only use this
    on artifacts produced by this project.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


loaded_model = _load_pickle("finalized_model.sav")
tfidf_vectorizer_model = _load_pickle("tfidf_vectorizer_model.sav")
multilabel_binarizer_model = _load_pickle("multilabel_binarizer_model.sav")

# load disease description
# Explicit UTF-8 (the values are Armenian text, and the platform default
# encoding is not UTF-8 everywhere); the file is closed right after reading.
ds = tablib.Dataset()
with open("data/clean/disease_description.csv", encoding="utf-8") as description_file:
    ds.csv = description_file.read()
# Mapping built from the dataset's rows: symptom key -> display name
disease_description = dict(ds)

# methods
def get_symtoms(text):
    """Predict symptom labels for one description string.

    ``text`` is vectorized with ``tfidf_vectorizer_model``, classified by
    ``loaded_model`` and decoded to label names through
    ``multilabel_binarizer_model.inverse_transform``.
    """
    features = tfidf_vectorizer_model.transform([text])
    label_matrix = loaded_model.predict(features)
    return multilabel_binarizer_model.inverse_transform(label_matrix)


# routes
# NOTE(review): `text` is passed to the model as-is, without the clean-up
# step used on the training descriptions — confirm this is intended.
text = "հետ տալ"
predicted_symptoms = get_symtoms(text)[0]

# Strip spaces from each predicted label once, then look up its display name
symptom_keys = [symptom.replace(" ", "") for symptom in predicted_symptoms]
models = [
    {"label": key, "name": disease_description[key]}
    for key in symptom_keys
]
print(models)

