In [1]:
import numpy as np
import pandas as pd
import re


In [2]:
df = pd.read_csv('SymptomsListN.csv', encoding='cp1252')
df.head(-20)


Unnamed: 0,SymptomList
0,(honeycomb) opacities of cornea
1,‘burning’ discomfort in the lesions
2,‘Daughter yaws’ are multiple lesions that dev...
3,“swan-neck deformity
4,"“tumor plop,"
...,...
6425,yaws(skin lesion)
6426,yellow crust in eyelids
6427,yellow discolouration of sclera
6428,Yellow skin discoloration


In [3]:
#change in lower case
df['SymptomList']=df['SymptomList'].astype(str)
df["SymptomList"] = df['SymptomList'].apply(lambda x:x.lower())
df


Unnamed: 0,SymptomList
0,(honeycomb) opacities of cornea
1,‘burning’ discomfort in the lesions
2,‘daughter yaws’ are multiple lesions that dev...
3,“swan-neck deformity
4,"“tumor plop,"
...,...
6445,"yellowish white, hard looking, raised areas, v..."
6446,"yellowishred, heavily vascularized nodules nea..."
6447,"yellow-orange discoloration of skin, urine and..."
6448,"yellowwhite, round, solitary, raised nodule, a..."


In [4]:
#Removing html tags
def remove_html_tags(text):
    pattern = re.compile('[<,*?,>]')
    return pattern.sub(r'', text)

In [5]:
df['SymptomList'].apply(remove_html_tags)

0                         (honeycomb) opacities of cornea
1                     ‘burning’ discomfort in the lesions
2       ‘daughter yaws’ are multiple lesions that  dev...
3                                    “swan-neck deformity
4                                             “tumor plop
                              ...                        
6445    yellowish white hard looking raised areas vary...
6446    yellowishred heavily vascularized nodules near...
6447    yellow-orange discoloration of skin urine and ...
6448    yellowwhite round solitary raised nodule about...
6449                                     z-line deformity
Name: SymptomList, Length: 6450, dtype: object

In [6]:
def remove_url(text):
    pattern = re.compile('http?//S+|www.S+')
    return pattern.sub(r'', text)

In [9]:
df['SymptomList'].apply(remove_url)

0                         (honeycomb) opacities of cornea
1                     ‘burning’ discomfort in the lesions
2       ‘daughter yaws’ are multiple lesions that  dev...
3                                    “swan-neck deformity
4                                            “tumor plop,
                              ...                        
6445    yellowish white, hard looking, raised areas, v...
6446    yellowishred, heavily vascularized nodules nea...
6447    yellow-orange discoloration of skin, urine and...
6448    yellowwhite, round, solitary, raised nodule, a...
6449                                     z-line deformity
Name: SymptomList, Length: 6450, dtype: object

In [10]:
import string, time
exclude = string.punctuation
def remove_punct(text):
    for char in exclude:
        text = text.replace(char, '')
        return text.translate(str.maketrans(' ',' ',exclude))

In [11]:
df['SymptomList']=  df['SymptomList'].apply(remove_punct)
df['SymptomList']

0                           honeycomb opacities of cornea
1                     ‘burning’ discomfort in the lesions
2       ‘daughter yaws’ are multiple lesions that  dev...
3                                     “swanneck deformity
4                                             “tumor plop
                              ...                        
6445    yellowish white hard looking raised areas vary...
6446    yellowishred heavily vascularized nodules near...
6447    yelloworange discoloration of skin urine and o...
6448    yellowwhite round solitary raised nodule about...
6449                                      zline deformity
Name: SymptomList, Length: 6450, dtype: object

In [12]:
# Add short/chat words in form of dictionary
chat_words = {1 : 'MRI', 2: 'CT'}
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return ' '.join(new_text)

test = chat_conversion('new MRi, cti, aki, mi, tp')
print(chat_words)

{1: 'MRI', 2: 'CT'}


In [13]:
df['SymptomList'].apply(chat_conversion)

0                           honeycomb opacities of cornea
1                     ‘burning’ discomfort in the lesions
2       ‘daughter yaws’ are multiple lesions that deve...
3                                     “swanneck deformity
4                                             “tumor plop
                              ...                        
6445    yellowish white hard looking raised areas vary...
6446    yellowishred heavily vascularized nodules near...
6447    yelloworange discoloration of skin urine and o...
6448    yellowwhite round solitary raised nodule about...
6449                                      zline deformity
Name: SymptomList, Length: 6450, dtype: object

In [14]:
chat_words

{1: 'MRI', 2: 'CT'}

In [16]:
#Spelling correction
from textblob import TextBlob
incorrect_text = 'INCREASED FREQUENCY OF MICTURATION SINCE 20 DAYS'
textblob = TextBlob(incorrect_text)
textblob.correct()


TextBlob("INCREASED FREQUENCY of MICTURATION SINCE 20 DAYS")

In [17]:
import nltk 
sent = 'undocumented high grade fever with suspected abnormal activities in a developmentally normal child'
tokens = nltk.word_tokenize(sent)

In [18]:
from nltk.corpus import stopwords
sent = 'right side abdominal pain associated with increased after eating x 1 months'

def stop_word_remover(sent):
    tokens = nltk.word_tokenize(sent)
    stop_words = stopwords.words('english')
    
    words_without_stop_words = []
    for word in tokens:
        if word in stop_words:
            continue
        else:
            words_without_stop_words.append(word)
    return words_without_stop_words


    



without_stop_word = stop_word_remover(sent)
without_stop_word
#stop_words
#print(' '.join())

['right',
 'side',
 'abdominal',
 'pain',
 'associated',
 'increased',
 'eating',
 'x',
 '1',
 'months']

In [19]:
#Tokenizing
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
word_tokenize(sent)
sent_tokenize(sent)

['right side abdominal pain associated with increased after eating x 1 months']

In [20]:
word_tokenize(sent)

['right',
 'side',
 'abdominal',
 'pain',
 'associated',
 'with',
 'increased',
 'after',
 'eating',
 'x',
 '1',
 'months']

In [21]:
#Lemmatization (to Root word)
import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
WordNetLemmatizer = WordNetLemmatizer()
sent = 'right side abdominal pain associated with increased after eating x 1 months'
tokens = nltk.word_tokenize(sent)
for word in tokens:
    print(word,'—->', WordNetLemmatizer.lemmatize(word, pos='v'))

right —-> right
side —-> side
abdominal —-> abdominal
pain —-> pain
associated —-> associate
with —-> with
increased —-> increase
after —-> after
eating —-> eat
x —-> x
1 —-> 1
months —-> months


In [22]:
tagged = nltk.pos_tag(tokens)
tagged

[('right', 'JJ'),
 ('side', 'NN'),
 ('abdominal', 'JJ'),
 ('pain', 'NN'),
 ('associated', 'VBN'),
 ('with', 'IN'),
 ('increased', 'VBN'),
 ('after', 'IN'),
 ('eating', 'VBG'),
 ('x', 'JJ'),
 ('1', 'CD'),
 ('months', 'NNS')]

In [23]:
#import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
entities = nltk.chunk.ne_chunk(tagged)
entities

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


ModuleNotFoundError: No module named 'svgling'

Tree('S', [('right', 'JJ'), ('side', 'NN'), ('abdominal', 'JJ'), ('pain', 'NN'), ('associated', 'VBN'), ('with', 'IN'), ('increased', 'VBN'), ('after', 'IN'), ('eating', 'VBG'), ('x', 'JJ'), ('1', 'CD'), ('months', 'NNS')])

In [24]:
#Sentence 

import pandas as pd
from sentence_transformers import SentenceTransformer

# Download model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# The sentences we'd like to encode
sentences = ['couigh 15 days',
    'cough and cold 15 days',
    'cough since 15 days']
sentences = ['Fever Headache for 20 days',
    'fever and hedache 1 Day',
    'fever, headach frm Last 30 days']


#lower the sentence 
#sentences = [sentence.lower() for sentence in sentences]
df = pd.DataFrame(sentences, columns =['SymptomList'])
df['SymptomList'] = df['SymptomList'].astype(str)
df['SymptomList'] = df['SymptomList'].apply(lambda x:x.lower())


#remove html tags
df['SymptomList'] = df['SymptomList'].apply(remove_html_tags)

#remove url
df['SymptomList'] = df['SymptomList'].apply(remove_url)

#remove punc
df['SymptomList'] = df['SymptomList'].apply(remove_punct)


#converting dataframe to list
sentences = df['SymptomList'].to_list()





#correction of word/sentence
#Sectence Correction
sentences = sentences
#sentences = [sentence.lower() for sentence in sentences]
sentences = [ TextBlob(sentence) for sentence in sentences]
result = [ sentence.correct() for sentence in sentences]            
#sentences = TextBlob(sentences)
#result = sentences.correct()

print(type(result))

#short matching 




#Rule based matching



#token & removing stop words
without_stop_word = []
#stp = [ stop_word_remover(sentence) for sentence in result]
for sentence in result:
    print(type(sentence))
    #stp = stop_word_remover(sentence)
    #without_stop_word.append(stp)
    #without_stop_word.append(stop_word_remover(sentence))


#Synonym checking
from nltk.corpus import wordnet

synonyms = []

for syn in wordnet.synsets("abdomen"):
    for i in syn.lemmas():
        synonyms.append(i.name())

print(set(synonyms))

#print(stp)
# Get embeddings of sentences
embeddings = model.encode(result)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

<class 'list'>
<class 'textblob.blob.TextBlob'>
<class 'textblob.blob.TextBlob'>
<class 'textblob.blob.TextBlob'>
{'abdomen', 'abdominal_cavity', 'belly', 'stomach', 'venter'}
Sentence: fever headache for 20 days
Embedding: [ 0.09523411  0.06470332 -0.30507928 -0.2231611  -0.2347271   0.04536242
  1.0894808   0.58303845  0.5594681   0.1355435   0.43327123 -0.17923024
 -0.03225068 -0.25941533 -0.17211775 -0.44776145 -0.18185028 -0.16525367
 -0.68620193 -0.30656338  0.54526937 -0.13851416  0.03838924 -0.07038049
 -0.39278632  0.17501178 -0.16452442  0.15071122 -0.00760711 -0.6226722
  0.25688344  0.07360788  0.4133746   0.08934994  0.2080049   0.11153354
  0.35777813  0.68070173 -0.2646817  -0.1237617  -0.31689304 -0.47309238
  0.29375988  0.62149227  0.28507033  0.1832299   0.10133252  0.21880583
 -0.04922507  0.25708064 -0.28593236  0.0289024  -0.58608544  0.07013474
  0.44660172  0.2146254   0.5933214  -0.22107434 -0.32148576  0.60399747
 -0.04429484 -0.27790517 -0.25415197  0.7035132

In [30]:
#Semantic Textual Similarity
from sentence_transformers import SentenceTransformer, util

# Download model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# The sentences we'd like to compute similarity about
sentences = ['Fracture Of the Olecranon',
    'Fracture Olecranon',
    'fracture of olecranon']
# sentences = ['cough 15 days',
#     'cough and cold 15 days',
#     'cough since 15 days']

# Get embeddings of sentences
embeddings = model.encode(sentences)

# Compute similarities
sim = util.cos_sim(embeddings[0], embeddings[1])
print("comparing of 1st with 2nd :->","{0:.4f}".format(sim.tolist()[0][0])) 
sim = util.cos_sim(embeddings[0], embeddings[2])
print("comparing of 1st with 3rd :->","{0:.4f}".format(sim.tolist()[0][0])) 
sim = util.cos_sim(embeddings[-1], embeddings[1])
print("comparing of 3rd with 2nd :->","{0:.4f}".format(sim.tolist()[0][0])) 

comparing of 1st with 2nd :-> 0.9642
comparing of 1st with 3rd :-> 0.9861
comparing of 3rd with 2nd :-> 0.9855


In [153]:
#Lemmatization (to Root word)
import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
WordNetLemmatizer = WordNetLemmatizer()

In [154]:
#Disease List pre-process
df = pd.read_csv('disease_list.csv', encoding='cp1252')
df.head(-20)

Unnamed: 0,Disease Name
0,11ß-Hydroxylase deficiency
1,"17a Hydroxylase/17,20- lyase deficiency"
2,17-hydroxycorticosteroid
3,"2,4 Dienoyl-CoA Reductase deficiency"
4,"2,4-Dienoyl-CoA reductase"
...,...
3150,White sponge naevus
3151,WHOOPING COUGH
3152,Williams syndrome
3153,WILMS TUMOR


In [155]:
#change in lower case
df['Disease Name']=df['Disease Name'].astype(str)
df["Disease Name"] = df['Disease Name'].apply(lambda x:x.lower())
df

Unnamed: 0,Disease Name
0,11ß-hydroxylase deficiency
1,"17a hydroxylase/17,20- lyase deficiency"
2,17-hydroxycorticosteroid
3,"2,4 dienoyl-coa reductase deficiency"
4,"2,4-dienoyl-coa reductase"
...,...
3170,yolk sac tumour: polyvesicular vitelline tumou...
3171,zellweger syndrome
3172,zollinger- ellison syndrome
3173,zoon’s vulvitis


#Disease and Organ finder

In [12]:
#symptoms_plus_disease = 'right side abdominal hernias pain associated with increased after eating x 1 months and fever'
#symptoms_plus_disease = 'complaints of itchy lesions over the legs and below the right buttock, hand, leg, on abdomen'
symptoms_plus_disease = 'itching both eye since 1 yr'
symptoms_plus_disease = symptoms_plus_disease.lower()
symptoms_plus_disease = remove_html_tags(symptoms_plus_disease)
symptoms_plus_disease = remove_url(symptoms_plus_disease)
symptoms_plus_disease = remove_punct(symptoms_plus_disease)
symptoms_plus_disease = stop_word_remover(str(symptoms_plus_disease))
dr_transcription = []
for word in symptoms_plus_disease:
    lem_word = WordNetLemmatizer.lemmatize(word, pos='v')
    dr_transcription.append(lem_word)
#symptoms_plus_disease_wtokenize = word_tokenize(symptoms_plus_disease)
#symptoms_plus_disease_stokenize = sent_tokenize(symptoms_plus_disease)
print(dr_transcription)
#symptoms_plus_disease, 

NameError: name 'remove_url' is not defined

In [200]:
#checking for disease
print(dr_transcription)
def checking_disease(dr_transcription):
    dr_transcription = dr_transcription
    embeddings = model.encode(dr_transcription)

    # Compute similarities
    sim = util.cos_sim(embeddings[0], embeddings[1])
    #print("comparing of 1st with 2nd :->","{0:.4f}".format(sim.tolist()[0][0])) 
    #print(dr_transcription)
    #tokens = nltk.word_tokenize(disease)
    diseases_list = ['fever', 'achondroplasia', 'abdominal hernias','hernias' 'pain', 'hernias', 'itchy', 'itch']
    for disease in dr_transcription:
        if disease in diseases_list:
            return disease
        else:
            return "Not Found"
    



disease = checking_disease(dr_transcription)
print("Disease :->", disease)


['itch', 'eye', 'since', '1', 'yr']
Disease :-> itch


In [189]:
#Disease Matching
from hamcrest import none
embeddings = model.encode(result)



diseases_list = ['fever', 'achondroplasia', 'abdominal hernias', 'pain', 'hernias', 'itchy', 'itch']
dr_transcription = dr_transcription
# Print the embeddings
for sentence, embedding in zip(dr_transcription, embeddings):
    print("Word:", sentence)
    #print("Embedding:", embedding)
    print("")
matched_disease = []
for i in dr_transcription:
    if i in diseases_list:
        matched_disease.append(i)
        continue
    else:
        print(none)


print("Matched Disease :->", matched_disease)
embedding_disease = model.encode(diseases_list)
#print(embedding_disease)
#sim = model.encode(dr_transcription)
#print("comparing of 1st with 2nd :->","{0:.4f}".format(sim.tolist()[0][0])) 

Word: itch

Word: eye

Word: since

<function none at 0x0000013FAE69EB80>
<function none at 0x0000013FAE69EB80>
<function none at 0x0000013FAE69EB80>
<function none at 0x0000013FAE69EB80>
Matched Disease :-> ['itch']


In [1]:
body_organ = pd.read_csv('body_organ.csv')
body_organ.head()

NameError: name 'pd' is not defined

In [196]:
body_organ['Organ'] = body_organ['Organ'].astype(str)
body_organ['Organ'] = body_organ['Organ'].apply(lambda x :x.lower())

In [197]:
body_organ = body_organ['Organ'].to_list()

In [198]:
#checking  body organ
print(dr_transcription)
def check_body_organ(dr_transcription,body_organ):
    dr_transcription = dr_transcription
    body_organ_list = body_organ
    print(body_organ_list)
    matched_body_organ = []
    for i in dr_transcription:
        if i in body_organ_list:
            #print(i)
            matched_body_organ.append(i)
            continue
        else:
            print(none)
    print("Organ :->", matched_body_organ)
bd_organ = check_body_organ(dr_transcription,body_organ)
#print("Organ :", bd_organ)

['itch', 'eye', 'since', '1', 'yr']
['abdomen', 'abdomen (right hypochondium)', 'abdomen (right lumber)', 'abdomen (right eliac region)', 'abdomen (left hypochondium)', 'abdomen (left lumber)', 'abdomen (left eliac region)', 'abdomen (epigastric region)', 'abdomen (umbilical region)', 'abdomen (hypogastrium)', 'ankle', 'back', 'back (lower)', 'back (upper)', 'breast', 'buttock', 'calf', 'chest', 'ear', 'elbow', 'forehead', 'eye', 'face', 'finger', 'foot', 'hair', 'hand', 'head', 'heel', 'hip', 'knee', 'leg', 'lips', 'mouth', 'nail', 'neck', 'nose', 'palm', 'pelvis', 'shin', 'shoulder', 'skin', 'teeth', 'thigh', 'throat', 'thumb', 'toe', 'waist', 'wrist', 'full body', 'anywhere', 'vagina', 'penis', 'nipple', 'heart', 'kidney', 'lungs', 'liver', 'brain', 'bladder', 'stomach', 'intestines', 'uterus', 'blood', 'overy', 'testicles', 'limbs', 'spine', 'gollbladder', 'skull', 'gums', 'parotid gland', 'upper extremities', 'lower extremities', 'not specified', 'salivary glands', 'cartilagenous'

In [192]:
#Sentence Embedding
import pandas as pd
from sentence_transformers import SentenceTransformer

# Download model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# The sentences we'd like to encode
sentences = ['couigh 15 days',
    'cough and cold 15 days',
    'cough since 15 days']
sentences = ['Fever Headache for 20 days',
    'fever and hedache 1 Day',
    'fever, headach frm Last 30 days']


#lower the sentence 
#sentences = [sentence.lower() for sentence in sentences]
df = pd.DataFrame(sentences, columns =['SymptomList'])
df['SymptomList'] = df['SymptomList'].astype(str)
df['SymptomList'] = df['SymptomList'].apply(lambda x:x.lower())


#remove html tags
df['SymptomList'] = df['SymptomList'].apply(remove_html_tags)

#remove url
df['SymptomList'] = df['SymptomList'].apply(remove_url)

#remove punc
df['SymptomList'] = df['SymptomList'].apply(remove_punct)


#converting dataframe to list
sentences = df['SymptomList'].to_list()





#correction of word/sentence
#Sectence Correction
sentences = sentences
#sentences = [sentence.lower() for sentence in sentences]
sentences = [ TextBlob(sentence) for sentence in sentences]
result = [ sentence.correct() for sentence in sentences]            
#sentences = TextBlob(sentences)
#result = sentences.correct()

print(type(result))

#short matching 




#Rule based matching



#token & removing stop words
without_stop_word = []
#stp = [ stop_word_remover(sentence) for sentence in result]
for sentence in result:
    print(type(sentence))
    #stp = stop_word_remover(sentence)
    #without_stop_word.append(stp)
    #without_stop_word.append(stop_word_remover(sentence))


#Synonym checking
from nltk.corpus import wordnet

synonyms = []

for syn in wordnet.synsets("abdomen"):
    for i in syn.lemmas():
        synonyms.append(i.name())

print(set(synonyms))

#print(stp)
# Get embeddings of sentences
embeddings = model.encode(result)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

<class 'list'>
<class 'textblob.blob.TextBlob'>
<class 'textblob.blob.TextBlob'>
<class 'textblob.blob.TextBlob'>
{'abdomen', 'venter', 'abdominal_cavity', 'stomach', 'belly'}
Sentence: fever headache for 20 days
Embedding: [ 0.09523411  0.06470332 -0.30507928 -0.2231611  -0.2347271   0.04536242
  1.0894808   0.58303845  0.5594681   0.1355435   0.43327123 -0.17923024
 -0.03225068 -0.25941533 -0.17211775 -0.44776145 -0.18185028 -0.16525367
 -0.68620193 -0.30656338  0.54526937 -0.13851416  0.03838924 -0.07038049
 -0.39278632  0.17501178 -0.16452442  0.15071122 -0.00760711 -0.6226722
  0.25688344  0.07360788  0.4133746   0.08934994  0.2080049   0.11153354
  0.35777813  0.68070173 -0.2646817  -0.1237617  -0.31689304 -0.47309238
  0.29375988  0.62149227  0.28507033  0.1832299   0.10133252  0.21880583
 -0.04922507  0.25708064 -0.28593236  0.0289024  -0.58608544  0.07013474
  0.44660172  0.2146254   0.5933214  -0.22107434 -0.32148576  0.60399747
 -0.04429484 -0.27790517 -0.25415197  0.7035132

In [193]:
#checking for disease
def checking_disease(symptoms_plus_disease):
    disease = symptoms_plus_disease
    tokens = nltk.word_tokenize(disease)
    diseases_list = ['fever', 'achondroplasia']
    if disease in diseases_list:
        return disease
    else:
        return "Not Found"
    



disease = checking_disease('fever -3days')
print(disease)

Not Found


In [44]:
#checking  body organ
def body_organ(dr_transcription):
    body_organ = symptoms_plus_disease
    dr_transcription = ['right', 'side', 'abdominal', 'hernias', 'pain', 'associate', 'increase', 'eat', 'x', '1', 'months', 'fever']
    body_organ_list = []
    if body_organ in body_organ_list:
        return body_organ
    else:
        return "Not Found any body organ"

In [1]:
symptoms_list = pd.read_csv('symptomsListN.csv')

NameError: name 'pd' is not defined

In [20]:
import requests
test_api =  'http://182.156.200.179:332/api/v1.0/Knowmed/getAllProblemList'
#payload = {"alphabet": "a"}
test_response = requests.post(test_api, json={"alphabet": ""})
response_data = test_response.json()
mydict = {}
problemNames = {}
for i in range(len(response_data['responseValue'])):
    #print(response_data['responseValue'][i]['problemName'])
    #problemNames.append(response_data['responseValue'][i]['id'])
    t = {response_data['responseValue'][i]['id']:response_data['responseValue'][i]['problemName'].lower()}
    problemNames.update(t)
    #x = [{id: id, 'problemName': problemName} for id, problemName in mydict.items() if problemName == search_value]




In [21]:
search_value = 'fever'
tx = [{key:value} for key, value in problemNames.items() if value == search_value]
tx

[{7743: 'fever'}]

In [None]:
import requests
test_api =  'http://182.156.200.179:332/api/v1.0/Knowmed/getAllProblemList'
payload = {"alphabet": "a"}
test_response = requests.post(test_api, json={"alphabet": ""})
response_data = test_response.json()
print(response_data)

In [3]:
%pip install rake-nltk


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


In [8]:
from rake_nltk import Rake

# Uses stopwords for english from NLTK, and all puntuation characters by
# default
r = Rake()
dr_transcription = 'COUGH DRY 1.5 MONTHS CHEST PAIN ON COUGHING FEVER WT LOSS'
# Extraction given the text.
r.extract_keywords_from_text(dr_transcription)

# Extraction given the list of strings where each string is a sentence.
#r.extract_keywords_from_sentences('COUGH DRY 1.5 MONTHS CHEST PAIN ON COUGHING FEVER WT LOSS')

# To get keyword phrases ranked highest to lowest.
r.get_ranked_phrases()

# To get keyword phrases ranked highest to lowest with scores.
r.get_ranked_phrases_with_scores()

[(16.0, 'coughing fever wt loss'),
 (16.0, '5 months chest pain'),
 (9.0, 'cough dry 1')]

In [10]:
import nltk
word2count = {}
dataset = 'COUGH DRY 1.5 MONTHS CHEST PAIN ON COUGHING FEVER WT LOSS'
for data in dataset:
    words = nltk.word_tokenize(data)
    for word in words:
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1

In [11]:
word2count

{'C': 3,
 'O': 5,
 'U': 2,
 'G': 3,
 'H': 4,
 'D': 1,
 'R': 2,
 'Y': 1,
 '1': 1,
 '.': 1,
 '5': 1,
 'M': 1,
 'N': 4,
 'T': 3,
 'S': 4,
 'E': 3,
 'P': 1,
 'A': 1,
 'I': 2,
 'F': 1,
 'V': 1,
 'W': 1,
 'L': 1}

In [None]:
#pip install pytextrank