In [28]:
import nltk
import re
import os
import pandas as pd
from dataclasses import dataclass, field
from collections import OrderedDict

In [3]:
lexicon_path = './COVID-Twitter-Symptom-Lexicon.txt'
base_lexicon = pd.read_csv(lexicon_path, delimiter='\t', header=None)
base_lexicon.columns = ['symptom', 'cui', 'expression']
base_lexicon

Unnamed: 0,symptom,cui,expression
0,"Anxiety, stress & general mental health symptoms",C1832070,Anxiety
1,"Anxiety, stress & general mental health symptoms",C1832070,worry
2,"Anxiety, stress & general mental health symptoms",C1832070,stressed
3,"Anxiety, stress & general mental health symptoms",C1832070,anxious
4,"Anxiety, stress & general mental health symptoms",C1832070,panic attacks
...,...,...,...
663,Nasal dryness,C0231919,very dry nose
664,Rash,C0015230,rash in elbow
665,Rash,C0015230,rash all over my body
666,Lymphadenopathy,C0497156,Lymph nodes swollen


In [25]:
class Lexicon:
    def __init__(self, base_txt_path):
        self.base_lexicon = self._load_base_lexicon(base_txt_path)
        self.symptom_dict = {}
        self._build_symptom_dict_from_base()
        self.vocabulary = {}
        self._build_vocabulary()

    def _build_symptom_dict_from_base(self):
        # get array of unique cuis
        cui_array = self.base_lexicon.cui.unique()

        # iterate over cui array
        for cui in cui_array:
            # get sub dataframe from the base lexicon
            symptom_df = self.base_lexicon[self.base_lexicon.cui == cui]

            # get symptom name
            symptom_name = symptom_df.symptom.unique()[0]

            # get list of unique expressions for the symptom
            expression_list = list(symptom_df.expression.unique())

            # assign symptom object to symptom dictionary with cui as key
            symptom_object = Symptom(symptom_name, cui, expression_list)
            symptom_object.get_tokens()
            self.symptom_dict[cui] = symptom_object

    def _build_vocabulary(self):
        # initialize empty list
        full_token_list = []

        # iterate over symptoms, appending tokens to the full_token_list
        for symptom_object in self.symptom_dict.values():
            full_token_list += symptom_object.token_list
        
        # save set of full token list to self.vocabulary
        self.vocabulary = list(set(full_token_list))

    def _build_symptom_count_vectors(self):
        for symptom_object in self.symptom_dict.values():
            symptom_object.build_count_vector(self.vocabulary)

    def _build_total_count_vector(self):
        # build ordered dict from vocabulary list
        count_dict = OrderedDict((word, 0) for word in self.vocabulary)

        # iterate over tokens and increment count dict
        for token in self.token_list:
            count_dict[token] += 1

    # def get_expression_tokens(self, target_symptom: str or None = None, preprocess: bool = False):
    #     # if no target symptom is specified
    #     if target_symptom is None:
    #         # get nested list of expressions for each symptom
    #         packed_expression_list = [s.expression_list for s in self.symptom_dict.values()]

    #         # unpack nested list
    #         unpacked_expression_list = [s for s in list(zip(*packed_expression_list))[0]]


    #     else:
    #         # tokenize expressions for given cui
    #         unpacked_expression_list = self.symptom_dict.get(target_symptom)
    #         assert unpacked_expression_list is not None, "unrecognized cui !!!"
        

        # expression_token_list = []
        # # tokenize lists
        # for expression_str in unpacked_expression_list:
        #     expression_token_list += nltk.tokenize.word_tokenize(expression_str)

        # if preprocess:
        #     print('implement this first :P')
        #     # apply preprocessing to tokens

        # return expression_token_list
        

    def _preprocess_tokens(self, expression_list):
        print('also implement this !!!')
        # stem tokens
        # stemmer = nltk.stem.PorterStemmer()
        # stemmed_tokens = [stemmer.stem(t) for t in cleaned_tokens]


    def _load_base_lexicon(self, lexicon_path):
        base_lexicon = pd.read_csv(lexicon_path, delimiter='\t', header=None)
        base_lexicon.columns = ['symptom', 'cui', 'expression']
        return base_lexicon

# @dataclass
class Symptom:
    def __init__(self, name: str, cui: str, expression_list: list[str] = []):
        self.name = name
        self.cui = cui
        self.expression_list = expression_list
        self.token_list = []
        self.token_count_vector = []
        self.get_tokens()

    def get_tokens(self, output: bool = False):
        # reset token list
        self.token_list = []

        # iterate over expressions
        for expression_str in self.expression_list:
            # tokenize each and add to token list
            self.token_list += nltk.tokenize.word_tokenize(expression_str)
        
        # if output is True, return self.token_list
        if output:
            return self.token_list
        
    def build_count_vector(self, vocabulary):
        # build ordered dict from vocabulary list
        count_dict = OrderedDict((word, 0) for word in vocabulary)

        # iterate over tokens and increment count dict
        for token in self.token_list:
            count_dict[token] += 1

    



In [26]:
test_lexicon = Lexicon(lexicon_path)
test_lexicon.symptom_dict

{'C1832070': <__main__.Symptom at 0x7fb98a1c9d00>,
 'C0008031': <__main__.Symptom at 0x7fb9792242e0>,
 'C0010200': <__main__.Symptom at 0x7fb9792d3f10>,
 'C0013404': <__main__.Symptom at 0x7fb9791ee1f0>,
 'C0235710': <__main__.Symptom at 0x7fb9791ee130>,
 'C0015672': <__main__.Symptom at 0x7fb98a1c9f40>,
 'C2363731': <__main__.Symptom at 0x7fb98a1c9e20>,
 'C0030252': <__main__.Symptom at 0x7fb98a1c9550>,
 'C0012833': <__main__.Symptom at 0x7fb98a1c9df0>,
 'C0018681': <__main__.Symptom at 0x7fb98a1c9b50>,
 'C0948596': <__main__.Symptom at 0x7fb979224520>,
 'C0149931': <__main__.Symptom at 0x7fb98a1c9e50>,
 'C0015967': <__main__.Symptom at 0x7fb98a1c9100>,
 'C0085593': <__main__.Symptom at 0x7fb98a1c9430>,
 'C0027497': <__main__.Symptom at 0x7fb98a1375b0>,
 'C0151827': <__main__.Symptom at 0x7fb97923b670>,
 'C0232462': <__main__.Symptom at 0x7fb98a1c9be0>,
 'C0003126': <__main__.Symptom at 0x7fb98a1c9f10>,
 'C2364111': <__main__.Symptom at 0x7fb98a137f70>,
 'C1260880': <__main__.Symptom 

In [27]:
test_expr_tokens = test_lexicon.get_expression_tokens()
test_expr_tokens[:10]

['Anxiety',
 'pain',
 'with',
 'my',
 'lungs',
 'coughing',
 'out',
 'of',
 'breath',
 'tight']

In [24]:
token_freq_dist = nltk.FreqDist(test_expr_tokens)
token_freq_dist.most_common(20)

[('in', 3),
 ('pain', 2),
 ('my', 2),
 ('of', 2),
 ('pressure', 2),
 ('could', 2),
 ('not', 2),
 ('nose', 2),
 ('aches', 2),
 ('blood', 2),
 ('infection', 2),
 ('Anxiety', 1),
 ('with', 1),
 ('lungs', 1),
 ('coughing', 1),
 ('out', 1),
 ('breath', 1),
 ('tight', 1),
 ('chest', 1),
 ('super', 1)]

In [None]:
test_symptom.expression_list += ['feels bad', 'feels rly bad']

In [None]:
"""
# possible approach

# convert all expressions to tf-idf representation
instead use category frequency instead of tf
in other words, get count vector of all expressions in category A, then divide each value by the count vectors in all categories

"""