# Load Dataset

In [6]:
import pandas as pd
from pathlib import Path

usedcols = ['sentence', 'term1', 'term2']

df_caus = pd.read_csv(
    Path('..', 'data', 'crowd_truth_cause.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usedcols
)
df_caus["is_cause"] = 1
df_caus["is_treat"] = 0
df_treat = pd.read_csv(
    Path('..', 'data', 'crowd_truth_treat.csv'),
    sep=',', quotechar='"',
    skipinitialspace=True,
    encoding='utf-8',
    on_bad_lines='skip',
    usecols=usedcols
)
df_treat["is_treat"] = 1
df_treat["is_cause"] = 0
df = df_caus.append(df_treat, ignore_index=True)
df


  df = df_caus.append(df_treat, ignore_index=True)


Unnamed: 0,term1,term2,sentence,is_cause,is_treat
0,AUTISM,TANTRUM,"The limited data suggest that, in children wit...",1,0
1,SLEEP PROBLEM,FAMILY STRESS,SLEEP PROBLEMs are associated with difficult b...,1,0
2,CEREBELLAR ATAXIA,DYSFUNCTION OF THE CEREBELLUM,The term CEREBELLAR ATAXIA is employed to indi...,1,0
3,CEREBELLAR DEGENERATION,CHRONIC ETHANOL ABUSE,Non hereditary causes of cerebellar degenerati...,1,0
4,HEART PROBLEM,ARTHRITIS,The disorder can present with a migratory ture...,1,0
...,...,...,...,...,...
7963,PARKINSON'S DISEASE,AMANTADINE,A 61 year old man with PARKINSON'S DISEASE (PD...,0,1
7964,DEPRESSION,IMIPRAMINE,With successful treatment of the patient's dep...,0,1
7965,ANGI,BEPRIDIL,Five of 15 patients receiving bepridil did not...,0,1
7966,HEMOPHILIA A,FACTOR VIII,The development of antibodies to factor VIII i...,0,1


# Preprocessing

In [7]:
# Make case insensitive (no loss because emphasis on words does not play a role)
df['sentence'] = df['sentence'].map(lambda x: x.lower())
# Replace entities in sentence with placeholder tokens (may be useful for generalization when using n-grams)
df['sentence'] = df.apply(lambda x: x['sentence'].replace(x['term1'].lower(), 'TERM_ONE'), axis=1)
df['sentence'] = df.apply(lambda x: x['sentence'].replace(x['term2'].lower(), 'TERM_TWO'), axis=1)

for i in range(5):
    print(df['sentence'][i])

df = df[df['sentence'].apply(lambda x: 'TERM_ONE' in x and 'TERM_TWO' in x)]

print(f"Number of docs: {len(df)}")

the limited data suggest that, in children with mental retardation, TERM_ONE is associated with aggression, destruction of property, and TERM_TWOs.
TERM_ONEs are associated with difficult behaviors and TERM_TWO, and are often a focus of clinical attention over and above the primary asd diagnosis.
the term TERM_ONE is employed to indicate ataxia that is due to TERM_TWO
non hereditary causes of TERM_ONE include TERM_TWO, paraneoplastic TERM_ONE, high altitude cerebral oedema, coeliac disease, normal pressure hydrocephalus and cerebellitis.
the disorder can present with a migratory ture of TERM_TWO with many other features like TERM_ONEs, skin rash, gait abnormality and skin nodules.
Number of docs: 7821


In [8]:
# Convert labels to right dtype
label_cols = ['is_cause', 'is_treat']
df['is_cause'] = df['is_cause'].astype(float).astype(int)
df['is_treat'] = df['is_treat'].astype(float).astype(int)
df[label_cols].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_cause'] = df['is_cause'].astype(float).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_treat'] = df['is_treat'].astype(float).astype(int)


Unnamed: 0,is_cause,is_treat
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [9]:
import nltk
nltk.download('punkt') # for tokanization
nltk.download('stopwords') # for stopword removal

# Tokenize the sentences
df['tokens'] = df['sentence'].apply(lambda x: nltk.word_tokenize(x))
# Remove stop words and tokens with length smaller than 2 (i.e. punctuations)
df['tokens'] = df['tokens'].apply(lambda x: [token for token in x if token not in nltk.corpus.stopwords.words('english') and len(token) > 1])
# Perform stemming
porter = nltk.PorterStemmer()
df['tokens_stem'] = df['tokens'].apply(lambda x: [porter.stem(token) for token in x])
for i in range(5):
    print(df['tokens_stem'][i])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['sentence'].apply(lambda x: nltk.word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df['tokens'].apply(lambda x: [token for token in x if token not in nltk.corpus

['limit', 'data', 'suggest', 'children', 'mental', 'retard', 'term_on', 'associ', 'aggress', 'destruct', 'properti', 'term_two']
['term_on', 'associ', 'difficult', 'behavior', 'term_two', 'often', 'focu', 'clinic', 'attent', 'primari', 'asd', 'diagnosi']
['term', 'term_on', 'employ', 'indic', 'ataxia', 'due', 'term_two']
['non', 'hereditari', 'caus', 'term_on', 'includ', 'term_two', 'paraneoplast', 'term_on', 'high', 'altitud', 'cerebr', 'oedema', 'coeliac', 'diseas', 'normal', 'pressur', 'hydrocephalu', 'cerebel']
['disord', 'present', 'migratori', 'ture', 'term_two', 'mani', 'featur', 'like', 'term_on', 'skin', 'rash', 'gait', 'abnorm', 'skin', 'nodul']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens_stem'] = df['tokens'].apply(lambda x: [porter.stem(token) for token in x])


In [10]:
# Dependencies for WorNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# Perform lemmatization
lemmatizer = nltk.stem.WordNetLemmatizer()
df['tokens_lemma'] = df['tokens_stem'].apply(lambda x: [lemmatizer.lemmatize(token) for token in x])
for i in range(5):
    print(df['tokens_lemma'][i])

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ana\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['limit', 'data', 'suggest', 'child', 'mental', 'retard', 'term_on', 'associ', 'aggress', 'destruct', 'properti', 'term_two']
['term_on', 'associ', 'difficult', 'behavior', 'term_two', 'often', 'focu', 'clinic', 'attent', 'primari', 'asd', 'diagnosi']
['term', 'term_on', 'employ', 'indic', 'ataxia', 'due', 'term_two']
['non', 'hereditari', 'caus', 'term_on', 'includ', 'term_two', 'paraneoplast', 'term_on', 'high', 'altitud', 'cerebr', 'oedema', 'coeliac', 'diseas', 'normal', 'pressur', 'hydrocephalu', 'cerebel']
['disord', 'present', 'migratori', 'ture', 'term_two', 'mani', 'featur', 'like', 'term_on', 'skin', 'rash', 'gait', 'abnorm', 'skin', 'nodul']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens_lemma'] = df['tokens_stem'].apply(lambda x: [lemmatizer.lemmatize(token) for token in x])


# time expression recognition
standalone to python-time