# Metadata

```yaml
course:   DS 5001 
topic:    Generating Data Files
author:   Andrew Chaphiv (acgq2@virginia.edu)
date:    SPR2023
```


# Importing Modules

In [35]:
import pandas as pd 
import numpy as np
import nltk
import re 

# Subsetting DataFrame

In [1]:
# There was a bunch of nonenglish articles present in the initial files, so need to get rid of those 
df = pd.read_csv("condensedabstracts.csv", index_col = 0)
non_english = df.Authors.str.match("\[(Article\s+in\s+[A-Za-z-]+)\]").to_frame() # Some articles are not in english, need to discard.
non_english = non_english[non_english["Authors"] == True].index
chinese = df.Authors.str.match("[Article in Chinese; Abstract available in Chinese from the publisher]").to_frame()
chinese = chinese[chinese["Authors"] == True].index
df = df.drop(chinese)
chinese = df.Authors.str.find("[Article in Chinese; Abstract available in Chinese from the publisher]").to_frame()
chinese = chinese[chinese["Authors"] != -1].index
df = df.drop(chinese)
df = df.drop(non_english)

df = df.reset_index(inplace=False, drop = True)
df.index.names = ["abstract_num"] 

df.to_csv("abstracts-LIB.csv")

NameError: name 'pd' is not defined

# Tokenizing

In [8]:
PARAS = df["Abstract"].to_frame('para_str')
PARAS

Unnamed: 0_level_0,para_str
abstract_num,Unnamed: 1_level_1
0,Interleukin-6 (IL-6) has previously been shown...
1,The goal of screening programmes for cancer is...
2,Non-parametric methods have recently been prop...
3,The observation that charcoal-treated fetal bo...
4,A new de-N-acetylated glycosphingolipid termed...
...,...
9793,OBJECTIVE: To characterize the supportive care...
9794,INTRODUCTION: Outcomes in colorectal cancer tr...
9795,OBJECTIVES: Chronic thromboembolic pulmonary h...
9796,BACKGROUND Robot-assisted radical prostatectom...


In [18]:
SENTS = PARAS.para_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame('sent_str')
SENTS.index.names = ["abstract_num", "sent_num"]

In [20]:
SENTS

Unnamed: 0_level_0,Unnamed: 1_level_0,sent_str
abstract_num,sent_num,Unnamed: 2_level_1
0,0,Interleukin-6 (IL-6) has previously been shown...
0,1,"However, the mechanisms leading to increased I..."
0,2,We have studied the effects of synthetic ceram...
0,3,The synthetic ceramides C2- and C6-ceramide as...
0,4,We propose that the sphingomyelin pathway is p...
...,...,...
9797,10,Histopathology identified high-grade myxofibro...
9797,11,Postoperative intensity-modulated radiation th...
9797,12,The patient had greatly improved neurological ...
9797,13,CONCLUSIONS We reported a case of an unresecta...


In [34]:
TOKENS = SENTS.sent_str\
            .apply(lambda x: pd.Series(nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x))))\
            .stack()\
            .to_frame('pos_tuple')

In [36]:
TOKENS.index.names = ["abstract_num", "sent_num", "token_num"]
TOKENS['pos'] = TOKENS.pos_tuple.apply(lambda x: x[1])
TOKENS['token_str'] = TOKENS.pos_tuple.apply(lambda x: x[0])
TOKENS['term_str'] = TOKENS.token_str.str.lower()

In [37]:
TOKENS = TOKENS[TOKENS.term_str != '']
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pos_tuple,pos,token_str,term_str
abstract_num,sent_num,token_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,"(Interleukin-6, JJ)",JJ,Interleukin-6,interleukin-6
0,0,1,"((IL-6), NN)",NN,(IL-6),(il-6)
0,0,2,"(has, VBZ)",VBZ,has,has
0,0,3,"(previously, RB)",RB,previously,previously
0,0,4,"(been, VBN)",VBN,been,been
...,...,...,...,...,...,...
9797,14,28,"(the, DT)",DT,the,the
9797,14,29,"(size,, NN)",NN,"size,","size,"
9797,14,30,"(location,, NN)",NN,"location,","location,"
9797,14,31,"(or, CC)",CC,or,or


In [38]:
TOKENS.to_csv("abstracts-CORPUS.csv")

# Generating Vocab Table

In [39]:
def bag_words(df, bag = ["sent_num"]):
    bow = df.groupby(bag+['term_str']).term_str.count().to_frame('n')
    return bow

def TFIDF(BOW, tf_method = 'sum'):
    #sum, max, log, double_norm, raw, binary
    DTCM = BOW.n.unstack().fillna(0).astype('int')
    if tf_method == 'sum':
        TF = DTCM.T / DTCM.T.sum()
    elif tf_method == 'max':
        TF = DTCM.T / DTCM.T.max()
    elif tf_method == 'log':
        TF = np.log2(1 + DTCM.T)
    elif tf_method == 'raw':
        TF = DTCM.T
    elif tf_method == 'double_norm':
        TF = DTCM.T / DTCM.T.max()
    elif tf_method == 'binary':
        TF = DTCM.T.astype('bool').astype('int')
    TF = TF.T
    N = DTCM.shape[0]
    DF = DTCM.astype('bool').sum()
    IDF = np.log2(N / DF)
    TFIDF = TF * IDF
    BOW['tf'] = TF.stack()
    BOW['tfidf'] = TFIDF.stack()
    VOCAB['df'] = DF
    VOCAB['idf'] = IDF
    return TFIDF

In [40]:
VOCAB = TOKENS.term_str.value_counts().to_frame('n')
VOCAB.index.name = 'term_str'
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['max_pos'] = TOKENS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB['cat_pos'] = TOKENS[['term_str','pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

from nltk.stem.snowball import SnowballStemmer
stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.apply(lambda x: stemmer2.stem(x.name), 1)

from nltk.stem.lancaster import LancasterStemmer
stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.apply(lambda x: stemmer3.stem(x.name), 1)

In [41]:
bow = bag_words(TOKENS)
tfidf = TFIDF(bow, tf_method = 'max')
VOCAB['tfidf_mean'] = bow.groupby('term_str').tfidf.mean() #TFIDF[TFIDF > 0].mean().fillna(0) # EXPLAIN
VOCAB['tfidf_sum'] = tfidf.sum()
VOCAB['tfidf_median'] = bow.groupby('term_str').tfidf.median() #TFIDF[TFIDF > 0].median().fillna(0) # EXPLAIN
VOCAB['tfidf_max'] = tfidf.max()
VOCAB['dfidf'] = VOCAB.df * VOCAB.idf

In [42]:
VOCAB.to_csv("abstracts-VOCAB.csv")