In [None]:
import os
import pandas as pd
import csv
import re
import sys
import math
import nltk
import string
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Reading the Data

In [None]:
# Read the tab-separated gold-standard file into a DataFrame and show its dimensions
data = pd.read_csv(
    "../resources/processed-goldstandard-XMLTXT.tsv",
    delimiter="\t",
    encoding="utf-8",
)
data.shape

# Functions to tokenize, remove stop words, and get stems

In [None]:
# Get Stopwords
# Fetch the NLTK resources used below: the stop-word corpus and the Punkt
# tokenizer data that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.8.2 loads the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' package; request both so the notebook runs on old and new releases.
nltk.download('punkt_tab')
# English stop words as returned by NLTK; used by the tokenize* helpers for filtering.
stopWords = stopwords.words('english')

def tokenizePorter(text):
    """Tokenize ``text``, drop stop words and Porter-stem what is left.

    The text is split with NLTK's ``word_tokenize``; tokens that appear in the
    module-level ``stopWords`` list (exact match) are discarded and each
    remaining token is passed through a ``PorterStemmer``.

    Returns the stems joined by single spaces as one string.
    """
    porter = PorterStemmer()
    kept = (word for word in word_tokenize(text) if word not in stopWords)
    return ' '.join(porter.stem(word) for word in kept)

def tokenizeSnowball(text):
    """Tokenize ``text``, drop stop words and stem with the English Snowball stemmer.

    Tokens come from NLTK's ``word_tokenize``. Any token found in the
    module-level ``stopWords`` list (exact match) is skipped; the others are
    stemmed with ``SnowballStemmer("english")``.

    Returns a single string with the stems separated by one space each.
    """
    snowball = SnowballStemmer("english")
    stemmed = []
    for token in word_tokenize(text):
        if token in stopWords:
            continue
        stemmed.append(snowball.stem(token))
    return ' '.join(stemmed)

# Preprocessing the data

In [None]:
# The row text below is produced with Series.to_string(), whose value
# formatting follows pandas' display options; the column-width limit is lifted
# (presumably so long abstracts are not shortened -- confirm on your pandas version).
# None means "no limit"; the former -1 sentinel is deprecated since pandas 1.0
# and rejected by current releases.
pd.set_option('display.max_colwidth', None)

# Preprocessing the Text
# Translation table: newlines become spaces, every punctuation character is deleted.
removePunctuation = str.maketrans('\n', ' ', string.punctuation)

# Transform the text to lower case, remove punctuation, get the stems of the words.
# ';' and '/' are turned into spaces *before* punctuation is stripped, so the
# terms they separate do not get glued together when the characters are deleted.
data['title_abstract_mesh'] = data[['title', 'abstract', "major_mesh", "minor_mesh"]].apply(
    lambda x: re.sub(r';|\/', ' ', x.to_string(index=False).lower()).translate(removePunctuation),
    axis=1,
)
data['title_abstract_mesh_stemmed'] = data['title_abstract_mesh'].apply(tokenizeSnowball)

# Splitting Sets First into PM and not PM

## PM dataset

In [None]:
# Rows whose 'pm_rel_desc' text matches "Human PM" or "Animal PM"
pmSet = data.loc[data['pm_rel_desc'].str.contains('Human PM|Animal PM', regex=True)]
pmSet.shape

## Not PM dataset

In [None]:
# Rows whose 'pm_rel_desc' text matches "Not PM"
notPmSet = data.loc[data['pm_rel_desc'].str.contains('Not PM', regex=True)]
notPmSet.shape

## TFIDF weighting

In [None]:
def tfidfMeanWeight(data):
    """Compute the mean TF-IDF weight of every term over the documents in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'title_abstract_mesh_stemmed' column with one text per row.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary term with the columns 'term' and 'weight',
        where 'weight' is the term's TF-IDF value averaged over all rows.
    """
    tvec = TfidfVectorizer()
    tvecWeights = tvec.fit_transform(data['title_abstract_mesh_stemmed'])

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and only fall back on releases that predate it.
    if hasattr(tvec, 'get_feature_names_out'):
        terms = list(tvec.get_feature_names_out())
    else:
        terms = tvec.get_feature_names()

    # Column-wise mean of the sparse document-term matrix, flattened to a plain list
    weights = np.asarray(tvecWeights.mean(axis=0)).ravel().tolist()
    weightsDf = pd.DataFrame({'term': terms, 'weight': weights})
    return weightsDf

In [None]:
# Ten terms with the highest mean TF-IDF weight in the PM subset
weightsPM = tfidfMeanWeight(pmSet)
topPM = weightsPM.sort_values('weight', ascending=False).iloc[:10]
topPM

In [None]:
# Ten terms with the highest mean TF-IDF weight in the Not PM subset
weightsNotPM = tfidfMeanWeight(notPmSet)
topNotPM = weightsNotPM.sort_values('weight', ascending=False).iloc[:10]
topNotPM

In [None]:
# Put both top-10 lists side by side; a term missing from one list gets NaN there
mergedOnlyPM = topPM.merge(topNotPM, how="outer", on="term", suffixes=("_pm", "_notpm"))
mergedOnlyPM

# Splitting DataSets Later

In [None]:
# Based on: https://buhrmann.github.io/tfidf-analysis.html

def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

        row      -- 1-D array of tfidf values, indexable by integer position.
        features -- sequence of feature names, aligned with ``row``.
        top_n    -- maximum number of entries to return.

        Returns a DataFrame with the columns 'feature' and 'tfidf', ordered by
        descending tfidf value. The frame is empty (but keeps both columns)
        when ``row`` is empty or ``top_n`` is 0.
    '''
    # argsort is ascending, so reverse it before taking the first top_n positions
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor also works for an empty result;
    # assigning df.columns afterwards raises a length-mismatch ValueError then.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_mean_feats(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=100):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

        Xtr       -- sparse document-term matrix (must support row indexing and ``.toarray()``).
        features  -- feature names aligned with the columns of ``Xtr``.
        grp_ids   -- row selector passed to ``Xtr[...]``; ``None`` selects every row.
        min_tfidf -- values below this threshold are zeroed before averaging.
        top_n     -- number of features to return.

        Returns the DataFrame produced by ``top_tfidf_feats`` for the column means.
    '''
    # Compare against None explicitly: truth-testing a multi-element numpy index
    # array raises, and an empty list would silently select all documents.
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # toarray() yields a fresh dense array, so zeroing in place leaves Xtr untouched
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)

def top_feats_by_class(Weights, data, features, min_tfidf=0.1, top_n=100):
    ''' Build one top-features DataFrame per distinct class label in ``data``.

        For every unique value of ``data`` (in the order given by ``np.unique``),
        the rows carrying that value are located and handed to ``top_mean_feats``
        together with ``min_tfidf`` and ``top_n``. The class value is attached to
        the resulting frame as its ``label`` attribute.

        Returns the list of these DataFrames. '''
    per_class = []
    for class_label in np.unique(data):
        row_ids = np.where(data == class_label)
        class_df = top_mean_feats(Weights, features, row_ids, min_tfidf=min_tfidf, top_n=top_n)
        class_df.label = class_label
        per_class.append(class_df)
    return per_class

def top_feats_pm_notpm(Weights, data, features, min_tfidf=0.1, top_n=50):
    ''' Return a two-element list of dfs holding the top_n features and their mean tfidf value
        for the PM group and for the Not PM group, in that order.

        Weights   -- sparse document-term matrix, one row per entry of ``data``.
        data      -- class labels; "Human PM" and "Animal PM" together form the PM
                     group, "Not PM" forms the other group.
        features  -- feature names aligned with the columns of ``Weights``.
        min_tfidf -- forwarded to ``top_mean_feats``.
        top_n     -- forwarded to ``top_mean_feats``.

        Each returned frame carries its group name ("PM" / "Not PM") in its
        ``label`` attribute. '''
    dfs = []

    # Combine the two conditions element-wise. `np.where(a) or np.where(b)`
    # always evaluates to the first tuple (a non-empty tuple is truthy), which
    # silently left every "Animal PM" row out of the PM group.
    ids = np.where((data == "Human PM") | (data == "Animal PM"))
    feats_df = top_mean_feats(Weights, features, ids, min_tfidf=min_tfidf, top_n=top_n)
    feats_df.label = "PM"
    dfs.append(feats_df)

    ids = np.where(data == "Not PM")
    feats_df = top_mean_feats(Weights, features, ids, min_tfidf=min_tfidf, top_n=top_n)
    feats_df.label = "Not PM"
    dfs.append(feats_df)

    return dfs

In [None]:
# Fit one TF-IDF model on the whole corpus, keeping at most 50000 vocabulary terms
vec = TfidfVectorizer(max_features=50000)
tvecWeights = vec.fit_transform(data['title_abstract_mesh_stemmed'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# present and keep `features` a plain list either way.
if hasattr(vec, 'get_feature_names_out'):
    features = list(vec.get_feature_names_out())
else:
    features = vec.get_feature_names()

# Human PM, Animal PM, Not PM

In [None]:
# Top features per 'pm_rel_desc' class, printed one class after the other
dfs = top_feats_by_class(tvecWeights, data["pm_rel_desc"], features)
newDict = {}
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

# dfs follows the sorted label order produced by np.unique inside
# top_feats_by_class; the merges assume that order is Animal PM, Human PM,
# Not PM -- TODO confirm no other label occurs in 'pm_rel_desc'.
mergedPm = pd.merge(dfs[0], dfs[1], on="feature", how="outer", suffixes=["_animal", "_human"])
# After the first merge no column named 'tfidf' is left on the left side, so
# merge suffixes would never be applied and the Not PM values would show up as
# a bare 'tfidf' column. Rename it explicitly instead.
mergedPmNotPm = pd.merge(
    mergedPm,
    dfs[2].rename(columns={"tfidf": "tfidf_not_pm"}),
    on="feature",
    how="outer",
)

with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    display(mergedPmNotPm)

## PM and NOT PM

In [None]:
# Top features for the PM group and the Not PM group
dfs = top_feats_pm_notpm(tvecWeights, data["pm_rel_desc"], features)
newDict = {}

for df in dfs:
    # label, frame and a blank-line spacer, each on its own line
    print(df.label, df, "\n", sep="\n")

# Side-by-side view of both groups; a feature missing from one group gets NaN there
merged = dfs[0].merge(dfs[1], how="outer", on="feature", suffixes=("_pm", "_notpm"))

with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    display(merged)

# Relevance Score

In [None]:
# Top features for each distinct value of 'relevance_score'
dfs = top_feats_by_class(tvecWeights, data["relevance_score"], features)
for df in dfs:
    # label, frame and a blank-line spacer, each on its own line
    print(df.label, df, "\n", sep="\n")

## Topic

In [None]:
# Top features for each distinct value of 'trec_topic_disease'
dfs = top_feats_by_class(tvecWeights, data["trec_topic_disease"], features)
for df in dfs:
    # label, frame and a blank-line spacer, each on its own line
    print(df.label, df, "\n", sep="\n")

## Topic + PM and NOT PM

In [None]:
# Combined "<topic> <PM label>" class label, built directly from the two columns.
# The earlier row-wise Series.to_string() produced display-formatted text
# (alignment padding, dependence on pandas display options), which gave the
# labels irregular spacing.
data['topic_pm'] = data['trec_topic_disease'].astype(str) + ' ' + data['pm_rel_desc'].astype(str)

# Top features for every topic / PM-label combination
dfs = top_feats_by_class(tvecWeights, data["topic_pm"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")

## Topic + Relevance

In [None]:
# Combined "<topic> <relevance score>" class label, built directly from the two columns.
# The earlier row-wise Series.to_string() produced display-formatted text
# (alignment padding, dependence on pandas display options), which gave the
# labels irregular spacing.
data['topic_relevance'] = data['trec_topic_disease'].astype(str) + ' ' + data['relevance_score'].astype(str)

# Top features for every topic / relevance-score combination
dfs = top_feats_by_class(tvecWeights, data["topic_relevance"], features)
for df in dfs:
    print(df.label)
    print(df)
    print("\n")