In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

start = time.time()
df = pd.read_csv("Paper_List.csv").fillna(' ')
print("Reading Time: ", time.time()-start)

Reading Time:  11.268141508102417


In [2]:
df.head(n=10)

Unnamed: 0,Abstract,ArticleTitle,"Level 1 : Analytical, Diagnostic and Therapeutic Techniques, and Equipment",Level 1 : Anatomy,"Level 1 : Anthropology, Education, Sociology, and Social Phenomena",Level 1 : Chemicals and Drugs,Level 1 : Disciplines and Occupations,Level 1 : Diseases,Level 1 : Geographicals,Level 1 : Health Care,...,"Level 2 : Surgical Procedures, Operative","Level 2 : Technology, Industry, and Agriculture",Level 2 : Therapeutics,Level 2 : Tissues,Level 2 : Urogenital System,Level 2 : Viral Structures,Level 2 : Virus Diseases,Level 2 : Viruses,Level 2 : Wounds and Injuries,Pub_Year
0,Double-blind clinical trials become very tedio...,The Automatic Patient Symptom Monitor (APSM): ...,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1993
1,This paper describes the design and implementa...,A pen-based system to support pre-operative da...,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1993
2,The INFORMM (Information Network for Online Re...,The patient problem/nursing diagnosis form: a ...,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1993
3,"We have developed a system that receives ""stat...",An X Window system for statlab results reporting.,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1993
4,We developed a clinical decision support syste...,Clinical performance of a rule-based decision ...,1,0,0,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,1993
5,A ventilator-management advisor (VMA) is a com...,Dynamic selection of models for a ventilator-m...,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1993
6,This paper describes a microcomputer system fo...,Providing clinicians with problem-based access...,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1993
7,Automatic detection of arterial oxygen desatur...,Computerized detection of arterial oxygen desa...,1,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1993
8,The objective of this study was to compare and...,Modeling mortality in the intensive care unit:...,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1993
9,"Currently, in most non-invasive imaging labora...",Non-invasive assessment of cardiovascular mech...,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1993


In [3]:
df.columns

Index(['Abstract', 'ArticleTitle',
       'Level 1 : Analytical, Diagnostic and Therapeutic Techniques, and Equipment',
       'Level 1 : Anatomy',
       'Level 1 : Anthropology, Education, Sociology, and Social Phenomena',
       'Level 1 : Chemicals and Drugs',
       'Level 1 : Disciplines and Occupations', 'Level 1 : Diseases',
       'Level 1 : Geographicals', 'Level 1 : Health Care',
       ...
       'Level 2 : Surgical Procedures, Operative',
       'Level 2 : Technology, Industry, and Agriculture',
       'Level 2 : Therapeutics', 'Level 2 : Tissues',
       'Level 2 : Urogenital System', 'Level 2 : Viral Structures',
       'Level 2 : Virus Diseases', 'Level 2 : Viruses',
       'Level 2 : Wounds and Injuries', 'Pub_Year'],
      dtype='object', length=137)

In [5]:
#Check for the classes that have no instances
for element in df.columns[2:-1]:
    if ((df[element]==0).all()):
        print(element)
        #Remove these classes
        df = df.drop(element, 1)

Level 1 : Publication Characteristics
Level 2 : Publication Components
Level 2 : Publication Formats
Level 2 : Study Characteristics
Level 2 : Support of Research


In [11]:
#Level 1 Columns
df.columns[2:17]

Index(['Level 1 : Analytical, Diagnostic and Therapeutic Techniques, and Equipment',
       'Level 1 : Anatomy',
       'Level 1 : Anthropology, Education, Sociology, and Social Phenomena',
       'Level 1 : Chemicals and Drugs',
       'Level 1 : Disciplines and Occupations', 'Level 1 : Diseases',
       'Level 1 : Geographicals', 'Level 1 : Health Care',
       'Level 1 : Humanities', 'Level 1 : Information Science',
       'Level 1 : Named Groups', 'Level 1 : Organisms',
       'Level 1 : Phenomena and Processes',
       'Level 1 : Psychiatry and Psychology',
       'Level 1 : Technology, Industry, and Agriculture'],
      dtype='object')

In [24]:
import pygal
from IPython.display import display, HTML

base_html = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js""></script>
  </head>
  <body>
    <figure>
      {rendered_chart}
    </figure>
  </body>
</html>
"""

def galplot(chart):
    rendered_chart = chart.render(is_unicode=True)
    plot_html = base_html.format(rendered_chart=rendered_chart)
    display(HTML(plot_html))

def plot_class_dist(colum_names, df_name):
    line_chart = pygal.Bar()
    line_chart.title = 'Class Distribution in (%)'
    
    for element in colum_names:
        line_chart.add(element,len(df_name[df_name[element]==1])/len(df_name)*100)
        if len(df_name[df_name[element]==1])==0:
            print("No Instance in this class: ", element)
    galplot(line_chart)
    
#Level 1 class Distribution
plot_class_dist(df.columns[2:17], df)


In [25]:
#Level 2 distribution
plot_class_dist(df.columns[18:-1], df)

In [26]:
df_train = df[(1990<=df.Pub_Year) &(df.Pub_Year<2005)]
df_validation = df[(2005<=df.Pub_Year) &(df.Pub_Year<2010)]
df_test = df[(2010<=df.Pub_Year)]

len(df) == len(df_train) + len(df_validation) + len(df_test)

True

In [27]:
#Level 1 Distribution of Train, Validation, Test Set
plot_class_dist(df.columns[2:17], df_train)
plot_class_dist(df.columns[2:17], df_validation)
plot_class_dist(df.columns[2:17], df_test)

In [28]:
#Level 2 Distribution 
plot_class_dist(df.columns[18:-1], df_train)
plot_class_dist(df.columns[18:-1], df_validation)
plot_class_dist(df.columns[18:-1], df_test)

No Instance in this class:  Level 2 : Biomedical and Dental Materials
No Instance in this class:  Level 2 : Viral Structures


No Instance in this class:  Level 2 : Biomedical and Dental Materials
No Instance in this class:  Level 2 : Viral Structures


In [44]:
start = time.time()
#Combine abstract and title
df['abstract_title'] = df['ArticleTitle'].str.cat(df['Abstract'], sep='. ')

#Train test split
#Simple split %30~70
train, test = train_test_split(df, test_size=0.3)

print("Train Test Split Time: ", time.time()-start)

Train Test Split Time:  2.1834216117858887


In [66]:
#Creating Char and word Features
start = time.time()
train_text = train['abstract_title']
test_text = test['abstract_title']
all_text = pd.concat([train_text, test_text])
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_features = word_vectorizer.transform(train_text)
test_features = word_vectorizer.transform(test_text)

print("Feature Extraction Time: ", time.time()-start)

Feature Extraction Time:  175.90909266471863


In [67]:
scores = []
result = pd.DataFrame()
class_names = df.columns[2:-2]

for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1, solver='sag')

    cv_score = np.mean(cross_val_score(classifier, train_features, train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    classifier.fit(train_features, train_target)
    result[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))

CV score for class Level 1 : Analytical, Diagnostic and Therapeutic Techniques, and Equipment is 0.8291067756020079
CV score for class Level 1 : Anatomy is 0.8985346824442729
CV score for class Level 1 : Anthropology, Education, Sociology, and Social Phenomena is 0.9335018385253119
CV score for class Level 1 : Chemicals and Drugs is 0.9497715849263231
CV score for class Level 1 : Disciplines and Occupations is 0.8270804204107401
CV score for class Level 1 : Diseases is 0.9451510195441801
CV score for class Level 1 : Geographicals is 0.939949806612374
CV score for class Level 1 : Health Care is 0.8947013339002649
CV score for class Level 1 : Humanities is 0.9411584018987257
CV score for class Level 1 : Information Science is 0.8855034798462352
CV score for class Level 1 : Named Groups is 0.9564193813808473
CV score for class Level 1 : Organisms is 0.9561179065159271
CV score for class Level 1 : Phenomena and Processes is 0.8826374736717538
CV score for class Level 1 : Psychiatry and Psy

CV score for class Level 2 : Polycyclic Compounds is 0.9030856170104272
CV score for class Level 2 : Population Characteristics is 0.9447961101162695
CV score for class Level 2 : Psychological Phenomena is 0.9611731201657071
CV score for class Level 2 : Reproductive and Urinary Physiological Phenomena is 0.9552413015230613
CV score for class Level 2 : Respiratory System is 0.9674479966528388
CV score for class Level 2 : Respiratory Tract Diseases is 0.9708000094462982
CV score for class Level 2 : Sense Organs is 0.9715517859988864
CV score for class Level 2 : Skin and Connective Tissue Diseases is 0.9517749168221213
CV score for class Level 2 : Social Sciences is 0.915637234341245
CV score for class Level 2 : Stomatognathic Diseases is 0.9821906194808303
CV score for class Level 2 : Stomatognathic System is 0.970920427187585
CV score for class Level 2 : Surgical Procedures, Operative is 0.9527558459965957
CV score for class Level 2 : Technology, Industry, and Agriculture is 0.920175407

In [68]:
result

Unnamed: 0,"Level 1 : Analytical, Diagnostic and Therapeutic Techniques, and Equipment",Level 1 : Anatomy,"Level 1 : Anthropology, Education, Sociology, and Social Phenomena",Level 1 : Chemicals and Drugs,Level 1 : Disciplines and Occupations,Level 1 : Diseases,Level 1 : Geographicals,Level 1 : Health Care,Level 1 : Humanities,Level 1 : Information Science,...,Level 2 : Stomatognathic System,"Level 2 : Surgical Procedures, Operative","Level 2 : Technology, Industry, and Agriculture",Level 2 : Therapeutics,Level 2 : Tissues,Level 2 : Urogenital System,Level 2 : Viral Structures,Level 2 : Virus Diseases,Level 2 : Viruses,Level 2 : Wounds and Injuries
0,0.391396,0.971972,0.007108,0.994667,0.036947,0.177104,0.004650,0.062016,0.004186,0.052027,...,0.014295,0.011819,0.027394,0.013839,0.054589,0.019945,0.000008,0.005936,0.040550,0.006290
1,0.241360,0.085602,0.222853,0.186755,0.164713,0.196876,0.227402,0.618584,0.070178,0.403757,...,0.006819,0.013389,0.013626,0.047135,0.009628,0.006671,0.000008,0.005732,0.005671,0.010744
2,0.560748,0.513036,0.004644,0.994569,0.019448,0.079349,0.011220,0.051561,0.003186,0.706645,...,0.006950,0.007192,0.012535,0.014477,0.028665,0.014435,0.000008,0.020810,0.617193,0.003206
3,0.604428,0.773971,0.017457,0.936127,0.028187,0.963033,0.200502,0.241489,0.005085,0.045823,...,0.016379,0.029074,0.017805,0.035495,0.020819,0.010104,0.000008,0.013911,0.028817,0.014021
4,0.660717,0.956004,0.022873,0.490237,0.180439,0.070318,0.014577,0.100136,0.012544,0.101638,...,0.014302,0.065913,0.013150,0.042007,0.096385,0.006641,0.000008,0.005048,0.010016,0.008740
5,0.977108,0.864486,0.021779,0.455555,0.023575,0.607270,0.071950,0.866228,0.006171,0.333287,...,0.475610,0.152435,0.134476,0.066206,0.098039,0.009494,0.000008,0.003110,0.005351,0.036050
6,0.604030,0.316727,0.034832,0.166407,0.059269,0.199339,0.027822,0.356996,0.023911,0.105434,...,0.013986,0.019927,0.021712,0.031357,0.014639,0.009051,0.000008,0.006728,0.010422,0.013833
7,0.236768,0.204313,0.025985,0.910998,0.044802,0.144440,0.318088,0.785220,0.009236,0.032797,...,0.012121,0.012479,0.141424,0.042204,0.011534,0.006942,0.000008,0.010047,0.014079,0.012115
8,0.561769,0.912599,0.006667,0.991723,0.100356,0.095053,0.012961,0.071967,0.002316,0.066420,...,0.013580,0.014268,0.020717,0.024617,0.062959,0.024955,0.000008,0.006339,0.019835,0.004136
9,0.405930,0.116587,0.034332,0.930914,0.066322,0.882783,0.112897,0.642203,0.007877,0.122056,...,0.003208,0.021667,0.013501,0.036591,0.020403,0.009587,0.000008,0.006134,0.009261,0.004386


In [89]:
#Assign Mesh Terms with a treshold of 0.5
result_in_short = []
defined_scores = []
for i in range(len(result)):
    tmp = result.iloc[i][result.iloc[i] >  0.5].keys()
    ground_truth = test.iloc[i][test.iloc[i]==1].keys()
    defined_score = len(tmp.intersection(ground_truth)) / len(ground_truth)
    defined_scores.append(defined_score)
    result_in_short.append([tmp, ground_truth, defined_score])
print("Overall Result: ", np.mean(defined_scores))

Overall Result:  0.6422162010996841


In [93]:
result_in_short[0:5]

[[Index(['Level 1 : Analytical, Diagnostic and Therapeutic Techniques, and Equipment',
         'Level 1 : Anatomy', 'Level 1 : Chemicals and Drugs',
         'Level 1 : Organisms', 'Level 1 : Phenomena and Processes',
         'Level 2 : Amino Acids, Peptides, and Proteins',
         'Level 2 : Biological Factors',
         'Level 2 : Cell Physiological Phenomena', 'Level 2 : Cells',
         'Level 2 : Eukaryota', 'Level 2 : Genetic Phenomena',
         'Level 2 : Hemic and Immune Systems',
         'Level 2 : Investigative Techniques',
         'Level 2 : Physiological Phenomena'],
        dtype='object'),
  Index(['Level 1 : Analytical, Diagnostic and Therapeutic Techniques, and Equipment',
         'Level 1 : Anatomy', 'Level 1 : Chemicals and Drugs',
         'Level 1 : Information Science', 'Level 1 : Organisms',
         'Level 1 : Phenomena and Processes',
         'Level 2 : Amino Acids, Peptides, and Proteins',
         'Level 2 : Biological Factors',
         'Level 2 : Cel

In [92]:
#Assign Mesh Terms with a treshold of 0.35
result_in_short = []
defined_scores = []
for i in range(len(result)):
    tmp = result.iloc[i][result.iloc[i] >  0.35].keys()
    ground_truth = test.iloc[i][test.iloc[i]==1].keys()
    defined_score = len(tmp.intersection(ground_truth)) / len(ground_truth)
    defined_scores.append(defined_score)
    result_in_short.append([tmp, ground_truth, defined_score])
print("Overall Result: ", np.mean(defined_scores))

Overall Result:  0.7391841483650327
