## Dependencies

In [1]:
import pandas as pd
from sklearn.utils import Bunch
import regex as re
import sqlite3
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizer, BertForMaskedLM
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
pd.set_option('display.max_colwidth', None)
import torch
import torch.nn as nn
from transformers import AdamW
import torch.nn.functional as F

We establish a connection to our database using the sqlite module

In [2]:
connection = sqlite3.connect('publications_graph.db')
cursor = connection.cursor()

Get the schema for all tables in the database.

In [3]:
cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

for table in tables:
    print("Table Name:", table[0])
    print("Table Schema:", table[1])


Table Name: works
Table Schema: CREATE TABLE works(
published_year,
published_month,
abstract,
title,
doi,
published_day
)
Table Name: work_references
Table Schema: CREATE TABLE work_references(
work_id,
doi
)
Table Name: author_affiliations
Table Schema: CREATE TABLE author_affiliations(
author_id,
name
)
Table Name: work_subjects
Table Schema: CREATE TABLE work_subjects(
name,
work_id
)
Table Name: work_authors
Table Schema: CREATE TABLE work_authors(
work_id,
id,
orcid
)
Table Name: cdindex
Table Schema: CREATE TABLE cdindex(doi, cdindex)
Table Name: data
Table Schema: CREATE TABLE data(
  doi,
  title,
  abstract,
  cdindex,
  published_month,
  published_year
)


Create a new table by joining the abstracts with the cdindex based on doi column

In [4]:
cursor.execute('CREATE TABLE data AS SELECT works.doi, title, abstract, cdindex, published_month, published_year FROM cdindex JOIN works ON cdindex.doi = works.doi WHERE cdindex IS NOT NULL')
connection.commit()

OperationalError: table data already exists

We put our data in a dataframe

In [5]:
df = pd.read_sql_query('SELECT abstract, cdindex FROM data', connection)
df

Unnamed: 0,abstract,cdindex
0,"<jats:title>Abstract</jats:title><jats:p>The photolysis of dilute solutions of octacyclosulphur or hexacyclosulphur in n-hexane with 253.6 nm UV radiation produces S and possibly S<jats:sub>2</jats:sub>. The ‘ring-opening’ yields of these sulphur molecules range from 0.2 to 0.7. When the hydrogen end-capped polyyne C<jats:sub>10</jats:sub>H<jats:sub>2</jats:sub> is irradiated in n-hexane, it transforms into unidentified products with a quantum yield of 3×10<jats:sup>−5</jats:sup>. When octacyclosulphur is added to the solution, the yield rises to 7×10<jats:sup>−3</jats:sup>. The putative sulphur-bearing product(s) could not be identified. It is suggested that sulphur-bearing molecules might be formed in astronomical settings by reactions of carbon molecules having triple or double C—C bonds with photolytically produced S and/or S<jats:sub>2</jats:sub>.</jats:p>",0.000000
1,"<jats:title>Abstract</jats:title><jats:p>Examples are given of unforeseen damage or difficulties arising during or after construction of buildings, wharves, foundations, tunnels, in-ground storage, pipe lines, and embankments. The underlying potentially latent mechanisms of groundwater are classified into four groups, namely water pressure changes, physical changes, erosion, and affects of earthworks, in order to assist early identification of some of the types of change in ground properties which may be induced by site operations and may lead to damage. It is concluded that otherwise unforeseen damage is more likely to be averted by a greater degree of investigation during construction into the consequences of the construction sequences, method and plant used.</jats:p>",0.000000
2,"<jats:p> Acute hypoxia causes pulmonary vasoconstriction and coronary vasodilation. The divergent effects of hypoxia on pulmonary and coronary vascular smooth muscle cells suggest that the mechanisms involved in oxygen sensing and downstream effectors are different in these two types of cells. Since production of reactive oxygen species (ROS) is regulated by oxygen tension, ROS have been hypothesized to be a signaling mechanism in hypoxia-induced pulmonary vasoconstriction and vascular remodeling. Furthermore, an increased ROS production is also implicated in arteriosclerosis. In this study, we determined and compared the effects of hypoxia on ROS levels in human pulmonary arterial smooth muscle cells (PASMC) and coronary arterial smooth muscle cells (CASMC). Our results indicated that acute exposure to hypoxia (Po<jats:sub>2</jats:sub> = 25–30 mmHg for 5–10 min) significantly and rapidly decreased ROS levels in both PASMC and CASMC. However, chronic exposure to hypoxia (Po<jats:sub>2</jats:sub> = 30 mmHg for 48 h) markedly increased ROS levels in PASMC, but decreased ROS production in CASMC. Furthermore, chronic treatment with endothelin-1, a potent vasoconstrictor and mitogen, caused a significant increase in ROS production in both PASMC and CASMC. The inhibitory effect of acute hypoxia on ROS production in PASMC was also accelerated in cells chronically treated with endothelin-1. While the decreased ROS in PASMC and CASMC after acute exposure to hypoxia may reflect the lower level of oxygen substrate available for ROS production, the increased ROS production in PASMC during chronic hypoxia may reflect a pathophysiological response unique to the pulmonary vasculature that contributes to the development of pulmonary vascular remodeling in patients with hypoxia-associated pulmonary hypertension. </jats:p>",-0.002938
3,"<jats:title>Abstract</jats:title>\n <jats:p>Despite rapid advances in the stem cell field, the ability to identify and track transplanted or migrating stem cells in vivo is limited. To overcome this limitation, we used magnetic resonance imaging (MRI) to detect and follow transplanted stem cells over a period of 28 days in mice using an established myocardial infarction model. Pluripotent mouse embryonic stem (mES) cells were expanded and induced to differentiate into beating cardiomyocytes in vitro. The cardiac-differentiated mES cells were then loaded with superparamagnetic fluorescent microspheres (1.63 μm in diameter) and transplanted into ischemic myocardium immediately following ligation and subsequent reperfusion of the left anterior descending coronary artery. To identify the transplanted stem cells in vivo, MRI was performed using a Varian Inova 4.7 Tesla scanner. Our results show that (a) the cardiac-differentiated mES were effectively loaded with superparamagnetic microspheres in vitro, (b) the microsphere-loaded mES cells continued to beat in culture prior to transplantation, (c) the transplanted mES cells were readily detected in the heart in vivo using noninvasive MRI techniques, (d) the transplanted stem cells were detected in ischemic myocardium for the entire 28-day duration of the study as confirmed by MRI and post-mortem histological analyses, and (e) concurrent functional MRI indicated typical loss of cardiac function, although significant amelioration of remodeling was noted after 28 days in hearts that received transplanted stem cells. These results demonstrate that it is feasible to simultaneously track transplanted stem cells and monitor cardiac function in vivo over an extended period using noninvasive MRI techniques.</jats:p>\n <jats:p>Disclosure of potential conflicts of interest is found at the end of this article.</jats:p>",-0.004497
4,"<jats:title>Abstract</jats:title>\n <jats:p>For the regenerative therapy of osteochondral defects – deep lesions of the articular cartilage in which the underlying bone tissue is already affected too – special implant materials and scaffolds are needed. In this study, two new approaches will be presented, leading to biphasic, but monolithic scaffold materials. Both consist of a mineralised layer for filling of the bony part of the defect and a non-mineralised one for the chondral part. Due to the preparation methods, both layers are fused together to give a unified whole without need of any artificial joining. The resulting materials, either based on collagen, hyaluronic acid and hydroxyapatite or calcium alginate gels and hydroxyapatite, seem to be suitable scaffolds for cultivating chondrocytes and osteoblasts – and therefore can act as matrices for tissue engineering of osteochondral grafts.</jats:p>",-0.004373
...,...,...
80276,"<jats:p>The nymphal instars I and III - V of Sigara (Tropocorixa) denseconscripta (Breddin, 1897) are figured and described in detail, for the first time, with emphasis on morphometry and chaetotaxy of selected structures. The useful characters to identify the nymphal instars and the nymphs of the species of Sigara are provided.</jats:p>",-0.100000
80277,"<jats:p> The possible effects of a wide range of sociodemographic and environmental factors on the incidence and distribution of petechiae were investigated in 485 sudden infant death syndrome (SIDS) cases from the New Zealand Cot Death Study. The number (nil, few, many) of macroscopic petechial hemorrhages in the visceral pleura, capsule of thymus, and epicardium was recorded in 458 of 474 autopsied SIDS cases. Other information was obtained from parental interview and obstetric records. Univariate analysis showed highly significant relationships ( P ≤ 0.005) between the frequency of petechiae at one or more sites and socioeconomic status, parity, breast feeding, age at death, time of death, sleep position, and head covering at death and lesser but significant relationships ( P ≤ 0.05) with Maori ethnicity, birth weight, gestation, pacifier use, and bed sharing. After multivariate analysis, significant associations remained between increased frequencies of thymic petechiae and parity (P = 0.0001), age at death (P = 0.0003), Maori ethnicity (P = 0.0019), pacifier use (P = 0.0001), and head covering at death (P = 0.0032); between increased frequencies of epicardial petechiae and head covering at death (P = 0.008) and an estimated time of death between 00:00 and 05:59 h ( P = 0.056); and between increased frequencies of pleural petechiae and maternal smoking ( P = 0.058) and parity ( P = 0.022). There was a decreased frequency of pleural petechiae in infants placed prone for their final sleep ( P = 0.058). The distribution and frequency of petechiae are affected by environmental factors, including known risk factors for SIDS, but these factors occur inconsistently across the three sites. The findings imply differences in the pathogenesis at each site but do not provide consistent support for previous theories of causation of petechiae. </jats:p>",-0.007576
80278,"<jats:p>After electroconvulsive therapy (ECT), many patients experience a decrement in their mnestic capacity. We studied episodic memory in eight severely depressed patients treated with a course of right-unilateral ECT. For this purpose, a testing instrument was constructed by the authors. It was made of paper cards that held four pieces of information, namely a word, a number, a figure, and the color of the card. One of the cards was presented to the patients and the respective information was asked for on the subsequent day. Patients were tested every morning during the first two weeks of the ECT course. About half of the responses were correct. Patients did best in recalling the color; they did worst in recalling the number. Seven of the patients showed verbal perseverations. This is in accordance with the literature on perseveration in patients with neurologic deficits, especially in proactive-inhibitory tasks. Perseveration may be attributed to a deficit in selective attention, producing an arousal of irrelevant cues.</jats:p>",-0.040000
80279,"<jats:p>This article looks at the issue — largely neglected in the transition literature — of the relative weights of the privatized sector and the generic private sector (of de novo private firms) in the emerging private sector of post-communist economies in transition. The present writer posits that the relative weight of each in the aggregate share of a private sector (generally expanding over time as transition progresses) strongly influences economic performance, both during correctional recession and during recovery and expansion period. Another, interrelated issue considered here is the interaction between the evolving institutional framework and the expansion of the generic private sector, that is the most dynamic one in the transition economy. It is true that the interaction between institutions and performance has been a staple of a very large number of books, articles, and papers.</jats:p>\n <jats:p>However, this article concentrates on one component of a private sector only, that is the generic private sector. But at the same time it looks beyond the ‘Holy Trinity’ of transition (stabilization, liberalization, and privatization) towards a wider institutional framework of political liberty, law and order. The foregoing wider framework, and the emerging general trust, matters as much — if not more — for the present writer as the standard transition program.</jats:p>\n <jats:p>It is the relative dynamics of both components of the private sector, affected by both standard transition programs and the above-mentioned wider institutional framework, that is of primary importance for the economic performance in post-communist transition. In the last part of the article I will try also to answer, tentatively, the question under which circumstances the wider institutional framework may emerge in the transition process.</jats:p>",0.100000


We detect that the data have a format where like typical html code we have

< lorem ipsum >Lorem Ipsum < /lorem ipsum >.

So we clear all the < > while keeping the whitespace between words and remove the word 'Abstact' that some rows have as a first word.

In [6]:
df['abstract'] = df['abstract'].apply(lambda x: re.sub('<.*?>', '', x).replace('Abstract', '').strip())
df['abstract'].head(2)

0                                     The photolysis of dilute solutions of octacyclosulphur or hexacyclosulphur in n-hexane with 253.6 nm UV radiation produces S and possibly S2. The ‘ring-opening’ yields of these sulphur molecules range from 0.2 to 0.7. When the hydrogen end-capped polyyne C10H2 is irradiated in n-hexane, it transforms into unidentified products with a quantum yield of 3×10−5. When octacyclosulphur is added to the solution, the yield rises to 7×10−3. The putative sulphur-bearing product(s) could not be identified. It is suggested that sulphur-bearing molecules might be formed in astronomical settings by reactions of carbon molecules having triple or double C—C bonds with photolytically produced S and/or S2.
1    Examples are given of unforeseen damage or difficulties arising during or after construction of buildings, wharves, foundations, tunnels, in-ground storage, pipe lines, and embankments. The underlying potentially latent mechanisms of groundwater are classifie

In [7]:
df.to_csv('data/data.csv')

In [8]:
df_sup = pd.read_csv('data/data.csv')
df_sup.drop(columns='Unnamed: 0', inplace=True)
df_sup

Unnamed: 0,abstract,cdindex
0,"The photolysis of dilute solutions of octacyclosulphur or hexacyclosulphur in n-hexane with 253.6 nm UV radiation produces S and possibly S2. The ‘ring-opening’ yields of these sulphur molecules range from 0.2 to 0.7. When the hydrogen end-capped polyyne C10H2 is irradiated in n-hexane, it transforms into unidentified products with a quantum yield of 3×10−5. When octacyclosulphur is added to the solution, the yield rises to 7×10−3. The putative sulphur-bearing product(s) could not be identified. It is suggested that sulphur-bearing molecules might be formed in astronomical settings by reactions of carbon molecules having triple or double C—C bonds with photolytically produced S and/or S2.",0.000000
1,"Examples are given of unforeseen damage or difficulties arising during or after construction of buildings, wharves, foundations, tunnels, in-ground storage, pipe lines, and embankments. The underlying potentially latent mechanisms of groundwater are classified into four groups, namely water pressure changes, physical changes, erosion, and affects of earthworks, in order to assist early identification of some of the types of change in ground properties which may be induced by site operations and may lead to damage. It is concluded that otherwise unforeseen damage is more likely to be averted by a greater degree of investigation during construction into the consequences of the construction sequences, method and plant used.",0.000000
2,"Acute hypoxia causes pulmonary vasoconstriction and coronary vasodilation. The divergent effects of hypoxia on pulmonary and coronary vascular smooth muscle cells suggest that the mechanisms involved in oxygen sensing and downstream effectors are different in these two types of cells. Since production of reactive oxygen species (ROS) is regulated by oxygen tension, ROS have been hypothesized to be a signaling mechanism in hypoxia-induced pulmonary vasoconstriction and vascular remodeling. Furthermore, an increased ROS production is also implicated in arteriosclerosis. In this study, we determined and compared the effects of hypoxia on ROS levels in human pulmonary arterial smooth muscle cells (PASMC) and coronary arterial smooth muscle cells (CASMC). Our results indicated that acute exposure to hypoxia (Po2 = 25–30 mmHg for 5–10 min) significantly and rapidly decreased ROS levels in both PASMC and CASMC. However, chronic exposure to hypoxia (Po2 = 30 mmHg for 48 h) markedly increased ROS levels in PASMC, but decreased ROS production in CASMC. Furthermore, chronic treatment with endothelin-1, a potent vasoconstrictor and mitogen, caused a significant increase in ROS production in both PASMC and CASMC. The inhibitory effect of acute hypoxia on ROS production in PASMC was also accelerated in cells chronically treated with endothelin-1. While the decreased ROS in PASMC and CASMC after acute exposure to hypoxia may reflect the lower level of oxygen substrate available for ROS production, the increased ROS production in PASMC during chronic hypoxia may reflect a pathophysiological response unique to the pulmonary vasculature that contributes to the development of pulmonary vascular remodeling in patients with hypoxia-associated pulmonary hypertension.",-0.002938
3,"Despite rapid advances in the stem cell field, the ability to identify and track transplanted or migrating stem cells in vivo is limited. To overcome this limitation, we used magnetic resonance imaging (MRI) to detect and follow transplanted stem cells over a period of 28 days in mice using an established myocardial infarction model. Pluripotent mouse embryonic stem (mES) cells were expanded and induced to differentiate into beating cardiomyocytes in vitro. The cardiac-differentiated mES cells were then loaded with superparamagnetic fluorescent microspheres (1.63 μm in diameter) and transplanted into ischemic myocardium immediately following ligation and subsequent reperfusion of the left anterior descending coronary artery. To identify the transplanted stem cells in vivo, MRI was performed using a Varian Inova 4.7 Tesla scanner. Our results show that (a) the cardiac-differentiated mES were effectively loaded with superparamagnetic microspheres in vitro, (b) the microsphere-loaded mES cells continued to beat in culture prior to transplantation, (c) the transplanted mES cells were readily detected in the heart in vivo using noninvasive MRI techniques, (d) the transplanted stem cells were detected in ischemic myocardium for the entire 28-day duration of the study as confirmed by MRI and post-mortem histological analyses, and (e) concurrent functional MRI indicated typical loss of cardiac function, although significant amelioration of remodeling was noted after 28 days in hearts that received transplanted stem cells. These results demonstrate that it is feasible to simultaneously track transplanted stem cells and monitor cardiac function in vivo over an extended period using noninvasive MRI techniques.\n Disclosure of potential conflicts of interest is found at the end of this article.",-0.004497
4,"For the regenerative therapy of osteochondral defects – deep lesions of the articular cartilage in which the underlying bone tissue is already affected too – special implant materials and scaffolds are needed. In this study, two new approaches will be presented, leading to biphasic, but monolithic scaffold materials. Both consist of a mineralised layer for filling of the bony part of the defect and a non-mineralised one for the chondral part. Due to the preparation methods, both layers are fused together to give a unified whole without need of any artificial joining. The resulting materials, either based on collagen, hyaluronic acid and hydroxyapatite or calcium alginate gels and hydroxyapatite, seem to be suitable scaffolds for cultivating chondrocytes and osteoblasts – and therefore can act as matrices for tissue engineering of osteochondral grafts.",-0.004373
...,...,...
80276,"The nymphal instars I and III - V of Sigara (Tropocorixa) denseconscripta (Breddin, 1897) are figured and described in detail, for the first time, with emphasis on morphometry and chaetotaxy of selected structures. The useful characters to identify the nymphal instars and the nymphs of the species of Sigara are provided.",-0.100000
80277,"The possible effects of a wide range of sociodemographic and environmental factors on the incidence and distribution of petechiae were investigated in 485 sudden infant death syndrome (SIDS) cases from the New Zealand Cot Death Study. The number (nil, few, many) of macroscopic petechial hemorrhages in the visceral pleura, capsule of thymus, and epicardium was recorded in 458 of 474 autopsied SIDS cases. Other information was obtained from parental interview and obstetric records. Univariate analysis showed highly significant relationships ( P ≤ 0.005) between the frequency of petechiae at one or more sites and socioeconomic status, parity, breast feeding, age at death, time of death, sleep position, and head covering at death and lesser but significant relationships ( P ≤ 0.05) with Maori ethnicity, birth weight, gestation, pacifier use, and bed sharing. After multivariate analysis, significant associations remained between increased frequencies of thymic petechiae and parity (P = 0.0001), age at death (P = 0.0003), Maori ethnicity (P = 0.0019), pacifier use (P = 0.0001), and head covering at death (P = 0.0032); between increased frequencies of epicardial petechiae and head covering at death (P = 0.008) and an estimated time of death between 00:00 and 05:59 h ( P = 0.056); and between increased frequencies of pleural petechiae and maternal smoking ( P = 0.058) and parity ( P = 0.022). There was a decreased frequency of pleural petechiae in infants placed prone for their final sleep ( P = 0.058). The distribution and frequency of petechiae are affected by environmental factors, including known risk factors for SIDS, but these factors occur inconsistently across the three sites. The findings imply differences in the pathogenesis at each site but do not provide consistent support for previous theories of causation of petechiae.",-0.007576
80278,"After electroconvulsive therapy (ECT), many patients experience a decrement in their mnestic capacity. We studied episodic memory in eight severely depressed patients treated with a course of right-unilateral ECT. For this purpose, a testing instrument was constructed by the authors. It was made of paper cards that held four pieces of information, namely a word, a number, a figure, and the color of the card. One of the cards was presented to the patients and the respective information was asked for on the subsequent day. Patients were tested every morning during the first two weeks of the ECT course. About half of the responses were correct. Patients did best in recalling the color; they did worst in recalling the number. Seven of the patients showed verbal perseverations. This is in accordance with the literature on perseveration in patients with neurologic deficits, especially in proactive-inhibitory tasks. Perseveration may be attributed to a deficit in selective attention, producing an arousal of irrelevant cues.",-0.040000
80279,"This article looks at the issue — largely neglected in the transition literature — of the relative weights of the privatized sector and the generic private sector (of de novo private firms) in the emerging private sector of post-communist economies in transition. The present writer posits that the relative weight of each in the aggregate share of a private sector (generally expanding over time as transition progresses) strongly influences economic performance, both during correctional recession and during recovery and expansion period. Another, interrelated issue considered here is the interaction between the evolving institutional framework and the expansion of the generic private sector, that is the most dynamic one in the transition economy. It is true that the interaction between institutions and performance has been a staple of a very large number of books, articles, and papers.\n However, this article concentrates on one component of a private sector only, that is the generic private sector. But at the same time it looks beyond the ‘Holy Trinity’ of transition (stabilization, liberalization, and privatization) towards a wider institutional framework of political liberty, law and order. The foregoing wider framework, and the emerging general trust, matters as much — if not more — for the present writer as the standard transition program.\n It is the relative dynamics of both components of the private sector, affected by both standard transition programs and the above-mentioned wider institutional framework, that is of primary importance for the economic performance in post-communist transition. In the last part of the article I will try also to answer, tentatively, the question under which circumstances the wider institutional framework may emerge in the transition process.",0.100000


In [9]:
df_sup['abstract'] = df_sup['abstract'].apply(lambda x: re.sub(r'<.*?>', '', str(x)).replace("abstract", "").strip().lower())
df_sup

Unnamed: 0,abstract,cdindex
0,"the photolysis of dilute solutions of octacyclosulphur or hexacyclosulphur in n-hexane with 253.6 nm uv radiation produces s and possibly s2. the ‘ring-opening’ yields of these sulphur molecules range from 0.2 to 0.7. when the hydrogen end-capped polyyne c10h2 is irradiated in n-hexane, it transforms into unidentified products with a quantum yield of 3×10−5. when octacyclosulphur is added to the solution, the yield rises to 7×10−3. the putative sulphur-bearing product(s) could not be identified. it is suggested that sulphur-bearing molecules might be formed in astronomical settings by reactions of carbon molecules having triple or double c—c bonds with photolytically produced s and/or s2.",0.000000
1,"examples are given of unforeseen damage or difficulties arising during or after construction of buildings, wharves, foundations, tunnels, in-ground storage, pipe lines, and embankments. the underlying potentially latent mechanisms of groundwater are classified into four groups, namely water pressure changes, physical changes, erosion, and affects of earthworks, in order to assist early identification of some of the types of change in ground properties which may be induced by site operations and may lead to damage. it is concluded that otherwise unforeseen damage is more likely to be averted by a greater degree of investigation during construction into the consequences of the construction sequences, method and plant used.",0.000000
2,"acute hypoxia causes pulmonary vasoconstriction and coronary vasodilation. the divergent effects of hypoxia on pulmonary and coronary vascular smooth muscle cells suggest that the mechanisms involved in oxygen sensing and downstream effectors are different in these two types of cells. since production of reactive oxygen species (ros) is regulated by oxygen tension, ros have been hypothesized to be a signaling mechanism in hypoxia-induced pulmonary vasoconstriction and vascular remodeling. furthermore, an increased ros production is also implicated in arteriosclerosis. in this study, we determined and compared the effects of hypoxia on ros levels in human pulmonary arterial smooth muscle cells (pasmc) and coronary arterial smooth muscle cells (casmc). our results indicated that acute exposure to hypoxia (po2 = 25–30 mmhg for 5–10 min) significantly and rapidly decreased ros levels in both pasmc and casmc. however, chronic exposure to hypoxia (po2 = 30 mmhg for 48 h) markedly increased ros levels in pasmc, but decreased ros production in casmc. furthermore, chronic treatment with endothelin-1, a potent vasoconstrictor and mitogen, caused a significant increase in ros production in both pasmc and casmc. the inhibitory effect of acute hypoxia on ros production in pasmc was also accelerated in cells chronically treated with endothelin-1. while the decreased ros in pasmc and casmc after acute exposure to hypoxia may reflect the lower level of oxygen substrate available for ros production, the increased ros production in pasmc during chronic hypoxia may reflect a pathophysiological response unique to the pulmonary vasculature that contributes to the development of pulmonary vascular remodeling in patients with hypoxia-associated pulmonary hypertension.",-0.002938
3,"despite rapid advances in the stem cell field, the ability to identify and track transplanted or migrating stem cells in vivo is limited. to overcome this limitation, we used magnetic resonance imaging (mri) to detect and follow transplanted stem cells over a period of 28 days in mice using an established myocardial infarction model. pluripotent mouse embryonic stem (mes) cells were expanded and induced to differentiate into beating cardiomyocytes in vitro. the cardiac-differentiated mes cells were then loaded with superparamagnetic fluorescent microspheres (1.63 μm in diameter) and transplanted into ischemic myocardium immediately following ligation and subsequent reperfusion of the left anterior descending coronary artery. to identify the transplanted stem cells in vivo, mri was performed using a varian inova 4.7 tesla scanner. our results show that (a) the cardiac-differentiated mes were effectively loaded with superparamagnetic microspheres in vitro, (b) the microsphere-loaded mes cells continued to beat in culture prior to transplantation, (c) the transplanted mes cells were readily detected in the heart in vivo using noninvasive mri techniques, (d) the transplanted stem cells were detected in ischemic myocardium for the entire 28-day duration of the study as confirmed by mri and post-mortem histological analyses, and (e) concurrent functional mri indicated typical loss of cardiac function, although significant amelioration of remodeling was noted after 28 days in hearts that received transplanted stem cells. these results demonstrate that it is feasible to simultaneously track transplanted stem cells and monitor cardiac function in vivo over an extended period using noninvasive mri techniques.\n disclosure of potential conflicts of interest is found at the end of this article.",-0.004497
4,"for the regenerative therapy of osteochondral defects – deep lesions of the articular cartilage in which the underlying bone tissue is already affected too – special implant materials and scaffolds are needed. in this study, two new approaches will be presented, leading to biphasic, but monolithic scaffold materials. both consist of a mineralised layer for filling of the bony part of the defect and a non-mineralised one for the chondral part. due to the preparation methods, both layers are fused together to give a unified whole without need of any artificial joining. the resulting materials, either based on collagen, hyaluronic acid and hydroxyapatite or calcium alginate gels and hydroxyapatite, seem to be suitable scaffolds for cultivating chondrocytes and osteoblasts – and therefore can act as matrices for tissue engineering of osteochondral grafts.",-0.004373
...,...,...
80276,"the nymphal instars i and iii - v of sigara (tropocorixa) denseconscripta (breddin, 1897) are figured and described in detail, for the first time, with emphasis on morphometry and chaetotaxy of selected structures. the useful characters to identify the nymphal instars and the nymphs of the species of sigara are provided.",-0.100000
80277,"the possible effects of a wide range of sociodemographic and environmental factors on the incidence and distribution of petechiae were investigated in 485 sudden infant death syndrome (sids) cases from the new zealand cot death study. the number (nil, few, many) of macroscopic petechial hemorrhages in the visceral pleura, capsule of thymus, and epicardium was recorded in 458 of 474 autopsied sids cases. other information was obtained from parental interview and obstetric records. univariate analysis showed highly significant relationships ( p ≤ 0.005) between the frequency of petechiae at one or more sites and socioeconomic status, parity, breast feeding, age at death, time of death, sleep position, and head covering at death and lesser but significant relationships ( p ≤ 0.05) with maori ethnicity, birth weight, gestation, pacifier use, and bed sharing. after multivariate analysis, significant associations remained between increased frequencies of thymic petechiae and parity (p = 0.0001), age at death (p = 0.0003), maori ethnicity (p = 0.0019), pacifier use (p = 0.0001), and head covering at death (p = 0.0032); between increased frequencies of epicardial petechiae and head covering at death (p = 0.008) and an estimated time of death between 00:00 and 05:59 h ( p = 0.056); and between increased frequencies of pleural petechiae and maternal smoking ( p = 0.058) and parity ( p = 0.022). there was a decreased frequency of pleural petechiae in infants placed prone for their final sleep ( p = 0.058). the distribution and frequency of petechiae are affected by environmental factors, including known risk factors for sids, but these factors occur inconsistently across the three sites. the findings imply differences in the pathogenesis at each site but do not provide consistent support for previous theories of causation of petechiae.",-0.007576
80278,"after electroconvulsive therapy (ect), many patients experience a decrement in their mnestic capacity. we studied episodic memory in eight severely depressed patients treated with a course of right-unilateral ect. for this purpose, a testing instrument was constructed by the authors. it was made of paper cards that held four pieces of information, namely a word, a number, a figure, and the color of the card. one of the cards was presented to the patients and the respective information was asked for on the subsequent day. patients were tested every morning during the first two weeks of the ect course. about half of the responses were correct. patients did best in recalling the color; they did worst in recalling the number. seven of the patients showed verbal perseverations. this is in accordance with the literature on perseveration in patients with neurologic deficits, especially in proactive-inhibitory tasks. perseveration may be attributed to a deficit in selective attention, producing an arousal of irrelevant cues.",-0.040000
80279,"this article looks at the issue — largely neglected in the transition literature — of the relative weights of the privatized sector and the generic private sector (of de novo private firms) in the emerging private sector of post-communist economies in transition. the present writer posits that the relative weight of each in the aggregate share of a private sector (generally expanding over time as transition progresses) strongly influences economic performance, both during correctional recession and during recovery and expansion period. another, interrelated issue considered here is the interaction between the evolving institutional framework and the expansion of the generic private sector, that is the most dynamic one in the transition economy. it is true that the interaction between institutions and performance has been a staple of a very large number of books, articles, and papers.\n however, this article concentrates on one component of a private sector only, that is the generic private sector. but at the same time it looks beyond the ‘holy trinity’ of transition (stabilization, liberalization, and privatization) towards a wider institutional framework of political liberty, law and order. the foregoing wider framework, and the emerging general trust, matters as much — if not more — for the present writer as the standard transition program.\n it is the relative dynamics of both components of the private sector, affected by both standard transition programs and the above-mentioned wider institutional framework, that is of primary importance for the economic performance in post-communist transition. in the last part of the article i will try also to answer, tentatively, the question under which circumstances the wider institutional framework may emerge in the transition process.",0.100000


In [10]:
df_un = pd.read_sql_query('SELECT title, abstract FROM works LEFT JOIN cdindex ON works.doi = cdindex.doi WHERE cdindex.doi IS NULL;', connection)
df_un.head(2)

Unnamed: 0,title,abstract
0,Access Network Selection in a 4G Environment,"<jats:p>4G networks provide bandwidth of up to 1Gbps for a Mobile Node (MN) that is moving at pedestrian speed. On the other hand, it also supports mobile nodes that can move at a speed of 250 km/hr with bandwidths value of 100 Mbps. This sets the premise of a network that supports diverse needs. This goal will be harder to achieve if Network Selection Problems (NSP) are not addressed comprehensively. NSP refers to the selection of target access network selection from a collection of Candidate Networks (CNs) when MNs are moving from one access network into another. The most logical way of achieving this is to select the “best” network. This translates to identifying performance values of the CNs. The analysis in this chapter shows clearly that access network selection done based on limited criteria is detrimental in achieving optimum communication. Instead, this chapter suggests a framework that would be complementary to a 4G network.</jats:p>"
1,Environmental Disclosures and Impression Management,"<jats:p>A significant stream of social and environmental accounting research investigates the relationship between a corporation’s self-reported disclosures of its own social responsibility and environmental activities and third-party evaluations of that corporation’s actual social responsibility and environmental performance. Generally, researchers have utilized one of two theories to motivate and test this relationship. One theory—signaling or voluntary disclosure theory—argues that corporations with superior corporate social responsibility or environmental performance use disclosure to signal to interested parties a level of performance that poorer corporate performers cannot disclose. A second theory—legitimacy or impression management theory—argues that corporations use disclosures to manage impressions, often masking their actual social responsibility and environmental performance. In this chapter, the authors seek to comment on how DICTION has been and can be utilized to advance this stream of social and environmental accounting research. </jats:p>"


In [11]:
df_un.to_csv('data/unsupervised_data.csv')

In order to evaluate both models on exactly the same data we create a subset of our DataFrame that is the one tenth of our total data. 

Random state ensures reproducability when rerunning the program.

In [12]:
test_df = pd.DataFrame(df_sup.sample(frac=0.1, random_state=42))
test_df.reset_index(inplace=True)
test_df.head(3)

Unnamed: 0,index,abstract,cdindex
0,9272,"it has been successfully demonstrated that ceramic materials can be joined in the green\nstate without a second phase by using low pressure injection molded parts. the investigation of the\njoining interface revealed that a high quality interface can be achieved by carefully adjusting the\ndifferent manufacturing steps. the use of monomodal particle size distribution in the used powder\nct3000sg is inferior to a broader particle size distribution obtained by replacing 33% of the finer\nalumina powder by coarser ct1200sg. in this way the dewaxing process is significantly improved\nwhen the wall thickness of the part exceeds 3 mm. the investigation of the mechanical properties of\nthe joined and sintered parts revealed, that the bending strength of the joined specimens achieved\nabout 80 % of the unjoined, monolithic specimens.",0.0
1,71287,"purpose\n in aspects, 10 brain regions are scored visually for presence of acute ischemic stroke damage. we evaluated automated aspects in comparison to expert readers.\n \n methods\n consecutive, baseline non-contrast ct-scans (5-mm slice thickness) from the prospective mr clean trial (n = 459, mr clean netherlands trial registry number: ntr1804) were evaluated. a two-observer consensus for aspects regions (normal/abnormal) was used as reference standard for training and testing (0.2/0.8 division). two other observers provided individual aspects-region scores. the automated aspects software was applied. a region score specificity of ≥ 90% was used to determine the software threshold for detection of an affected region based on relative density difference between affected and contralateral region. sensitivity, specificity, and receiver-operating characteristic curves were calculated. additionally, we assessed intraclass correlation coefficients (iccs) for automated aspects and observers in comparison to the reference standard in the test set.\n \n results\n in the training set (n = 104), with software thresholds for a specificity of ≥ 90%, we found a sensitivity of 33–49% and an area under the curve (auc) of 0.741–0.785 for detection of an affected aspects region. in the test set (n = 355), the results for the found software thresholds were 89–89% (specificity), 41–57% (sensitivity), and 0.750–0.795 (auc). comparison of automated aspects with the reference standard resulted in an icc of 0.526. comparison of observers with the reference standard resulted in an icc of 0.383–0.464.\n \n conclusion\n the performance of automated aspects is comparable to expert readers and could support readers in the detection of early ischemic changes.",0.0
2,18846,"although sulfonylurea agents have been used in the clinical management of type ii diabetes (non-insulin-dependent diabetes mellitus, niddm) for over two decades, the mechanisms responsible for their hypoglycemie action remain controversial. we have quantitated glycemie control, endogenous insulin secretion in response to mixed meals, adipocyte insulin binding, insulin-mediated peripheral glucose disposal, and basal hepatic glucose output in 17 type ii diabetic subjects before and after 3 mo of therapy with the second-generation, sulfonylurea compound glyburide in an attempt to identify the factors responsible for the clinical response to the drug. in addition, 9 subjects were treated for an additional 15 mo to see if the response to the drug changed with time.\n the mean fasting serum glucose level fell from an initial value of 264 ± 17 mg/dl to 178 ± 16 mg/dl after 3 mo of drug therapy. endogenous insulin secretion increased in all subjects, but the increase was most marked in those subjects who continued to exhibit fasting hyperglycemie (fasting serum glucose &amp;gt; 175 mg/dl) after 3 mo of therapy. adipocyte insulin binding was unchanged after 3 mo of therapy, while the maximal rate of peripheral glucose disposal was increased by 23%, indicating enhancement of peripheral insulin action at a postreceptor site(s). basal hepatic glucose output showed a significant correlation with the fasting serum glucose level both before and after therapy (r = 0.86, p &amp;lt; 0.001) and fell from 141 ±12 mg/m2/min before therapy to 107 ± 11 mg/m2/min after 3 mo of therapy. a significant correlation was also found between the decrease in the fasting glucose level and both the reduction in basal hepatic glucose output (r = 0.81, p &amp;lt; 0.001) and the enhancement of postreceptor function in peripheral tissues (r = 0.68, p &amp;lt; 0.005). after 18 mo of therapy, those subjects exhibiting an initial good response to the drug demonstrated a slight decrease in endogenous insulin secretion compared with the levels seen at 3 mo, adipocyte insulin binding had increased to the normal range, postreceptor function was further enhanced, and basal hepatic glucose output remained unchanged from the levels observed after 3 mo of therapy.\n we conclude that (1) glyburide therapy increases endogenous insulin secretion, increases adipocyte insulin binding after 18, but not 3, mo of therapy, enhances peripheral insulin action by acting primarily at a post-receptor site, and reduces basal hepatic glucose output; (2) the increase in postreceptor function and the reduction of basal hepatic glucose output appear to be the crucial determinants of the clinical response to the sulfonylurea agent; and (3) the response pattern to sulfonylurea compounds in terms of these various parameters can vary as a function of the duration of treatment.",1.0


We create our training and test datasets as a scikit-learn Bunch object

In [13]:
dataset = Bunch(
    data=df_sup['abstract'].values,
    target=df_sup['cdindex'].values,
    target_names=['cdindex'],
    DESCR='Scikit-learn dataset from dataframe'
)

In [14]:
test_dataset = Bunch(
    data=test_df['abstract'].values,
    target=test_df['cdindex'].values,
    target_names=['cdindex'],
    DESCR='Scikit-learn dataset from dataframe'
)

We shuffle our training data to prevent overfitting

In [None]:
dataset.data, dataset.target = shuffle(dataset.data, dataset.target, random_state=42)

For the non Neural Network approach I decided to use a booster. 

Specifically, an XGBooster. We create a Pipeline with a CountVectorizer, a tf-idf transformer and the XGBooster as the final layer to make the predictions

In [15]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', XGBRegressor()),
                    ])

We fit the model to our data.

In [17]:
text_clf.fit(dataset.data, dataset.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=None,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=None, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_estimators=100,
                   

In order to evaluate our model we use the following metrics:

1. Mean-Squared-Error: 

    *Average of the squared differences between the predicted values and the actual values. (Penalizes larger errors severely more)*

    $MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$ , 


2. Mean Absolute Error: 

    *Average of the absolute differences between the predicted values and the actual values.*

    $MAE = \frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y}_i|$

3. Root Mean Squared Error: 

    *Square root of the MSE. It provides a more interpretable measure of the magnitude of the error, as it's expressed in the same units as the target variable.*

    $RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$

4. R Squared: 

    *Measures the proportion of the variance in the target variable that is explained by the model.*

    $R^2 = 1 - \frac{SS_{res}}{SS_{tot}}$

5. Adjusted R Squared: 

    *Takes into account the number of predictors in the model. It provides a more robust measure of the goodness of fit, as it adjusts for the number of predictors.*

    $Adj. R^2 = 1 - \frac{(1-R^2)(n-1)}{n-p-1}$

#### Where:

*$n$: Number of observations*

*$y_i$: actual value of endogenous variable* 

*$\hat{y}_i$: predicted value for the endogenous variable*

*$SS_{res} = \sum_{i=1}^{n}(y_i - \hat{y}_i)^2$*

*$SS_{tot} = \sum_{i=1}^{n}(y_i - \bar{y})^2$*


In [18]:
predicted = text_clf.predict(test_dataset.data)
r2 = r2_score(y_true=test_dataset.target, y_pred=predicted)
mae = mean_absolute_error(y_true=test_dataset.target, y_pred=predicted)
mse = mean_squared_error(y_true=test_dataset.target, y_pred=predicted)
adjusted_r2 = 1 - (1 - r2) * (len(test_df) - 1) / (len(test_df) - 1 - 1)
rmse = np.sqrt(mse)
print(f"MAE: {mae} \nMSE: {mse}\nR-squared: {r2}\nRoot Mean Squared Error {rmse}\nAdjusted R-squared: {adjusted_r2}")

MAE: 0.26523003279745855 
MSE: 0.11778570133007003
R-squared: 0.3435726543048321
Root Mean Squared Error 0.3431992152235637
Adjusted R-squared: 0.34349086669634776


Due to the complexity of the problem and the simplicity of our model, we get some alright results. However, we want to take it a step further and try to optimize our current model and particularly the booster.

Hence we will perform a grid search for the :
1. Learning Rate: Step size of the gradient descent algorithm

2. N Estimators: Number of trees in the model

3. Max Depth: Maximum depth of each tree

4. Reg Alpha: L1 (Lasso) regularization to prevent overfitting

5. Reg Lambda: L2 (Ridge) regularization to prevent overfitting

In [None]:
parameters = {
    'clf__learning_rate': [0.01, 0.1],
    'clf__n_estimators': [100, 500, 1000],
    'clf__max_depth': [3, 5, 7],
    'clf__reg_alpha': [0.1, 0.2],
    'clf__reg_lambda': [0.1, 0.2]
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1, cv=2)

In [None]:
gs_clf = gs_clf.fit(dataset.data, dataset.target)

We extract the best values for these hyperparameters

In [None]:
gs_clf.best_params_

{'clf__learning_rate': 0.1,
 'clf__max_depth': 3,
 'clf__n_estimators': 1000,
 'clf__reg_alpha': 0.2,
 'clf__reg_lambda': 0.2}

Create a new Pipeline with the hyperparameters set to these values.

In [19]:
text_clf_tuned = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=1000, reg_alpha=0.2, reg_lambda=0.2)),
                    ])

We fit the model to the data

In [20]:
text_clf_tuned.fit(dataset.data, dataset.target)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 XGBRegressor(base_score=None, booster=None, callbacks=None,
                              colsample_bylevel=None, colsample_bynode=None,
                              colsample_bytree=None, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              feature_types=None, gamma=None, gpu_id=None,
                              grow_policy=None, importance_type=None,
                              interaction_constraints=None, learning_rate=0.1,
                              max_bin=None, max_cat_threshold=None,
                              max_cat_to_onehot=None, max_delta_step=None,
                              max_depth=3, max_leaves=None,
                              min_child_weight=None, missing=nan,
                              monotone_constraints=None, n_estimators=1000,
                      

Evaluate the model.

In [21]:
predicted = text_clf_tuned.predict(test_dataset.data)
r2 = r2_score(y_true=test_dataset.target, y_pred=predicted)
mae = mean_absolute_error(y_true=test_dataset.target, y_pred=predicted)
mse = mean_squared_error(y_true=test_dataset.target, y_pred=predicted)
adjusted_r2 = 1 - (1 - r2) * (len(test_df) - 1) / (len(test_df) - 1 - 1)
rmse = np.sqrt(mse)
print(f"MAE: {mae} \nMSE: {mse}\nR-squared: {r2}\nRoot Mean Squared Error {rmse}\nAdjusted R-squared: {adjusted_r2}")

MAE: 0.2725061265042144 
MSE: 0.12209454814552814
R-squared: 0.31955917180111026
Root Mean Squared Error 0.34942030299558746
Adjusted R-squared: 0.3194743922311877


## Neural Network appoach with BERT (Bidirectional Encoder Representation from Transformers).

*The [paper](https://arxiv.org/abs/1810.04805) that inspired this approach*


<p align="center">
    <b>Transformer Architecture</b>
</p>
<p align="center">
    <img src="imgs/transformer_arch.png" alt="Image Description" width="350" height="500">
</p>


BERT makes use of the Transformer architecture, an attention mechanism that learns contextual relations between words (or sub-words) in a text. As opposed to directional models, which read text sequentially, the Transformer encoder reads the entire sequence of words at once, thus allowing the model to learn the context of a word based on its surroundings. BERT makes use of two techniques:
1. Masked Language Model (MLM)

2. Next Sentence Prediction (NSP)

However we will be implementing MLM BERT since we are interested in the contextual relationships between words.
MLM training comprises of replacing 15% of the words in each sentence with a < MASK > token and trying to predict said word based on the context of a sentence. As a result BERT trains on the notion of context. Something to underline is that the BERT loss function considers only the prediction of the masked values and ignores the prediction of non-masked values in order to better enhance BERT’s context awareness.
    
    Input: The < MASK1 > brown fox < MASK2 > over the lazy frog.
    
    Output: < MASK1 > = quick, < MASK2 > = jumped

<p align="center">
    <img src="imgs/bert_input.png">
</p>


Furthermore, since we have some unlabeled data, I decided to split the training process. Firtsly we will fine-tune BERT's weights to the context of our problem by feeding him the unlabeled data (label being the cdindex). This is a common approach that ensures that before we tackle the main problem BERT already has a sense of what's the context of the input, in hopes to make better predictions and speed up the training process. 

<p align="center">
    <img src="imgs/pre-fine.png">
</p>

What prompted me to use BERT for MLM, was to explore whether a context-based-only approach would work for our problem since the authors of the paper frequently underline the importance of context.

In order to further leverage BERT's way of training, I decided to concatenate the titles of the papers to the abstracts since most of the time the titles are indicative of the paper's context.

In [15]:
df_un = pd.read_csv('data/unsupervised_data.csv')
df_un.iloc[:1, 1:]

Unnamed: 0,title,abstract
0,Access Network Selection in a 4G Environment,"<jats:p>4G networks provide bandwidth of up to 1Gbps for a Mobile Node (MN) that is moving at pedestrian speed. On the other hand, it also supports mobile nodes that can move at a speed of 250 km/hr with bandwidths value of 100 Mbps. This sets the premise of a network that supports diverse needs. This goal will be harder to achieve if Network Selection Problems (NSP) are not addressed comprehensively. NSP refers to the selection of target access network selection from a collection of Candidate Networks (CNs) when MNs are moving from one access network into another. The most logical way of achieving this is to select the “best” network. This translates to identifying performance values of the CNs. The analysis in this chapter shows clearly that access network selection done based on limited criteria is detrimental in achieving optimum communication. Instead, this chapter suggests a framework that would be complementary to a 4G network.</jats:p>"


In [16]:
df_un['concatenated'] = df_un['title'] + '. ' + df_un['abstract']
df_un.at[0, 'concatenated']

'Access Network Selection in a 4G Environment. <jats:p>4G networks provide bandwidth of up to 1Gbps for a Mobile Node (MN) that is moving at pedestrian speed. On the other hand, it also supports mobile nodes that can move at a speed of 250 km/hr with bandwidths value of 100 Mbps. This sets the premise of a network that supports diverse needs. This goal will be harder to achieve if Network Selection Problems (NSP) are not addressed comprehensively. NSP refers to the selection of target access network selection from a collection of Candidate Networks (CNs) when MNs are moving from one access network into another. The most logical way of achieving this is to select the “best” network. This translates to identifying performance values of the CNs. The analysis in this chapter shows clearly that access network selection done based on limited criteria is detrimental in achieving optimum communication. Instead, this chapter suggests a framework that would be complementary to a 4G network.</jat

In [17]:
df_un['concatenated'] = df_un['concatenated'].apply(lambda x: re.sub('<.*?>', '', str(x)).strip())
df_un.at[0, 'concatenated']

'Access Network Selection in a 4G Environment. 4G networks provide bandwidth of up to 1Gbps for a Mobile Node (MN) that is moving at pedestrian speed. On the other hand, it also supports mobile nodes that can move at a speed of 250 km/hr with bandwidths value of 100 Mbps. This sets the premise of a network that supports diverse needs. This goal will be harder to achieve if Network Selection Problems (NSP) are not addressed comprehensively. NSP refers to the selection of target access network selection from a collection of Candidate Networks (CNs) when MNs are moving from one access network into another. The most logical way of achieving this is to select the “best” network. This translates to identifying performance values of the CNs. The analysis in this chapter shows clearly that access network selection done based on limited criteria is detrimental in achieving optimum communication. Instead, this chapter suggests a framework that would be complementary to a 4G network.'

We import a BERT tokenizer and an instance of a pre-trained MLM BERT.

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Afterwards we make a list of the *concatenated* column and pass it to the tokenizer in order to create some < MASK > tokens within our training data.

(We pad any sentences that are less than 512 and truncate these that are greater than our threshold)

In [19]:
data_list = df_un['concatenated'].tolist()
data_list[0]

'Access Network Selection in a 4G Environment. 4G networks provide bandwidth of up to 1Gbps for a Mobile Node (MN) that is moving at pedestrian speed. On the other hand, it also supports mobile nodes that can move at a speed of 250 km/hr with bandwidths value of 100 Mbps. This sets the premise of a network that supports diverse needs. This goal will be harder to achieve if Network Selection Problems (NSP) are not addressed comprehensively. NSP refers to the selection of target access network selection from a collection of Candidate Networks (CNs) when MNs are moving from one access network into another. The most logical way of achieving this is to select the “best” network. This translates to identifying performance values of the CNs. The analysis in this chapter shows clearly that access network selection done based on limited criteria is detrimental in achieving optimum communication. Instead, this chapter suggests a framework that would be complementary to a 4G network.'

In [20]:
inputs = tokenizer(data_list, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs

{'input_ids': tensor([[  101,  3229,  2897,  ...,     0,     0,     0],
        [  101,  4483, 19380,  ...,     0,     0,     0],
        [  101,  4106,  1997,  ...,     0,     0,     0],
        ...,
        [  101,  4613, 22334,  ...,     0,     0,     0],
        [  101,  2203, 14573,  ..., 15136,  2509,   102],
        [  101,  4254,  1997,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])}

Next we pass the inputs as the labels since we are only training to bring context awareness to our model, where the input is the label.

In [21]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs

{'input_ids': tensor([[  101,  3229,  2897,  ...,     0,     0,     0],
        [  101,  4483, 19380,  ...,     0,     0,     0],
        [  101,  4106,  1997,  ...,     0,     0,     0],
        ...,
        [  101,  4613, 22334,  ...,     0,     0,     0],
        [  101,  2203, 14573,  ..., 15136,  2509,   102],
        [  101,  4254,  1997,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[  101,  3229,  2897,  ...,     0,     0,     0],
        [  101,  4483, 19380,  ...,     0,     0,     0],
        [  101,  4106, 

Here we create our < MASK > tokens with 15 % probability. However we do not want to take the CLS (101) and SEP (102) tokens into consideration, hence the condition.

In [22]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False,  True,  True,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

Here we retrieve the index possitions of the True values (the words that will be masked) and proceed to apply the < MASK > token by inserting the numeric value of the token, which is 103

In [23]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [24]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

As we are working with our own custom dataset, we have to create a class that inherits form *torch.utils.data.Dataset* in order to train our model. This class must define three methods: 
1. __ init__()
2. __ getitem__()
3. __ len__()

In [25]:
class UnsupervisedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings.input_ids)

We instanciate our dataset and split it into into training and validation subsets using an 80 - 20 ratio.

In [26]:
dataset = UnsupervisedDataset(inputs)

In [27]:
val_ratio = 0.2
train_dataset, val_dataset, _, _ = train_test_split(dataset, range(len(dataset)), test_size=val_ratio, random_state=42)

len(train_dataset), len(val_dataset)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


(9149, 2288)

We also instanciate two *torch.utils.data.DataLoader* objects, one for each subset of our data, The *torch.utils.data.Dataset* retrieves our dataset’s features and labels one sample at a time. While training a model, we typically want to pass samples in “minibatches”, reshuffle the data at every epoch to reduce model overfitting, and use Python’s multiprocessing to speed up data retrieval. DataLoader is an iterable that abstracts this complexity for us in an easy API.

In [28]:
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_dataset, batch_size=8)

We also have some device-agnostic code and put the model on that device.

In [29]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [30]:
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

Instanciate our optimizer

In [31]:
optim = AdamW(model.parameters(), lr=5e-5)



Start the training procedure. 

Since our problem is pretty complex it is very easy for our model to overfit to the training data, therefore we must take some measures to counter that. We introduce two variables *counter* and *patience*. *Counter* keeps track of the number of epochs that passed since our model last made a new validation loss minimum, while *patience* is a threshold that reflects the maximum number of epochs that we allow our model to keep training while performing worse in its validation subset.

In [None]:
epochs = 7
training_loss = []
validation_loss = []
best_loss = 1000000
counter = 0
patience = 4
for epoch in tqdm(range(epochs), desc="Epoch"):
    print(f'Epoch {epoch + 1}')
    # Toggle model on training mode
    model.train()
    current_training_loss = 0
    # Iterate DataLoader
    for i, batch in enumerate(train_dl):
        # Clear optimizer
        optim.zero_grad()
        # Extract features, attention mask and labels
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Foward Feed
        outputs = model(input_ids, attention_mask=attention_mask,labels=labels)
        # Calculate Loss
        loss = outputs.loss
        # Backpropagation
        loss.backward()
        # Update parameters
        optim.step()
        current_training_loss += loss.item()
        # Every 1000 batches
        if i % 1000 == 999:
          last_loss = current_training_loss / 1000 # loss per batch
          print(f'Batch: {i + 1} Training Loss: {last_loss}')
    avg_loss = current_training_loss / (i + 1)
    training_loss.append(avg_loss)

    # Toggle model on evaluation mode
    model.eval()
    val_loss = 0
    # Iterate DataLoader
    for i, batch in enumerate(val_dl):
        # Extract features, attention mask and labels
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Foward Feed
        outputs = model(input_ids, attention_mask=attention_mask,labels=labels)
        # Calculate Loss
        loss = outputs.loss
        val_loss += loss.item()

    avg_val_loss = val_loss / (i + 1)
    validation_loss.append(avg_val_loss)
    print(f'Average Training Loss: {avg_loss : .3f}')
    print(f'Average Validation Loss:{avg_val_loss: .3f}')
    # Based on model's improvement, store weights or track overfitting
    if avg_val_loss < best_loss:
        best_epoch = epoch + 1
        best_loss = avg_val_loss
        counter = 0
        torch.save(model.state_dict(), f'model_{epoch + 1}.pt')
    else:
        counter += 1
    # Assess threshold breach
    if counter >= patience:
        print(f'Early stopping at epoch {epoch + 1} due to overfitting.')
        print(f'Best model occured on epoch: {best_epoch}')
        break

Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch 1
Batch: 1000 Training Loss: 0.2849548095390201
Average Training Loss:  0.275
Average Validation Loss: 0.193
Epoch 2
Batch: 1000 Training Loss: 0.17069116331636905
Average Training Loss:  0.171
Average Validation Loss: 0.190
Epoch 3
Batch: 1000 Training Loss: 0.13633696671947837
Average Training Loss:  0.137
Average Validation Loss: 0.195
Epoch 4
Batch: 1000 Training Loss: 0.10752261646091937
Average Training Loss:  0.108
Average Validation Loss: 0.202
Epoch 5
Batch: 1000 Training Loss: 0.08262827629968524
Average Training Loss:  0.083
Average Validation Loss: 0.212
Epoch 6
Batch: 1000 Training Loss: 0.06235021597146988
Average Training Loss:  0.063
Average Validation Loss: 0.222
Early stopping at epoch 6 due to overfitting.
Best model occured on epoch: 2


In [None]:
training_loss, validation_loss

([0.275035149202897,
  0.17073363604974914,
  0.13669991432982204,
  0.10800728588071945,
  0.08317175218705218,
  0.06327083100519189],
 [0.1931126386291914,
  0.1903761250155789,
  0.1950919846711042,
  0.2020932961437669,
  0.21203507412667874,
  0.22225573440114935])

Now we will tackle predicting the value of the cdindex.

We create our model's class that inherits from *nn.Module* and that comprises of our context fine-tuned model and three fully connected layers stacked on top. The first two have a ReLU activation function in an attempt to catch more complex relationships in the data whilst the last has a linear activation in order to predict the value of the index.  In addition to that we incorporate some *Dropout* layers to combat overfitting.

In [32]:
class IndexPredictor(nn.Module):
    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.dropout1 = nn.Dropout(p=0.2)
        self.dropout2 = nn.Dropout(p=0.2)
        self.fc1 = nn.Linear(in_features=30522, out_features=768)
        self.fc2 = nn.Linear(in_features=768, out_features=512)
        self.fc3 = nn.Linear(in_features=512, out_features=1)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0] # (batch_size, sequence_length, hidden_size)
        pooled_output = last_hidden_state[:, 0] # (batch_size, hidden_size)
        index = self.dropout1(F.relu(self.fc1(pooled_output)))
        index = self.dropout2(F.relu(self.fc2(index)))
        return self.fc3(index)

# Load fine-tuned BERT model
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
bert_model.load_state_dict(torch.load('models/model_2.pt', map_location=device))
# Create the new IndexPredictor model
model2 = IndexPredictor(bert_model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


We instanciate a DataFrame object containing all the abstracts that have a *cdindex* value

In [33]:
df_sup = pd.read_csv('data/data.csv')
df_sup.drop(columns='Unnamed: 0', inplace=True)
df_sup.head(5)

Unnamed: 0,abstract,cdindex
0,"The photolysis of dilute solutions of octacyclosulphur or hexacyclosulphur in n-hexane with 253.6 nm UV radiation produces S and possibly S2. The ‘ring-opening’ yields of these sulphur molecules range from 0.2 to 0.7. When the hydrogen end-capped polyyne C10H2 is irradiated in n-hexane, it transforms into unidentified products with a quantum yield of 3×10−5. When octacyclosulphur is added to the solution, the yield rises to 7×10−3. The putative sulphur-bearing product(s) could not be identified. It is suggested that sulphur-bearing molecules might be formed in astronomical settings by reactions of carbon molecules having triple or double C—C bonds with photolytically produced S and/or S2.",0.0
1,"Examples are given of unforeseen damage or difficulties arising during or after construction of buildings, wharves, foundations, tunnels, in-ground storage, pipe lines, and embankments. The underlying potentially latent mechanisms of groundwater are classified into four groups, namely water pressure changes, physical changes, erosion, and affects of earthworks, in order to assist early identification of some of the types of change in ground properties which may be induced by site operations and may lead to damage. It is concluded that otherwise unforeseen damage is more likely to be averted by a greater degree of investigation during construction into the consequences of the construction sequences, method and plant used.",0.0
2,"Acute hypoxia causes pulmonary vasoconstriction and coronary vasodilation. The divergent effects of hypoxia on pulmonary and coronary vascular smooth muscle cells suggest that the mechanisms involved in oxygen sensing and downstream effectors are different in these two types of cells. Since production of reactive oxygen species (ROS) is regulated by oxygen tension, ROS have been hypothesized to be a signaling mechanism in hypoxia-induced pulmonary vasoconstriction and vascular remodeling. Furthermore, an increased ROS production is also implicated in arteriosclerosis. In this study, we determined and compared the effects of hypoxia on ROS levels in human pulmonary arterial smooth muscle cells (PASMC) and coronary arterial smooth muscle cells (CASMC). Our results indicated that acute exposure to hypoxia (Po2 = 25–30 mmHg for 5–10 min) significantly and rapidly decreased ROS levels in both PASMC and CASMC. However, chronic exposure to hypoxia (Po2 = 30 mmHg for 48 h) markedly increased ROS levels in PASMC, but decreased ROS production in CASMC. Furthermore, chronic treatment with endothelin-1, a potent vasoconstrictor and mitogen, caused a significant increase in ROS production in both PASMC and CASMC. The inhibitory effect of acute hypoxia on ROS production in PASMC was also accelerated in cells chronically treated with endothelin-1. While the decreased ROS in PASMC and CASMC after acute exposure to hypoxia may reflect the lower level of oxygen substrate available for ROS production, the increased ROS production in PASMC during chronic hypoxia may reflect a pathophysiological response unique to the pulmonary vasculature that contributes to the development of pulmonary vascular remodeling in patients with hypoxia-associated pulmonary hypertension.",-0.002938
3,"Despite rapid advances in the stem cell field, the ability to identify and track transplanted or migrating stem cells in vivo is limited. To overcome this limitation, we used magnetic resonance imaging (MRI) to detect and follow transplanted stem cells over a period of 28 days in mice using an established myocardial infarction model. Pluripotent mouse embryonic stem (mES) cells were expanded and induced to differentiate into beating cardiomyocytes in vitro. The cardiac-differentiated mES cells were then loaded with superparamagnetic fluorescent microspheres (1.63 μm in diameter) and transplanted into ischemic myocardium immediately following ligation and subsequent reperfusion of the left anterior descending coronary artery. To identify the transplanted stem cells in vivo, MRI was performed using a Varian Inova 4.7 Tesla scanner. Our results show that (a) the cardiac-differentiated mES were effectively loaded with superparamagnetic microspheres in vitro, (b) the microsphere-loaded mES cells continued to beat in culture prior to transplantation, (c) the transplanted mES cells were readily detected in the heart in vivo using noninvasive MRI techniques, (d) the transplanted stem cells were detected in ischemic myocardium for the entire 28-day duration of the study as confirmed by MRI and post-mortem histological analyses, and (e) concurrent functional MRI indicated typical loss of cardiac function, although significant amelioration of remodeling was noted after 28 days in hearts that received transplanted stem cells. These results demonstrate that it is feasible to simultaneously track transplanted stem cells and monitor cardiac function in vivo over an extended period using noninvasive MRI techniques.\n Disclosure of potential conflicts of interest is found at the end of this article.",-0.004497
4,"For the regenerative therapy of osteochondral defects – deep lesions of the articular cartilage in which the underlying bone tissue is already affected too – special implant materials and scaffolds are needed. In this study, two new approaches will be presented, leading to biphasic, but monolithic scaffold materials. Both consist of a mineralised layer for filling of the bony part of the defect and a non-mineralised one for the chondral part. Due to the preparation methods, both layers are fused together to give a unified whole without need of any artificial joining. The resulting materials, either based on collagen, hyaluronic acid and hydroxyapatite or calcium alginate gels and hydroxyapatite, seem to be suitable scaffolds for cultivating chondrocytes and osteoblasts – and therefore can act as matrices for tissue engineering of osteochondral grafts.",-0.004373


As for the data preprocessing we follow the same procedure as before.

We convert our data to a list and pass it to the BERT tokenizer object.

In [34]:
tokenizer1 = BertTokenizer.from_pretrained('bert-base-uncased')

In [35]:
df_sup['abstract'] = df_sup['abstract'].astype(str)

In [36]:
data_list = df_sup['abstract'].tolist()
data_list[0]

'The photolysis of dilute solutions of octacyclosulphur or hexacyclosulphur in n-hexane with 253.6\xa0nm UV radiation produces S and possibly S2. The ‘ring-opening’ yields of these sulphur molecules range from 0.2 to 0.7. When the hydrogen end-capped polyyne C10H2 is irradiated in n-hexane, it transforms into unidentified products with a quantum yield of 3×10−5. When octacyclosulphur is added to the solution, the yield rises to 7×10−3. The putative sulphur-bearing product(s) could not be identified. It is suggested that sulphur-bearing molecules might be formed in astronomical settings by reactions of carbon molecules having triple or double C—C bonds with photolytically produced S and/or S2.'

In [37]:
inputs = tokenizer1(data_list, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs

{'input_ids': tensor([[  101,  1996,  6302,  ...,     0,     0,     0],
        [  101,  4973,  2024,  ...,     0,     0,     0],
        [  101, 11325,  1044,  ...,     0,     0,     0],
        ...,
        [  101,  2044, 16175,  ...,     0,     0,     0],
        [  101,  2023,  3720,  ...,     0,     0,     0],
        [  101, 12654, 12399,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

We create the mask array

In [38]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False,  True,  ..., False, False, False],
        [False,  True,  True,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

We replace the True values with the < MASK > token (103)

In [39]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [40]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

We create a new class that inherits from *torch.utils.data.Dataset*. However we modify the __ getitem__() method to also retrive the value of the cdindex from every abstract.

In [41]:
class CdindexDataset1(torch.utils.data.Dataset):
    def __init__(self, encodings, Y):
        self.encodings = encodings
        self.Y = torch.tensor(Y)
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.Y[idx]
    
    def __len__(self):
        return len(self.encodings.input_ids)

In [42]:
dataset = CdindexDataset1(inputs, df_sup['cdindex'])

We split the data into training and validation subsets, using an 80 - 20 ratio respectively

In [43]:
val_ratio = 0.2
train_dataset, val_dataset, _, _ = train_test_split(dataset, range(len(dataset)), test_size=val_ratio, random_state=42)

len(train_dataset), len(val_dataset)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.Y[idx]


(64224, 16057)

We instanciate a DataLoader object for each data subset.

In [44]:
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=True)

We write some device agnostic code

In [45]:
device = torch.device('cuda') if torch.cuda.is_available else torch.device('cpu')
device

device(type='cuda')

In [47]:
model2.to(torch.device('cpu'))

IndexPredictor(
  (bert): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
           

We instanciate our loss function and optimizer. This time though, in order to further combat overfitting we introduce weight decay.

In [48]:
optim = AdamW(model2.parameters(), lr=5e-5, weight_decay=0.01)
criterion = nn.MSELoss()



We start the training process.

As above we have the same overfitting counter-actions, using the *patience* and *counter* variables in order to keep track of our models performance on the validation subset and stop training when we detect 2 epochs where our model has not shown any improvements.

In [None]:
epochs = 7
training_loss = []
validation_loss = []
best_loss = 1000000
counter = 0
patience = 2
# Iterate DataLoader
for epoch in tqdm(range(epochs), desc="Epoch"):
      # Track training loss per epoch
      train_loss = 0
      # Toggle model on training mode
      model2.train()
      print(f'Epoch: {epoch}')
      for i, (inputs, labels) in enumerate(train_dl):
        # Clear optimizer
        optim.zero_grad()
        # Extract features, attention mask, labels
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)
        # Foward Feed
        outputs = model2(input_ids, attention_mask=attention_mask)
        # Calculate loss
        loss = criterion(outputs, torch.unsqueeze(labels.float(), 1))
        # Backpropagation
        loss.backward()
        # Update Parameters
        optim.step()
        # Every 1000 batches 
        train_loss += loss.item()
        if i % 1000 == 999:
          last_loss = train_loss / (i + 1)
          print(f'Batch {i + 1} / {len(train_dl)} Training Loss: {last_loss}')

      avg_loss = train_loss / (i + 1)
      training_loss.append(avg_loss / (i + 1))
      # Track validation loss per epoch
      val_loss = 0
      # Toggle model on evaluation mode
      model2.eval()
      # Iterate DataLoader
      for i, (inputs, labels) in enumerate(val_dl):
          # Extract features, attention mask, labels
          input_ids = inputs['input_ids'].to(device)
          attention_mask = inputs['attention_mask'].to(device)
          labels = labels.to(device)
          # Foward Feed
          outputs = model2(input_ids, attention_mask=attention_mask)
          # Calculate Loss
          loss = criterion(outputs, torch.unsqueeze(labels.float(), 1))
          val_loss += loss.item()
          # Every 1000 batches
          if i % 1000 == 999:
            last_loss = val_loss / (i + 1)
            print(f'Batch {i + 1} / {len(val_dl)} Validation Loss: {last_loss}')
      # Track losses for the whole process
      avg_val_loss = val_loss / (i + 1)
      validation_loss.append(avg_val_loss)
      print(f'Average Training Loss: {avg_loss : .3f}')
      print(f'Average Validation Loss:{avg_val_loss: .3f}')
      # Based on model's improvement, store weights or track overfitting
      if avg_val_loss < best_loss:
          best_epoch = epoch + 1
          best_loss = avg_val_loss
          counter = 0
          torch.save(model2.state_dict(), f'model_final_{best_epoch}.pt')
          print('Model Saved')
      else:
          counter += 1
      # Assess threshold breach
      if counter >= patience:
          print(f'Early stopping at epoch {epoch + 1} due to overfitting.')
          print(f'Best model occured on epoch: {best_epoch}')
          break

Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch: 0
Batch 1000 / 7226 Training Loss: 0.3072888724477962
Batch 2000 / 7226 Training Loss: 0.24175022287806497
Batch 3000 / 7226 Training Loss: 0.21568215244814443
Batch 4000 / 7226 Training Loss: 0.2007666713037179
Batch 5000 / 7226 Training Loss: 0.19166807731366717
Batch 6000 / 7226 Training Loss: 0.18500273987813853
Batch 7000 / 7226 Training Loss: 0.18002687109737392
Batch 1000 / 2008 Validation Loss: 0.14455048449058086
Batch 2000 / 2008 Validation Loss: 0.1421097921030596
Average Training Loss:  0.179
Average Validation Loss: 0.142


Epoch:  14%|█▍        | 1/7 [1:56:46<11:40:36, 7006.16s/it]

Model Saved
Epoch: 1
Batch 1000 / 7226 Training Loss: 0.1470269664605148
Batch 2000 / 7226 Training Loss: 0.14454330142971594
Batch 3000 / 7226 Training Loss: 0.1439209230328755
Batch 4000 / 7226 Training Loss: 0.1431658864080091
Batch 5000 / 7226 Training Loss: 0.1418484735670034
Batch 6000 / 7226 Training Loss: 0.14669036462534374
Batch 7000 / 7226 Training Loss: 0.15180490126264548
Batch 1000 / 2008 Validation Loss: 0.18107159739732742
Batch 2000 / 2008 Validation Loss: 0.18090481951925905


Epoch:  29%|██▊       | 2/7 [3:52:30<9:40:50, 6970.06s/it] 

Average Training Loss:  0.153
Average Validation Loss: 0.181
Epoch: 2
Batch 1000 / 7226 Training Loss: 0.17730716173909605
Batch 2000 / 7226 Training Loss: 0.17897245545592158
Batch 3000 / 7226 Training Loss: 0.17807863775640725


*Since training took place in Google Colab with limited resourses, I only managed to get 3 epochs before getting my runtime shutdown by Google. This plays a significant role to the model's evaluation. In addition to that the best validation loss occured on the first epoch and hence thats the model we are using (meaning its only trained for one epoch)*

Time to evaluate our model

We convert our data to a list and pass it to the BERT tokenizer object.

In [49]:
data_list = test_df['abstract'].tolist()
data_list[0]

'it has been successfully demonstrated that ceramic materials can be joined in the green\nstate without a second phase by using low pressure injection molded parts. the investigation of the\njoining interface revealed that a high quality interface can be achieved by carefully adjusting the\ndifferent manufacturing steps. the use of monomodal particle size distribution in the used powder\nct3000sg is inferior to a broader particle size distribution obtained by replacing 33% of the finer\nalumina powder by coarser ct1200sg. in this way the dewaxing process is significantly improved\nwhen the wall thickness of the part exceeds 3 mm. the investigation of the mechanical properties of\nthe joined and sintered parts revealed, that the bending strength of the joined specimens achieved\nabout 80 % of the unjoined, monolithic specimens.'

In [50]:
inputs = tokenizer1(data_list, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
inputs

{'input_ids': tensor([[  101,  2009,  2038,  ...,     0,     0,     0],
        [  101,  3800,  1999,  ...,     0,     0,     0],
        [  101,  2348, 21396,  ...,  2013,  1996,   102],
        ...,
        [  101,  4372,  5794,  ...,     0,     0,     0],
        [  101,  1037,  1012,  ...,     0,     0,     0],
        [  101,  2057,  3189,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

We create the mask array

In [51]:
rand = torch.rand(inputs.input_ids.shape)
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

We replace the True values with the < MASK > token (103)

In [52]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [53]:
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [54]:
class CdindexDataset1(torch.utils.data.Dataset):
    def __init__(self, encodings, Y):
        self.encodings = encodings
        self.Y = torch.tensor(Y)
    
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.Y[idx]
    
    def __len__(self):
        return len(self.encodings.input_ids)

We instanciate our Dataset and DataLoader objects

In [55]:
test_dataset = CdindexDataset1(inputs, test_df['cdindex'])

In [56]:
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=8)

Evaluation process

In [57]:
# Load saved model
model2.load_state_dict(torch.load('models/model_fc3_1.pt', map_location=device))

RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.

In [None]:
criterion = nn.MSELoss()
test_loss = 0
y_pred = []
y_true = []
for i, (inputs, labels) in enumerate(test_dl):
  with torch.inference_mode():
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    labels = labels.to(device)
    outputs = model2(input_ids, attention_mask=attention_mask)
    loss = criterion(outputs, torch.unsqueeze(labels.float(), 1))
    test_loss += loss.item()
    y_true.extend(labels.detach().cpu().numpy())
    y_pred.extend(outputs.detach().cpu().numpy())
    if i % 999 == 1000:
      print(f'Batch {i + 1} Average Test MSE: {test_loss / (i + 1)}')
print(f'Average Test Loss for {i + 1} samples is {test_loss / len(test_dl)}')

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.Y[idx]


Average Test Loss for 1004 samples is 0.14523824602152144


In [None]:
r2 = r2_score(y_true=y_true, y_pred=y_pred)
adjusted_r2 = 1 - (1 - r2) * (len(test_df) - 1) / (len(test_df) - 1 - 1) 
#adjusted_r2 = 0
mse = mean_squared_error(y_true=y_true, y_pred=y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true=y_true, y_pred=y_pred)
print(f"Mean Absolute Error: {mae} \nMean Squared Error: {mse}\nRoot Mean Squared Error: {rmse}\nR-squared: {r2}\nAdjusted R-squared: {adjusted_r2}")

Mean Absolute Error: 0.28285327523430587 
Mean Squared Error: 0.14519462385321585
Root Mean Squared Error: 0.38104412323668746
R-squared: 0.1908209530621291
Adjusted R-squared: 0.19072013334534144


*Since I am working from a laptop without an nvidia gpu, I had to resort to Google Colab in order to access *cuda* gpus. However Colab's resourses are limited and that was a huge burden during this implementation. I believe that with more epochs during training, more layers, incorporation of a tf-idf vectorizer as well as more computational power the model would truly shine.*

*Also it would be very beneficial to incorporate the number of citations per paper as a feature for our model, as underlined by the authors. However due to time limitations I was not able to pull it off.*