# **TASK 2**

# 0. Setup

## 0.1 Import libraries.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:0

In [3]:
import numpy as np
import pandas as pd
import os

import random

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

from sklearn.utils import resample
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from numpy import where
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE

## 0.2 Utilities.

In [4]:
df_path = '/content/drive/MyDrive/progetto-ml/data/'

In [5]:
save_df = True

In [6]:
n_components = 1
empty_element = 0.0
month_interval = 3
model_month_interval = True

In [7]:
remove_history_all = True
months_to_remove = 6

In [8]:
cardiovascular_events = ['AMD047', 'AMD048', 'AMD049', 'AMD071', 'AMD081', 'AMD082', 'AMD208', 'AMD303']

In [9]:
relevant_columns = ['idcentro', 'idana', 'data', 'codiceamd', 'valore', 'sesso', 'annodiagnosidiabete', 'scolarita', 'statocivile', 'professione', 'origine', 'annonascita', 'label']

In [10]:
def diff_month(d1, d2):
    return abs((d1.year - d2.year) * 12 + d1.month - d2.month)

In [11]:
def get_age(row):
    anno_nascita = row['annonascita']
    anno = row['data'].year
    age = anno - anno_nascita
    return age

## 0.3 Load in the datasets.

In [None]:
final_df = pd.read_csv(df_path+'final_df.csv')

In [None]:
labeled_patients = pd.read_csv(df_path+'final_patients.csv')

# 1. Class Imbalance (part I)

Not all patients will have a cardiovascular event within the stabilised six-month period. Thus, we would expect that the class distribution is highly imbalanced. For each patient pi such that y(pi) = 1, eliminate the last six months of history to avoid giving the model prediction hints into the future. For each patient pi such that y(pi) = 2, create m copies {p1i , . . . , pmi } such that all the cardiovascular events in the last six months of pji ∀i∈[1,|P|]∀j∈[1,m] are eliminated, and the other events are shuffled and cancelled at random. In this way, you have a sort of balancing criterion (i.e., up-sampling the minority class).

In [None]:
labeled_patients['label'].value_counts()

0    61794
1     6672
Name: label, dtype: int64

In [None]:
label_0 = labeled_patients.loc[labeled_patients['label'] == 0].shape[0]
label_1 = labeled_patients.loc[labeled_patients['label'] == 1].shape[0]
imbalance = label_1/label_0
imbalance

0.1079716477327896

## 1.1. Delete History

For each patient pi such that y(pi) = 1, eliminate the last six months of history to avoid giving the model prediction hints into the future.

In [None]:
label = 1

In [None]:
patients_latest_event = pd.read_csv(df_path+'latest_event.csv')
six_months_ago = patients_latest_event
six_months_ago['six_months'] = six_months_ago['data'].apply(lambda x : pd.Timestamp(x) - pd.DateOffset(months=months_to_remove))
six_months_ago = six_months_ago[['idcentro','idana', 'six_months']]
del patients_latest_event
six_months_ago.head()

Unnamed: 0,idcentro,idana,six_months
0,190,889,2019-05-15
1,268,1258,2019-04-01
2,14,408,2019-03-30
3,268,689,2019-03-30
4,269,2940,2019-03-30


In [None]:
no_history_df = final_df[relevant_columns]
no_history_df.head()

Unnamed: 0,idcentro,idana,data,codiceamd,valore,sesso,annodiagnosidiabete,scolarita,statocivile,professione,origine,annonascita,label
0,1,5,1980-01-01,AMD247,491.2,M,1986.0,2.0,2.0,9.0,,1942,0
1,1,5,1986-01-01,AMD247,272.0,M,1986.0,2.0,2.0,9.0,,1942,0
2,1,5,1987-01-01,AMD083,,M,1986.0,2.0,2.0,9.0,,1942,0
3,1,5,1987-01-01,AMD247,401.0,M,1986.0,2.0,2.0,9.0,,1942,0
4,1,5,1997-12-01,AMD247,410.0,M,1986.0,2.0,2.0,9.0,,1942,0


In [None]:
# merge final_df with the six_moths_ago df
no_history_df = pd.merge(no_history_df, six_months_ago, on=['idcentro', 'idana'], how='inner')
no_history_df.head(), no_history_df.shape[0]

(   idcentro  idana        data codiceamd  valore sesso  annodiagnosidiabete  \
 0         1      5  1980-01-01    AMD247  491.20     M               1986.0   
 1         1      5  1986-01-01    AMD247   272.0     M               1986.0   
 2         1      5  1987-01-01    AMD083     NaN     M               1986.0   
 3         1      5  1987-01-01    AMD247     401     M               1986.0   
 4         1      5  1997-12-01    AMD247     410     M               1986.0   
 
    scolarita  statocivile  professione  origine  annonascita  label six_months  
 0        2.0          2.0          9.0      NaN         1942      0 2013-06-30  
 1        2.0          2.0          9.0      NaN         1942      0 2013-06-30  
 2        2.0          2.0          9.0      NaN         1942      0 2013-06-30  
 3        2.0          2.0          9.0      NaN         1942      0 2013-06-30  
 4        2.0          2.0          9.0      NaN         1942      0 2013-06-30  ,
 11591301)

In [None]:
if remove_history_all:
    no_history_df = no_history_df[(no_history_df['data'] >= no_history_df['six_months'])]
else:
    no_history_df = no_history_df[(no_history_df['label'] != label) | (no_history_df['data'] >= no_history_df['six_months'])]
no_history_df.drop(columns = ["six_months"], inplace=True)
no_history_df.head(), no_history_df.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_history_df.drop(columns = ["six_months"], inplace=True)


(     idcentro  idana        data codiceamd valore sesso  annodiagnosidiabete  \
 142         1      5  2013-10-01    AMD049      S     M               1986.0   
 143         1      5  2013-10-01    AMD247  36.10     M               1986.0   
 144         1      5  2013-10-01    AMD247  37.94     M               1986.0   
 145         1      5  2013-10-01    AMD247    401     M               1986.0   
 146         1      5  2013-10-01    AMD247    402     M               1986.0   
 
      scolarita  statocivile  professione  origine  annonascita  label  
 142        2.0          2.0          9.0      NaN         1942      0  
 143        2.0          2.0          9.0      NaN         1942      0  
 144        2.0          2.0          9.0      NaN         1942      0  
 145        2.0          2.0          9.0      NaN         1942      0  
 146        2.0          2.0          9.0      NaN         1942      0  ,
 1340679)

In [None]:
if save_df:
    no_history_df.to_csv(df_path+'no_history_df.csv')

In [None]:
del final_df
del no_history_df
del six_months_ago

## 1.2. Up-sampling

For each patient pi such that y(pi) = 1, create m copies {p1i , . . . , pmi } such that all the cardiovascular events in the last six months of pji ∀i∈[1,|P|]∀j∈[1,m] are eliminated, and the other events are shuffled and cancelled at random. In this way, you have a sort of balancing criterion (i.e., up-sampling the minority class).

In [None]:
final_df = pd.read_csv(df_path+'no_history_df.csv')

In [None]:
labeled_patients['label'].value_counts()

0    61794
1     6672
Name: label, dtype: int64

Up-sampling parameters

In [None]:
label = 1

In [None]:
m = 2 # number of copies
del_probability = 0.05 # 5%
max_idcentro = final_df.loc[final_df['idcentro'].idxmax()]['idcentro']
max_idcentro

500

In [None]:
df = final_df
del final_df
label_1_patients = labeled_patients[labeled_patients['label'] == 1][['idcentro', 'idana']]

In [None]:
for i in range(m):
    patients_copies = df[(df['idcentro'] <= max_idcentro) & (df['label'] == 1)]
    patients_copies['idcentro'] = patients_copies['idcentro'] + max_idcentro + i + 1

    for index, row in label_1_patients.iterrows():
        idcentro = row[0] + max_idcentro + i + 1
        idana = row[1]

        tmp_df = patients_copies[(patients_copies['idcentro'] == idcentro) & (patients_copies['idana'] == idana)]
        # shuffle dates
        tmp_df['data'].sample(frac=1).reset_index(drop=True)
        tmp_df.sample(frac=(1-del_probability)).reset_index(drop=True)

        df = pd.concat([df, tmp_df])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patients_copies['idcentro'] = patients_copies['idcentro'] + max_idcentro + i + 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patients_copies['idcentro'] = patients_copies['idcentro'] + max_idcentro + i + 1


In [None]:
if save_df:
    df.to_csv(df_path+'naive_upsampled.csv')
del df
del tmp_df
del patients_copies
del label_1_patients

# 2. PubMedBert

## 2.1 Preprocessing

In [None]:
!pip install sentence_transformers

In [None]:
final_df = pd.read_csv(df_path+'naive_upsampled.csv')

In [None]:
final_df['data'] = pd.to_datetime(final_df['data'])
final_df['eta'] = final_df.apply(lambda row: get_age(row), axis=1)

In [None]:
final_df = final_df.iloc[:,1:]
final_df.drop(columns = ["annonascita", "annodiagnosidiabete"], inplace=True)

In [None]:
final_df['scolarita'].value_counts()
final_df['scolarita'].fillna(0, inplace=True)

In [None]:
final_df['statocivile'].value_counts()
final_df['statocivile'].fillna(0, inplace=True)

In [None]:
final_df['professione'].value_counts()
final_df['professione'].fillna(0, inplace=True)

In [None]:
final_df['origine'].value_counts()
final_df['origine'].isna().sum()
final_df['origine'].fillna(0, inplace=True)
final_df['origine'].isna().sum()

0

In [None]:
final_df['valore'].value_counts()
final_df['valore'].isna().sum()
final_df['valore'].fillna(0, inplace=True)

In [None]:
gender_mapping = {"M": 0, "F": 1}
final_df['sesso'] = final_df['sesso'].map(gender_mapping)

In [None]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,idcentro,idana,data,codiceamd,valore,sesso,scolarita,statocivile,professione,origine,label,eta
0,142,1,5,2013-10-01,AMD049,S,0,2.0,2.0,9.0,0.0,0,71
1,143,1,5,2013-10-01,AMD247,36.10,0,2.0,2.0,9.0,0.0,0,71
2,144,1,5,2013-10-01,AMD247,37.94,0,2.0,2.0,9.0,0.0,0,71
3,145,1,5,2013-10-01,AMD247,401,0,2.0,2.0,9.0,0.0,0,71
4,146,1,5,2013-10-01,AMD247,402,0,2.0,2.0,9.0,0.0,0,71


In [None]:
str_values = []
for index, row in final_df.iterrows():
    if (type(row['valore']) == str):
        try:
            float(row['valore'])
        except:
            str_values.append(row['valore'])

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
simple_encoder = LabelEncoder()
str_codes = simple_encoder.fit_transform(str_values)

In [None]:
encodes = np.array(str_codes).transpose()
original = np.array(str_values).transpose()

In [None]:
all_values = final_df['valore']
new_values = []
for v in all_values:
    if (type(v) == str):
        try:
            val = float(v)
            new_values.append(val)
        except:
            i = str_values.index(v)
            new_values.append(str_codes[i])
    else:
        new_values.append(v)
valore = np.array(new_values).transpose()
final_df['valore'] = valore

In [None]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,idcentro,idana,data,codiceamd,valore,sesso,scolarita,statocivile,professione,origine,label,eta
0,142,1,5,2013-10-01,AMD049,155.0,0,2.0,2.0,9.0,0.0,0,71
1,143,1,5,2013-10-01,AMD247,36.1,0,2.0,2.0,9.0,0.0,0,71
2,144,1,5,2013-10-01,AMD247,37.94,0,2.0,2.0,9.0,0.0,0,71
3,145,1,5,2013-10-01,AMD247,401.0,0,2.0,2.0,9.0,0.0,0,71
4,146,1,5,2013-10-01,AMD247,402.0,0,2.0,2.0,9.0,0.0,0,71


## 2.3 Call PubMedBERT

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

In [None]:
amd_codes_df = pd.read_csv(df_path + 'amd_codes_for_bert.csv')
amd_codes_df.head()

Unnamed: 0,codice,meaning
0,AMD090,Diet only
1,AMD140,Self control
2,AMD215,Number of strips prescribed per week
3,AMD228,Integrated management
4,AMD086,Self-monitoring of blood glucose


In [None]:
sbert_model = SentenceTransformer('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
document_embeddings = sbert_model.encode(amd_codes_df['meaning'])

Downloading (…)0f386/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)2ff30f386/LICENSE.md:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

Downloading (…)12ff30f386/README.md:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading (…)ff30f386/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)12ff30f386/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/microsoft_BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 2.4 PCA

In [None]:
pca = PCA(n_components=n_components)
transformed_data = pca.fit_transform(document_embeddings)
document_embeddings = transformed_data

In [None]:
# Fit the data to the PCA model and compute the transformed data
transformed_data = pca.fit_transform(document_embeddings)
document_embeddings = transformed_data.transpose()
for index_c, row in enumerate(document_embeddings):
    amd_codes_df['pubmedbert' + str(index_c)] = document_embeddings[index_c]
n_components = index_c+1

In [None]:
print("Embedding doe with " + str(n_components) + " features.")

Embedding doe with 1 features.


In [None]:
if save_df:
    amd_codes_df.to_csv(df_path + 'amd_pubmedbert.csv')
amd_codes_df.head()

Unnamed: 0,codice,meaning,pubmedbert0
0,AMD090,Diet only,2.045099
1,AMD140,Self control,-0.171683
2,AMD215,Number of strips prescribed per week,-1.428744
3,AMD228,Integrated management,0.813537
4,AMD086,Self-monitoring of blood glucose,-1.204597


## 2.5 Merge Embedding

Put the PubMedBERT embedding into final_df

In [None]:
for index_c, row in enumerate(document_embeddings):
    final_df['pubmedbert' + str(index_c)] = final_df['codiceamd'].map(amd_codes_df.set_index('codice')['pubmedbert' + str(index_c)])

In [None]:
#final_df.drop(columns=["codiceamd"], inplace=True)
if save_df:
    final_df.to_csv(df_path + "processed.csv")
final_df.head()

Unnamed: 0.1,Unnamed: 0,idcentro,idana,data,codiceamd,valore,sesso,scolarita,statocivile,professione,origine,label,eta,pubmedbert0
0,142,1,5,2013-10-01,AMD049,155.0,0,2.0,2.0,9.0,0.0,0,71,-0.276678
1,143,1,5,2013-10-01,AMD247,36.1,0,2.0,2.0,9.0,0.0,0,71,-1.283299
2,144,1,5,2013-10-01,AMD247,37.94,0,2.0,2.0,9.0,0.0,0,71,-1.283299
3,145,1,5,2013-10-01,AMD247,401.0,0,2.0,2.0,9.0,0.0,0,71,-1.283299
4,146,1,5,2013-10-01,AMD247,402.0,0,2.0,2.0,9.0,0.0,0,71,-1.283299


In [None]:
del final_df
del document_embeddings
del transformed_data
del amd_codes_df
del pca
del sbert_model

# 3. Define Sequences

## 3.1 Personal data

In [None]:
final_df = pd.read_csv(df_path + "processed.csv")
final_df = final_df.iloc[:,1:]
final_df.head()

Unnamed: 0.1,Unnamed: 0,idcentro,idana,data,codiceamd,valore,sesso,scolarita,statocivile,professione,origine,label,eta,pubmedbert0
0,142,1,5,2013-10-01,AMD049,155.0,0,2.0,2.0,9.0,0.0,0,71,-0.276678
1,143,1,5,2013-10-01,AMD247,36.1,0,2.0,2.0,9.0,0.0,0,71,-1.283299
2,144,1,5,2013-10-01,AMD247,37.94,0,2.0,2.0,9.0,0.0,0,71,-1.283299
3,145,1,5,2013-10-01,AMD247,401.0,0,2.0,2.0,9.0,0.0,0,71,-1.283299
4,146,1,5,2013-10-01,AMD247,402.0,0,2.0,2.0,9.0,0.0,0,71,-1.283299


Write anagrafic details into LABELED_PATIENTS

In [None]:
final_df.drop_duplicates(subset=['idcentro', 'idana'], inplace=True)
final_df.drop(columns=['data','valore', 'eta', 'codiceamd'], inplace=True)
for i in range(n_components):
    final_df.drop(columns=['pubmedbert'+str(i)], inplace=True)
final_df.head()

Unnamed: 0.1,Unnamed: 0,idcentro,idana,sesso,scolarita,statocivile,professione,origine,label
0,142,1,5,0,2.0,2.0,9.0,0.0,0
85,876,1,36,1,2.0,3.0,9.0,0.0,0
101,1098,1,38,1,2.0,3.0,9.0,0.0,0
110,1464,1,61,1,2.0,2.0,4.0,0.0,0
133,2133,1,65,0,2.0,1.0,9.0,0.0,0


In [None]:
if save_df:
    final_df.to_csv(df_path+"labeled_anagrafic.csv")
del final_df

## 3.2 Normal sequences

### 3.2.1 Generate sequences

In [None]:
labeled_patients = pd.read_csv(df_path+"labeled_anagrafic.csv")
labeled_patients = labeled_patients.iloc[:,1:]
final_df = pd.read_csv(df_path + "processed.csv")
final_df.drop(columns=['sesso','scolarita', 'statocivile', 'professione', 'label', 'origine'], inplace=True)
final_df = final_df.iloc[:,1:]

In [None]:
max_sequence_len = 0
min_sequence_len = float('inf')
sequence_matrix = []
seq_index = 0
only_main = False # if the sequence should oly include macro events
n_patients = labeled_patients.shape[0]

In [None]:
for index, row in labeled_patients.iterrows():
    print(str(index) + ' of ' + str(n_patients) + ' (' + str(int(index/n_patients)) + '%)')
    seq_index = 0
    sequence = []
    # add patient's data to sequence
    sequence.append(row['idcentro'])
    seq_index += 1
    sequence.append(row['idana'])
    seq_index += 1
    sequence.append(row['sesso'])
    seq_index += 1
    sequence.append(row['scolarita'])
    seq_index += 1
    sequence.append(row['statocivile'])
    seq_index += 1
    sequence.append(row['professione'])
    seq_index += 1
    sequence.append(row['origine'])
    seq_index += 1

    # build the sequence
    # get all events from patient
    c1 = final_df['idcentro'] == row['idcentro']
    c2 = final_df['idana'] == row['idana']
    if only_main:
        c3 = final_df['codiceamd'].isin(cardiovascular_events)
    else:
        c3 = True

    events = final_df[c1 & c2 & c3]

    # order by date
    events = events.sort_values('data', ascending=True)

    # write sequence
    for i, event in events.iterrows():
        # insert valuable data from event
            # patient AGE
        sequence.append(event['eta'])
        seq_index += 1

        # embedding of AMD CODE
        for j in range(n_components):
            sequence.append(event['pubmedbert'+str(j)])
            seq_index += 1

            # VALUE
        sequence.append(event['valore'])
        seq_index += 1

    # add to sequences
    sequence_matrix.append(sequence)
    #update sequence len
    #seq_index += 1
    if max_sequence_len < seq_index:
        max_sequence_len = seq_index
    if min_sequence_len > seq_index:
        min_sequence_len = seq_index

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
53045 of 58045 (0%)
53046 of 58045 (0%)
53047 of 58045 (0%)
53048 of 58045 (0%)
53049 of 58045 (0%)
53050 of 58045 (0%)
53051 of 58045 (0%)
53052 of 58045 (0%)
53053 of 58045 (0%)
53054 of 58045 (0%)
53055 of 58045 (0%)
53056 of 58045 (0%)
53057 of 58045 (0%)
53058 of 58045 (0%)
53059 of 58045 (0%)
53060 of 58045 (0%)
53061 of 58045 (0%)
53062 of 58045 (0%)
53063 of 58045 (0%)
53064 of 58045 (0%)
53065 of 58045 (0%)
53066 of 58045 (0%)
53067 of 58045 (0%)
53068 of 58045 (0%)
53069 of 58045 (0%)
53070 of 58045 (0%)
53071 of 58045 (0%)
53072 of 58045 (0%)
53073 of 58045 (0%)
53074 of 58045 (0%)
53075 of 58045 (0%)
53076 of 58045 (0%)
53077 of 58045 (0%)
53078 of 58045 (0%)
53079 of 58045 (0%)
53080 of 58045 (0%)
53081 of 58045 (0%)
53082 of 58045 (0%)
53083 of 58045 (0%)
53084 of 58045 (0%)
53085 of 58045 (0%)
53086 of 58045 (0%)
53087 of 58045 (0%)
53088 of 58045 (0%)
53089 of 58045 (0%)
53090 of 58045 (0%)
53091 of 58045 (

In [None]:
del final_df
del events
del c1
del c2
del c3

### 3.2.2 Process sequences

All sequences must have the same length, thus we will extend the short ones.

Sequences have no label.

In [None]:
max_sequence_len, min_sequence_len

(1585, 10)

In [None]:
for seq in range(len(sequence_matrix)):
    for new_element in range( max_sequence_len - len(sequence_matrix[seq]) ):
        sequence_matrix[seq].append(empty_element)

Add the label at the very end of each sequence

In [None]:
for seq in sequence_matrix:
    idcentro = seq[0]
    idana = seq[1]
    c1 = labeled_patients['idcentro'] == idcentro
    c2 = labeled_patients['idana'] == idana
    label = labeled_patients[c1 & c2].iloc[0,8]
    seq.append(label)

In [None]:
del labeled_patients
del c1
del c2
del seq

In [None]:
seq_df = pd.DataFrame(sequence_matrix)
seq_df.rename(columns={0:'idcentro', 1:'idana', max_sequence_len:'label'}, inplace=True)

In [None]:
seq_df.head()

Unnamed: 0,idcentro,idana,2,3,4,5,6,7,8,9,...,1576,1577,1578,1579,1580,1581,1582,1583,1584,label
0,1.0,5.0,0.0,2.0,2.0,9.0,0.0,71,1.223619,1.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,36.0,1.0,2.0,3.0,9.0,0.0,86,-1.568388,116.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1.0,38.0,1.0,2.0,3.0,9.0,0.0,90,-0.802095,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,61.0,1.0,2.0,2.0,4.0,0.0,80,1.885222,140.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,65.0,0.0,2.0,1.0,9.0,0.0,75,-1.568388,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
if save_df:
    seq_df.to_csv(df_path + "norm_sequences_df.csv")

In [None]:
del seq_df
del sequence_matrix

## 3.3 Time-Sensitive Sequences

### 3.3.1 Generate Sequences

In [None]:
labeled_patients = pd.read_csv(df_path+"labeled_anagrafic.labcsv")
labeled_patients = labeled_patients.iloc[:,1:]
final_df = pd.read_csv(df_path + "processed.csv")
final_df.drop(columns=['sesso','scolarita', 'statocivile', 'professione', 'label', 'origine'], inplace=True)
final_df = final_df.iloc[:,1:]

In [None]:
max_sequence_len = 0
min_sequence_len = float('inf')
sequence_matrix = []
seq_index = 0
only_main = False # if the sequence should oly include macro events
n_patients = labeled_patients.shape[0]

In [None]:
for index, row in labeled_patients.iterrows():
    print(str(index) + ' of ' + str(n_patients) + ' (' + str(int(index/n_patients)) + '%)')
    seq_index = 0
    sequence = []
    # add patient's data to sequence
    sequence.append(row['idcentro'])
    seq_index += 1
    sequence.append(row['idana'])
    seq_index += 1
    sequence.append(row['sesso'])
    seq_index += 1
    sequence.append(row['scolarita'])
    seq_index += 1
    sequence.append(row['statocivile'])
    seq_index += 1
    sequence.append(row['professione'])
    seq_index += 1
    sequence.append(row['origine'])
    seq_index += 1

    # build the sequence
    # get all events from patient
    c1 = final_df['idcentro'] == row['idcentro']
    c2 = final_df['idana'] == row['idana']
    if only_main:
        c3 = final_df['codiceamd'].isin(cardiovascular_events)
    else:
        c3 = True

    events = final_df[c1 & c2 & c3]

    # order by date
    events = events.sort_values('data', ascending=True)
    last_event = None

    # write sequence
    for i, event in events.iterrows():
        # check empty intervals
        if last_event is None:
            last_event = pd.to_datetime(event['data'])
        else:
            if (pd.Timestamp(event['data']) - pd.DateOffset(months=month_interval)) > last_event:
                intervals_passed = int( diff_month(last_event,pd.to_datetime(event['data'])) / month_interval)
                for k in range(intervals_passed): # how many intervals have passed
                     # add empty sequence
                    sequence.append(event['eta'])
                    seq_index += 1
                    for j in range(n_components):
                        sequence.append(0.0)
                        seq_index += 1
                    sequence.append(0)
                    seq_index += 1
            # update last event
            last_event = pd.to_datetime(event['data'])

        # insert valuable data from event
            # patient AGE
        sequence.append(event['eta'])
        seq_index += 1

        # embedding of AMD CODE
        for j in range(n_components):
            sequence.append(event['pubmedbert'+str(j)])
            seq_index += 1

            # VALUE
        sequence.append(event['valore'])
        seq_index += 1

    # add to sequences
    sequence_matrix.append(sequence)
    #update sequence len
    #seq_index += 1
    if max_sequence_len < seq_index:
        max_sequence_len = seq_index
    if min_sequence_len > seq_index:
        min_sequence_len = seq_index

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
43411 of 48411 (0%)
43412 of 48411 (0%)
43413 of 48411 (0%)
43414 of 48411 (0%)
43415 of 48411 (0%)
43416 of 48411 (0%)
43417 of 48411 (0%)
43418 of 48411 (0%)
43419 of 48411 (0%)
43420 of 48411 (0%)
43421 of 48411 (0%)
43422 of 48411 (0%)
43423 of 48411 (0%)
43424 of 48411 (0%)
43425 of 48411 (0%)
43426 of 48411 (0%)
43427 of 48411 (0%)
43428 of 48411 (0%)
43429 of 48411 (0%)
43430 of 48411 (0%)
43431 of 48411 (0%)
43432 of 48411 (0%)
43433 of 48411 (0%)
43434 of 48411 (0%)
43435 of 48411 (0%)
43436 of 48411 (0%)
43437 of 48411 (0%)
43438 of 48411 (0%)
43439 of 48411 (0%)
43440 of 48411 (0%)
43441 of 48411 (0%)
43442 of 48411 (0%)
43443 of 48411 (0%)
43444 of 48411 (0%)
43445 of 48411 (0%)
43446 of 48411 (0%)
43447 of 48411 (0%)
43448 of 48411 (0%)
43449 of 48411 (0%)
43450 of 48411 (0%)
43451 of 48411 (0%)
43452 of 48411 (0%)
43453 of 48411 (0%)
43454 of 48411 (0%)
43455 of 48411 (0%)
43456 of 48411 (0%)
43457 of 48411 (

In [None]:
del final_df
del events
del c1
del c2
del c3

### 3.3.1 Process Sequences

In [None]:
max_sequence_len, min_sequence_len

(1585, 7)

In [None]:
for seq in range(len(sequence_matrix)):
    for new_element in range( max_sequence_len - len(sequence_matrix[seq]) ):
        sequence_matrix[seq].append(empty_element)

In [None]:
for seq in sequence_matrix:
    idcentro = seq[0]
    idana = seq[1]
    c1 = labeled_patients['idcentro'] == idcentro
    c2 = labeled_patients['idana'] == idana
    label = labeled_patients[c1 & c2].iloc[0,7]
    seq.append(label)

In [None]:
del labeled_patients
del c1
del c2
del seq

In [None]:
seq_df = pd.DataFrame(sequence_matrix)
seq_df.rename(columns={0:'idcentro', 1:'idana', max_sequence_len:'label'}, inplace=True)

In [None]:
seq_df.head()

Unnamed: 0,idcentro,idana,2,3,4,5,6,7,8,9,...,1576,1577,1578,1579,1580,1581,1582,1583,1584,label
0,1.0,5.0,0.0,2.0,2.0,9.0,0.0,71.0,1.223619,1.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,36.0,1.0,2.0,3.0,9.0,0.0,86.0,-1.568388,116.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1.0,38.0,1.0,2.0,3.0,9.0,0.0,90.0,-0.802095,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,61.0,1.0,2.0,2.0,4.0,0.0,80.0,1.885222,140.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,65.0,0.0,2.0,1.0,9.0,0.0,75.0,-1.568388,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
if save_df:
    seq_df.to_csv(df_path + "time_sequences_df.csv")

In [None]:
del seq_df
del sequence_matrix

# 4. SMOTE

## 4.1 Normal Sequences

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
final_df = pd.read_csv(df_path + "norm_sequences_df.csv")
final_df = final_df.iloc[:,1:]

In [None]:
final_df.head()

Unnamed: 0,idcentro,idana,2,3,4,5,6,7,8,9,...,1576,1577,1578,1579,1580,1581,1582,1583,1584,label
0,1.0,5.0,0.0,2.0,2.0,9.0,0.0,71,1.223619,1.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,36.0,1.0,2.0,3.0,9.0,0.0,86,-1.568388,116.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,1.0,38.0,1.0,2.0,3.0,9.0,0.0,90,-0.802095,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1.0,61.0,1.0,2.0,2.0,4.0,0.0,80,1.885222,140.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1.0,65.0,0.0,2.0,1.0,9.0,0.0,75,-1.568388,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


Separate the features and the labels

In [None]:
y = final_df['label']
X = final_df.drop(columns=['label'])
del final_df

Apply SMOTE to oversample the minority class

In [None]:
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
y_resampled.value_counts()

0    43926
1    43926
Name: label, dtype: int64

In [None]:
X_resampled.head()

Unnamed: 0,idcentro,idana,2,3,4,5,6,7,8,9,...,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584
0,1.0,5.0,0.0,2.0,2.0,9.0,0.0,71,1.223619,1.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,36.0,1.0,2.0,3.0,9.0,0.0,86,-1.568388,116.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,38.0,1.0,2.0,3.0,9.0,0.0,90,-0.802095,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,61.0,1.0,2.0,2.0,4.0,0.0,80,1.885222,140.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,65.0,0.0,2.0,1.0,9.0,0.0,75,-1.568388,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
if save_df:
    final_df = X_resampled.iloc[:,:]
    final_df['label'] = y_resampled
    final_df.to_csv(df_path + "norm_smote_seq.csv")
    del final_df

  final_df['label'] = y_resampled


In [None]:
del X_resampled
del y_resampled
del X
del y
del smote

## 4.2 Time-Sensitive Sequences

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
final_df = pd.read_csv(df_path + "time_sequences_df.csv")
final_df = final_df.iloc[:,1:]

In [None]:
y = final_df['label']
X = final_df.drop(columns=['label'])
del final_df

In [None]:
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
y_resampled.value_counts()

0    43846
1    43846
Name: label, dtype: int64

In [None]:
X_resampled.head()

Unnamed: 0,idcentro,idana,2,3,4,5,6,7,8,9,...,1575,1576,1577,1578,1579,1580,1581,1582,1583,1584
0,1.0,5.0,0.0,2.0,2.0,9.0,0.0,71.0,1.223619,1.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,36.0,1.0,2.0,3.0,9.0,0.0,86.0,-1.568388,116.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,38.0,1.0,2.0,3.0,9.0,0.0,90.0,-0.802095,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,61.0,1.0,2.0,2.0,4.0,0.0,80.0,1.885222,140.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,65.0,0.0,2.0,1.0,9.0,0.0,75.0,-1.568388,92.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
if save_df:
    final_df = X_resampled.iloc[:,:]
    final_df['label'] = y_resampled
    final_df.to_csv(df_path + "time_smote_seq.csv")
    del final_df

  final_df['label'] = y_resampled


In [None]:
del X_resampled
del y_resampled
del X
del y
del smote

# 5. Train Model

In [None]:
!pip install tensorflow

In [13]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

Model Hyper-Parameters

In [14]:
epochs = 10
batch_size = 32
loss = "sparse_categorical_crossentropy"
optimizer = "adam"
metrics = ["sparse_categorical_crossentropy"]
activation = "softmax"
test_size = 0.2
random_state = 42
neurons = 32

## 5.1 LSTM

### 5.1.1 Train

Load data

In [16]:
sequences = pd.read_csv(df_path + "norm_smote_seq.csv")
sequences = sequences.iloc[:,1:]
features = sequences.drop(columns = ["label"])
label = sequences["label"]
del sequences

Calculates normalization parameters (mean and standard deviation) on training features, Transforms training and test features according to these parameters.

In [17]:
scaler = StandardScaler()
features[features.columns[:]] = scaler.fit_transform(features[features.columns[:]])

20% of the data will be used as a test set, where X are sets of features and y are sets of label

In [18]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_size, random_state=random_state)

In [19]:
del features
del label

transform the training and test data into a structure that can be used as input for an LSTM neural network model

In [20]:
X_train = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))
print("Sequences in test set:", X_test.shape[0])
print("Sequences in train set:", X_train.shape[0])

Sequences in test set: 17571
Sequences in train set: 70281


In [21]:
model = Sequential()
model.add(LSTM(neurons, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(2, activation=activation))

In [22]:
model.compile(loss=loss, optimizer=keras.optimizers.Adam(learning_rate=1e-4), metrics=metrics)

In [None]:
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.save(df_path + 'Model_LSTM.h5')

In [None]:
del model
del history

### 5.1.2 Test

In [23]:
lstm_model = load_model(df_path+'Model_LSTM.h5')

In [24]:
y_pred = lstm_model.predict(X_test)



In [25]:
y_pred = y_pred.argmax(axis=-1)

In [26]:
_, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 69.15%


In [27]:
f_score = f1_score(y_test, y_pred)
print("F-Score: {:.5f}".format(f_score))

F-Score: 0.84895


In [28]:
'''
[[true negative, false positive]
[false negative, true positive]]
'''
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[8354  426]
 [1993 6798]]


## 5.2 T-LSTM

### 5.2.1 Train

load data

In [None]:
sequences = pd.read_csv(df_path + "time_smote_seq.csv")
sequences = sequences.iloc[:,1:]
features = sequences.drop(columns = ["label"])
label = sequences["label"]

In [None]:
del sequences

scale

In [None]:
scaler = StandardScaler()
features[features.columns[:]] = scaler.fit_transform(features[features.columns[:]])

split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_size, random_state=random_state)

In [None]:
X_train = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))
print("Sequences in test set:", X_test.shape[0])
print("Sequences in train set:", X_train.shape[0])

In [None]:
del features
del label

Model

In [None]:
model = Sequential()
model.add(LSTM(neurons, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dense(2, activation=activation))

In [None]:
model.compile(loss=loss, optimizer=keras.optimizers.Adam(learning_rate=1e-4), metrics=metrics)

In [None]:
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.save(df_path + 'Model_T-LSTM.h5')

In [None]:
del model
del history

### 5.2.2 Test

In [29]:
model = load_model(df_path+'Model_T-LSTM.h5')

In [30]:
y_pred = model.predict(X_test)



In [31]:
y_pred = y_pred.argmax(axis=-1)

In [32]:
_, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 52.12%


In [33]:
f_score = f1_score(y_test, y_pred)
print("F-Score: {:.2f}".format(f_score))

F-Score: 0.73


In [34]:
'''
[[true negative, false positive]
[false negative, true positive]]
'''
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[6594 2186]
 [2439 6352]]
