# Text Classification

In [123]:
#Imports

import pandas as pd
import numpy as np
import re
from string import punctuation, whitespace
import nltk
from nltk.corpus import stopwords
from pattern.it import parse, split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [124]:
#Load clinical diaries
# The 2020 export names the author column "MEDICO"; it is renamed to "AUTORE"
# before the 2019 export is stacked underneath it.
df_2020 = pd.read_excel('C:/Users/andrea.foroni/Downloads/DATI PS 2020-CON NOTE DIARIO CLINICO.xlsx')
df_2020 = df_2020.rename(columns={"MEDICO" : "AUTORE"})

df_2019 = pd.read_excel('C:/Users/andrea.foroni/Downloads/DATI PS 2019-CON NOTE DIARIO CLINICO.xlsx')

# pd.concat replaces DataFrame.append, which was deprecated and later removed from pandas.
df = pd.concat([df_2020, df_2019], ignore_index=True)

# Normalise column names to lower case for the rest of the notebook.
df.columns = [str.lower(column) for column in df.columns]

In [125]:
df.head()

Unnamed: 0,numero_pratica,tipo_nota,data_inserimento,testo,autore
0,PS2019071726,Nota clinica,"04/12/2019 12:21:50,759000",si esegue rx endorale che evidenzia la presenz...,DR. TAROZZI MARCO
1,PS2020000210,Nota clinica,"02/01/2020 09:52:44,668000",Previo consenso informato e previa anestesia p...,DR. FERRANDO CESARE
2,PS2020000211,Nota clinica,"02/01/2020 09:30:43,247000","presa visione della opt portata dal pz, previo...",DR. FERRANDO CESARE
3,PS2020000214,Nota clinica,"02/01/2020 11:00:29,157000",Si esegue rx endorale che conferma presenza di...,DR. FERRANDO CESARE
4,PS2020000216,Nota clinica,"02/01/2020 09:31:38,871000",In accordo con il pz si procede alla ricementa...,DR. FERRANDO CESARE


In [126]:
df.shape

(8383, 5)

In [127]:
df.dtypes

numero_pratica      object
tipo_nota           object
data_inserimento    object
testo               object
autore              object
dtype: object

In [128]:
#change datatypes
# Both columns are cast to str in one call; the aggregation step that follows
# joins 'testo' values with ' '.join, which only accepts strings.
df = df.astype({'testo': str, 'numero_pratica': str})

In [129]:
# Concatenate all diary notes of the same visit into one text per 'numero_pratica',
# then drop the group whose key is the literal header string 'NUMERO_PRATICA'.
df = (
    df.groupby(['numero_pratica'], as_index = False)
      .agg({'testo': ' '.join})
      .loc[lambda grouped: grouped['numero_pratica'] != 'NUMERO_PRATICA']
)

In [130]:
df['numero_pratica'].value_counts()

PS2019020546    1
PS2019065962    1
PS2019059917    1
PS2019045544    1
PS2020003411    1
               ..
PS2019054052    1
PS2019058620    1
PS2019035413    1
PS2019029334    1
PS2020015980    1
Name: numero_pratica, Length: 8163, dtype: int64

## 1. Text cleaning and preparation

### 1.1. Special character and punctuation signs cleaning

In [131]:
#Special characters (more than in punctuation list)
# Every character that is neither a word character nor whitespace becomes a single space.
df['testo_clean'] = df['testo'].str.replace(r'[^\w\s]', ' ', regex=True)

In [132]:
#Whitespaces cleaning

# All characters of string.whitespace except the plain space are turned into a space,
# in a single pass using one character class.
whites = list(set(whitespace)-{' '})
df['testo_clean'] = df['testo_clean'].str.replace(
    '[' + ''.join(re.escape(w) for w in whites) + ']', ' ', regex=True
)

### 1.2. Upcase/downcase

In [133]:
df['testo_clean'] = df['testo_clean'].str.lower()

### 1.3. Stop words

In [134]:
#Import stopwords
stop = stopwords.words('italian')

# Remove every stop word that appears as a whole word (\b...\b), replacing it with a space.
# All stop words are combined into one alternation so the column is scanned once instead
# of once per stop word, and regex=True is passed explicitly: without it, pandas versions
# whose default is regex=False treat the "\b" markers as literal text and remove nothing.
if stop:
    stop_pattern = r"\b(?:" + "|".join(re.escape(s) for s in stop) + r")\b"
    df['testo_clean'] = df['testo_clean'].str.replace(stop_pattern, ' ', regex=True)



### 1.4. Remove double spaces

In [135]:
#Remove double spaces
# Any run of one or more spaces is collapsed to exactly one space.
df['testo_clean'] = df['testo_clean'].str.replace(' +', ' ', regex=True)

### 1.5. Lemmatization

I apply lemmatization only (no stemming), because stemming can produce truncated forms that are not real words.

In [136]:
#lemmatize word function
def lemmatize_word(input_word):
    """Return the lemma of a single word using pattern.it's ``parse``.

    The word is passed to ``parse`` with ``tokenize=False`` and ``lemmata=True``.
    The result is split and the element at index 4 of the first token of the
    first sentence is returned.
    """
    # NOTE(review): pattern's parser appears to name these options `tags`/`chunks`;
    # `tag`/`chunk` may be silently ignored -- confirm against the pattern docs.
    parsed = parse(input_word, tokenize=False, tag=False, chunk=False, lemmata=True)
    first_token = parsed.split()[0][0]
    # NOTE(review): index 4 is presumably the lemma slot of the token's tag list
    # (word / POS / chunk / PNP / lemma) -- this depends on which tags parse() emits.
    return first_token[4]

#tokenize sentence (string) 
def tokenize(sentence_totoken):
    """Split a sentence string into word tokens with NLTK's ``word_tokenize``."""
    # NOTE(review): word_tokenize is called with its default language although the
    # diaries are Italian -- confirm whether language='italian' is intended.
    tokens = nltk.tokenize.word_tokenize(sentence_totoken)
    return tokens
    
#tokenize and lemmatize sentences and return string
def lemmatize_sentence(sentence):
    """Tokenize ``sentence``, lemmatize each token and return the lemmas joined by spaces."""
    return " ".join(lemmatize_word(token) for token in tokenize(sentence))
    

In [137]:
# Lemmatize every cleaned diary text (the function is passed directly, no lambda wrapper).
df['testo_clean'] = df['testo_clean'].apply(lemmatize_sentence)

In [138]:
df.head()

Unnamed: 0,numero_pratica,testo,testo_clean
1,PS2019000196,Si esegue Rx endorale da cui si evidenzia cari...,eseguire rx endorala evidenziare carie mesiala...
2,PS2019000197,si esegue rx endorale di controllo. Si spiega ...,eseguire rx endorala controllo spiegare pazien...
3,PS2019000198,eseguita rx endorale che conferma quadro clini...,eseguitare rx endorala conferma quadro clinico...
4,PS2019000199,Eseguita rx endorale che evidenzia estensione ...,eseguitare rx endorala evidenziare estensione ...
5,PS2019000200,Si eseguono rx endorali che non evidenziano ri...,eseguire rx endorale evidenziare rima frattura...


In [139]:
#Load categorized and uncategorized 
# The 'Data Base' sheet is read from disk once; db_originale is an independent copy of it,
# kept aside before df_classified is filtered and merged in the following cells.
df_classified =pd.read_excel('C:/Users/andrea.foroni/Downloads/Odontoiatria PS 2019-2020_21-02_05-05.xlsx',sheet_name= 'Data Base')
df_classified.columns = map(str.lower, df_classified.columns)

db_originale = df_classified.copy()

In [140]:
df_classified.shape

(3416, 21)

In [141]:
#Select only relevant columns
df_classified = df_classified[[
    'anno',
    'mese',
    'data_accettazione',
    'modalita_dimissione',
    'numero_sdo_ricetta',
    'eta_pz',
    'adulti-bambini',
    'data_ora_ingresso_ps',
    'data_ora_uscita_ps',
    'diagnosi_principale',
    'diagnosi_1',
    'pz_raggruppamento_residenza',
]]

In [142]:
#Remove duplicates and reindex
# Step 1: keep one copy of each fully identical row.
df_classified = df_classified.drop_duplicates()
# Step 2: renumber the index from 0, discarding the old index.
df_classified = df_classified.reset_index(drop=True)

In [143]:
#Load aggregated diagnosis
# Lookup table read from Excel, with column names lower-cased on load.
diagnosi_aggr = (
    pd.read_excel('C:/Users/andrea.foroni/Downloads/diagnosi_aggr.xlsx')
      .rename(columns=str.lower)
)

diagnosi_aggr.head()

Unnamed: 0,diagnosi_principale,motivo_accesso
0,V722 - VISITA ODONTOIATRICA,VISITA
1,V6759 - ALTRA VISITA DI CONTROLLO,VISITA
2,V523 - COLLOCAZIONE E SISTEMAZIONE DI PROTESI ...,ALTRO
3,5206 - DISTURBI DELLERUZIONE DEL DENTE,ALTRO
4,"52879 - ALTRI DISTURBI DELLEPITELIO ORALE, INC...",ALTRO


In [144]:
#Join full database with aggregated diagnosis

# Left join: every row of df_classified is kept; rows whose diagnosis is not in the
# lookup table get NaN in the columns coming from diagnosi_aggr.
df_classified = df_classified.merge(
    diagnosi_aggr,
    how='left',
    on='diagnosi_principale',
)
df_classified.shape

(2252, 13)

In [145]:
#Join full database with clinical diary

# Left join on the visit identifier, which is called 'numero_sdo_ricetta' in the
# classified database and 'numero_pratica' in the diary frame.
df_classified = df_classified.merge(
    df,
    how='left',
    left_on='numero_sdo_ricetta',
    right_on='numero_pratica',
)


In [146]:
df_classified.head()

Unnamed: 0,anno,mese,data_accettazione,modalita_dimissione,numero_sdo_ricetta,eta_pz,adulti-bambini,data_ora_ingresso_ps,data_ora_uscita_ps,diagnosi_principale,diagnosi_1,pz_raggruppamento_residenza,motivo_accesso,numero_pratica,testo,testo_clean
0,2019,2,2019-02-21,1 - Dimissione ordinaria al domicilio del pazi...,PS2019011263,24,ADULTO,2019-02-21 08:52:58,2019-02-21 10:00:00,5253 - RADICE DENTARIA RITENUTA,5253 - RADICE DENTARIA RITENUTA,1 - IN ASL,ALTRO,PS2019011263,pz in ps per algia I quadrante.All'eo evidenzi...,pz ps algia quadrante eo evidenziare residuo r...
1,2019,2,2019-02-21,1 - Dimissione ordinaria al domicilio del pazi...,PS2019011267,44,ADULTO,2019-02-21 08:59:37,2019-02-21 09:23:00,V722 - VISITA ODONTOIATRICA,V722 - VISITA ODONTOIATRICA,1 - IN ASL,VISITA,PS2019011267,Consultata opt eseguita durante accesso in ps ...,consultatare opt eseguitare durante accesso ps...
2,2019,2,2019-02-21,1 - Dimissione ordinaria al domicilio del pazi...,PS2019011268,28,ADULTO,2019-02-21 09:02:33,2019-02-21 10:02:00,52102 - CARIE DENTALE ESTESA ALLA DENTINA,52102 - CARIE DENTALE ESTESA ALLA DENTINA,1 - IN ASL,CARIE,PS2019011268,eseguita OPT dalla quale si evidenzia vicinanz...,eseguitare opt evidenziare vicinanza nare rima...
3,2019,2,2019-02-21,1 - Dimissione ordinaria al domicilio del pazi...,PS2019011281,32,ADULTO,2019-02-21 09:37:42,2019-02-21 11:17:00,5253 - RADICE DENTARIA RITENUTA,5253 - RADICE DENTARIA RITENUTA,2 - IN REGIONE,ALTRO,PS2019011281,Eseguita rx endorale che conferma diagnosi cli...,eseguitare rx endorala conferma diagnosi clini...
4,2019,2,2019-02-21,1 - Dimissione ordinaria al domicilio del pazi...,PS2019011283,66,ADULTO,2019-02-21 09:53:29,2019-02-21 10:44:00,52109 - CARIE DENTALE,52109 - CARIE DENTALE,1 - IN ASL,CARIE,PS2019011283,eseguita rx endorale II q. si evidenzia lesion...,eseguitare rx endorala ii q evidenziare lesion...


In [147]:
df_classified.dtypes

anno                                    int64
mese                                    int64
data_accettazione              datetime64[ns]
modalita_dimissione                    object
numero_sdo_ricetta                     object
eta_pz                                  int64
adulti-bambini                         object
data_ora_ingresso_ps           datetime64[ns]
data_ora_uscita_ps             datetime64[ns]
diagnosi_principale                    object
diagnosi_1                             object
pz_raggruppamento_residenza            object
motivo_accesso                         object
numero_pratica                         object
testo                                  object
testo_clean                            object
dtype: object

In [148]:
df_classified.shape

(2252, 16)

In [149]:
df_classified['diagnosi_principale'].value_counts()

V722 - VISITA ODONTOIATRICA                       501
52100 - CARIE DENTALE NON SPECIFICATA             249
5226 - PERIODONTITE CRONICA APICALE               184
52109 - CARIE DENTALE                             157
68100 - FLEMMONE E ASCESSO,NON SPECIFICATO        156
                                                 ... 
3501 - NEVRALGIA DEL TRIGEMINO                      1
72885 - CONTRATTURA MUSCOLARE                       1
52104 - CARIE DENTALE ARRESTATA                     1
52332 - PERIODONTITE AGGRESSIVA, GENERALIZZATA      1
118 - MICOSI DA PATOGENI FACOLTATIVI                1
Name: diagnosi_principale, Length: 86, dtype: int64

In [150]:
df_classified['motivo_accesso'].value_counts(normalize=True)

CARIE             0.279301
VISITA            0.237713
ASCESSO           0.210302
PARODONTOPATIE    0.175803
ALTRO             0.058601
TRAUMA            0.035917
ATM               0.002363
Name: motivo_accesso, dtype: float64

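CARIE è il motivo di accesso più frequente, pari a circa il 28% degli accessi con motivo noto (VISITA inclusa). Poiché VISITA e i record senza motivo vengono esclusi prima dell'addestramento, sul dataset effettivamente usato per il modello (1613 record) CARIE pesa circa il 37%: un baseline classifier che assegna sempre CARIE come motivo di accesso a PS ha quindi un'accuratezza di circa il 37%.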


## 2. Encode Labels

In [151]:
# Integer code for each access reason, assigned in the listed order (0..5).
# 'VISITA' is deliberately absent from this list.
accesso_codes = {
    label: code
    for code, label in enumerate(
        ('ALTRO', 'ASCESSO', 'ATM', 'CARIE', 'PARODONTOPATIE', 'TRAUMA')
    )
}

In [152]:
# Category mapping
# 'accesso_code' starts as a copy of 'motivo_accesso'; the labels listed in
# accesso_codes are then replaced by their integer codes. Values not present in the
# mapping are left unchanged by replace().
df_classified = (
    df_classified
    .assign(accesso_code=df_classified['motivo_accesso'])
    .replace({'accesso_code':accesso_codes})
)

In [153]:
# Create file with uncategorized data
# Uncategorized = rows still labelled 'VISITA' or rows with no access code at all.
df_toclassify = df_classified[
    (df_classified['accesso_code']=='VISITA') | df_classified['accesso_code'].isnull()
]

df_toclassify.to_csv('C:/Users/andrea.foroni/Downloads/visite.csv', index=False)

In [154]:
#remove visita

# Keep only rows that have an access code and whose code is not the 'VISITA' label.
df_classified = df_classified[
    (df_classified['accesso_code']!='VISITA') & df_classified['accesso_code'].notnull()
]

In [155]:
df_classified.shape

(1613, 17)

In [156]:
df_classified.dtypes

anno                                    int64
mese                                    int64
data_accettazione              datetime64[ns]
modalita_dimissione                    object
numero_sdo_ricetta                     object
eta_pz                                  int64
adulti-bambini                         object
data_ora_ingresso_ps           datetime64[ns]
data_ora_uscita_ps             datetime64[ns]
diagnosi_principale                    object
diagnosi_1                             object
pz_raggruppamento_residenza            object
motivo_accesso                         object
numero_pratica                         object
testo                                  object
testo_clean                            object
accesso_code                           object
dtype: object

## 3. Train - test split
We'll set apart a test set to prove the quality of our models. We'll do Cross Validation in the train set in order to tune the hyperparameters and then test performance on the unseen data of the test set.

In [162]:
# Features are the cleaned diary texts; targets are the access codes cast to int64.
# 15% of the rows are held out as the test set, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_classified['testo_clean'],
    df_classified['accesso_code'].astype('int64'),
    test_size=0.15,
    random_state=8,
)

In [163]:
# Parameter election
# These four values are passed unchanged to TfidfVectorizer in the next cell.
ngram_range = (1, 2)
min_df = 0.01
max_df = 1.0
max_features = 10_000

In [164]:
# lowercase=False and stop_words=None: lower-casing and stop-word removal were already
# applied to 'testo_clean' in section 1.
tfidf = TfidfVectorizer(
    encoding='utf-8',
    lowercase=False,
    stop_words=None,
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
    sublinear_tf=True,
    norm='l2',
)

# The vocabulary and IDF weights are fitted on the training texts only.
labels_train = y_train
features_train = tfidf.fit_transform(X_train.values.astype('U')).toarray()
print(features_train.shape)

# The test texts are projected onto the vocabulary fitted above.
labels_test = y_test
features_test = tfidf.transform(X_test.values.astype('U')).toarray()
print(features_test.shape)

(1371, 741)
(242, 741)


In [165]:
from sklearn.feature_selection import chi2

# The vocabulary does not change between categories, so it is looked up once.
# get_feature_names_out() is the current scikit-learn accessor; get_feature_names()
# is kept as a fallback for older installations that do not provide it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.array(tfidf.get_feature_names())

# For each access reason, run a one-vs-rest chi-squared test of every TF-IDF feature
# against "label == this code" and print the highest-scoring unigrams and bigrams.
for motivo_accesso, accesso_code in sorted(accesso_codes.items()):
    features_chi2 = chi2(features_train, labels_train == accesso_code)
    # features_chi2[0] holds the chi2 statistics; argsort is ascending, so the most
    # associated terms are at the end of feature_names.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(motivo_accesso))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'ALTRO' category:
  . Most correlated unigrams:
. clorexidina
. visita
. orala
. rivalutazione
. corona
  . Most correlated bigrams:
. presso reparto
. sciacquo clorexidina

# 'ASCESSO' category:
  . Most correlated unigrams:
. ascessuala
. ascesso
. ml
. drenaggio
. ceftriaxona
  . Most correlated bigrams:
. lesione radiotrasparente
. lavaggio ipoclorire

# 'ATM' category:
  . Most correlated unigrams:
. altro
. clinico
. quadro
. 18
. assumere
  . Most correlated bigrams:
. quadro clinico
. pz riferire

# 'CARIE' category:
  . Most correlated unigrams:
. cariosa
. conservativa
. destruente
. 75
. carie
  . Most correlated bigrams:
. carie destruente
. evidenziare carie

# 'PARODONTOPATIE' category:
  . Most correlated unigrams:
. parodontala
. perdita
. riassorbimento
. supporto
. osseo
  . Most correlated bigrams:
. perdita supporto
. riassorbimento osseo

# 'TRAUMA' category:
  . Most correlated unigrams:
. palatala
. valutare
. frammento
. coronala
. frattura
  . Most correlated

In [166]:
# Persist the split, the labelled frame, the feature matrices and the fitted
# vectorizer, one pickle file per object, all in the same directory.
PICKLE_DIR = 'C:/Users/andrea.foroni/Documents/myprojects/NLP/Pickles'

# File stem -> object; each entry is written to '<PICKLE_DIR>/<stem>.pickle'.
objects_to_pickle = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'df': df_classified,
    'features_train': features_train,
    'labels_train': labels_train,
    'features_test': features_test,
    'labels_test': labels_test,
    'tfidf': tfidf,  # TF-IDF object
}

for pickle_name, obj in objects_to_pickle.items():
    with open('{}/{}.pickle'.format(PICKLE_DIR, pickle_name), 'wb') as output:
        pickle.dump(obj, output)