# Label resolver development

In [1]:
import csv
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from scipy import sparse as sp_sparse
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC

In [2]:
word2label_file = '/home/anjani/distant-PICO/CandidateGeneration/ResultInspection/resolve_annot_corpus_temp.tsv'

In [3]:
total_words = []
with open(word2label_file, 'r') as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        #print(row[1:])
        removeSpecialChars = re.sub("[!@#$%^&*()[]{};:,./<>?\|`~-=_+]", " ", row[1])
        total_words.append( removeSpecialChars.lower() )

In [4]:
len(set(total_words))

77709

In [5]:
data = pd.read_csv(word2label_file, sep='\t', error_bad_lines=False, header=None)
print('Shape of the data: ', data.shape)
data.head()

Shape of the data:  (286890, 3)




  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,0,1,2
0,0,Infant,1
1,1,Child,1
2,2,Retinoblastoma,1
3,3,25,1
4,4,carboplatin,2


In [6]:
data[2].head()

0    1
1    1
2    1
3    1
4    2
Name: 2, dtype: int64

In [7]:
X, X_test, y, y_test = train_test_split(data[1], data[2], test_size=0.2, train_size=0.8, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size = 0.25,train_size =0.75)

print(X_train[:1])
print(y_train[:1])

print(X_val[:1])
print(y_val[:1])

192527    30 Second Chair Stand Test
Name: 1, dtype: object
192527    3
Name: 2, dtype: int64
56068    Hyperbaric oxygen treatment
Name: 1, dtype: object
56068    2
Name: 2, dtype: int64


In [8]:
REPLACE_BY_SPACE_RE = re.compile('[!@#$%^&*()[]{};:,./<>?\|`~=_+]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = REPLACE_BY_SPACE_RE.sub(' ', str(text))# replace REPLACE_BY_SPACE_RE symbols by space in text
    text = ' '.join([w for w in text.split() if not w in STOPWORDS])# delete stopwords from text
    return text.lower()

X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_train[:3]

['30 second chair stand test', 'tapinarof cream, 1%', 'child discomfort']

In [9]:
# Dictionary of all words from train corpus with their counts.
words_counts = {}
for comments in X_train:
    for word in comments.split():
        if word not in words_counts:
            words_counts[word] = 1
        words_counts[word] += 1
        
DICT_SIZE = 30000
POPULAR_WORDS = sorted(words_counts, key=words_counts.get, reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {key: rank for rank, key in enumerate(POPULAR_WORDS, 0)}
INDEX_TO_WORDS = {index:word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [10]:
def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
print('X_train shape ', X_train_mybag.shape, '\nX_val shape ', X_val_mybag.shape)

X_train shape  (172134, 30000) 
X_val shape  (57378, 30000)


In [11]:
POPULAR_WORDS[:10]

['adult',
 'female',
 'male',
 'randomized',
 'women',
 'years',
 'randomized,',
 'placebo',
 'child',
 'cancer']

In [12]:
def train_classifier(X_train, y_train, C, regularisation):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.
    model = OneVsRestClassifier(LogisticRegression(penalty=regularisation, C=C, max_iter=10000)).fit(X_train, y_train)
    return model

classifier_mybag = train_classifier(X_train_mybag, y_train, C = 4, regularisation = 'l2')

y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
y_val_predicted_labels_mybag_proba = classifier_mybag.predict_proba(X_val_mybag)

In [13]:
#classifier_mybag = train_classifier(X_train_mybag, y_train, C = 50, regularisation = 'l2')

#y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)

In [14]:
def print_evaluation_scores(y_test, predicted):
    
    print('Accuracy: ', accuracy_score(y_test, predicted, normalize=True))
    print('F1-score macro: ', f1_score(y_test, predicted, average='macro'))
    print('F1-score micro: ', f1_score(y_test, predicted, average='micro'))
    print('F1-score weighted: ', f1_score(y_test, predicted, average='weighted'))
    print(metrics.classification_report(y_test, predicted))
    
print('Bag-of-words\n')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)

Bag-of-words

Accuracy:  0.9310362856844087
F1-score macro:  0.9336751877555489
F1-score micro:  0.9310362856844087
F1-score weighted:  0.9308860345502408
              precision    recall  f1-score   support

           1       0.96      0.95      0.95     27481
           2       0.89      0.94      0.91     14919
           3       0.90      0.84      0.87     10932
           4       1.00      0.99      1.00      4046

    accuracy                           0.93     57378
   macro avg       0.94      0.93      0.93     57378
weighted avg       0.93      0.93      0.93     57378



In [15]:
import pickle

In [16]:
s = pickle.dumps(classifier_mybag)