/*
Welcome to this notebook! Here we extract IC50/Ki values and their corresponding categories from the text, and combine them with the extracted genes to output a tsv file with extracted information for each id.
*/

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import collections
import re
import json

In [2]:
# NOTE(review): absolute, machine-specific path; consider making it configurable.
simple_home = '/Users/Arushi/Desktop/twoXAR/deep-learn-bio-nlp-master'
# Load every line of the gene-mention evaluation output. The context manager
# closes the handle as soon as the lines are read (it was previously left open).
with open('{0}/simple_eval.eval'.format(simple_home), 'r') as file:
    gene_output = file.readlines()

In [3]:
# Load the tab-separated input table. It has no header row, so columns are
# addressed by integer position (0, 1, 2, ...).
df = pd.read_csv(filepath_or_buffer="parsed_mce_simple.tsv", delimiter="\t", header=None)

In [4]:
# English stop words held in a set so the membership checks in ie_preprocess are cheap.
stop_words = {stop_word for stop_word in stopwords.words('english')}

In [5]:
def ie_preprocess(document):
    """Tokenize a document, drop noise tokens and POS-tag what remains.

    The document is split into sentences and then into word tokens. Tokens
    found in the module-level ``stop_words`` set, and tokens that occur inside
    ``string.punctuation``, are discarded. The surviving tokens of all
    sentences are tagged together as one flat sequence.

    Args:
        document: the text to process.

    Returns:
        The result of ``nltk.pos_tag`` on the flat list of kept tokens.
    """
    kept_tokens = []
    for sentence_text in nltk.sent_tokenize(document):
        for token in nltk.word_tokenize(sentence_text):
            if token in stop_words:
                continue
            # string.punctuation is a str, so this is a substring check.
            if token in string.punctuation:
                continue
            kept_tokens.append(token)
    return nltk.pos_tag(kept_tokens)

In [6]:
# Column 2 of the input table, as a plain list; these entries are the texts
# that the cleaning step iterates over.
preprocess_list = [text for text in df[2]]

In [7]:
'''
Cleaning the data. The regex catches comma-separated lists written without an Oxford comma before the final
"and"/"or" and inserts one, to prevent averaging between two dosage values. In addition, slashes are replaced with
commas for cleaner separation, and the special character µ becomes u.
'''
preprocessed = []
# Groups: (1) first item, (2) one or more ", item" continuations, (3) the
# conjunction, (4) the last item.
p = re.compile(r'([\w\d\.]+)((?:,\s*[\w\d\.]+)+)\s+(and|or)\s+([\w\d\.]+)')
for sentence in preprocess_list:
    # Rebuild each match from its captured groups with the comma inserted.
    # Doing it in one substitution avoids reusing the matched text as a regex
    # pattern, leaves words that merely start with "and" untouched, and covers
    # the "or" case that the pattern captures.
    sentence = p.sub(r'\1\2, \3 \4', sentence)
    sentence = sentence.replace('/', ',')
    # Normalise both the micro sign (U+00B5) and the Greek letter mu (U+03BC).
    sentence = re.sub(u'[\u00B5\u03BC]', 'u', sentence)
    preprocessed.append(ie_preprocess(sentence))

In [8]:
'''
Use NLTK pos tagging to identify the category of each dosage value.
'''
def _extract_dosage_triples(tagged):
    """Collect (category, number, unit) candidates from one tagged token list.

    For every token tagged ``CD`` the first later token tagged ``NN``, ``RB``,
    ``NNS`` or ``JJ`` is taken as its unit, and the closest earlier token
    tagged ``NNP`` is taken as its category. A number is skipped when either
    of the two cannot be found.

    Args:
        tagged: list of ``(word, tag)`` pairs, as returned by ie_preprocess.

    Returns:
        List of ``(category_word, number_word, unit_word)`` tuples, in the
        order the numbers appear.
    """
    unit_tags = ('NN', 'RB', 'NNS', 'JJ')
    triples = []
    for index, (word, label) in enumerate(tagged):
        if label != 'CD':
            continue
        # First token after the number whose tag is one of the accepted unit tags.
        unit_word = next(
            (tagged[pos][0] for pos in range(index + 1, len(tagged))
             if tagged[pos][1] in unit_tags),
            None,
        )
        if unit_word is None:
            continue
        # Closest proper noun before the number. The number itself is tagged
        # CD, so starting the scan one position earlier is equivalent.
        category_word = next(
            (tagged[pos][0] for pos in range(index - 1, -1, -1)
             if tagged[pos][1] == 'NNP'),
            None,
        )
        if category_word is None:
            continue
        triples.append((category_word, word, unit_word))
    return triples


# One list of triples per preprocessed text, in the same order as `preprocessed`.
total = [_extract_dosage_triples(sentence) for sentence in preprocessed]

In [9]:
'''
Maps each id (column 0 of the input table) to the extractions found for the same row above.
'''
ids = df[0]
# ids and total are index-aligned: entry i of total was extracted from row i.
total_ = {ids[position]: total[position] for position in range(len(ids))}

In [10]:
'''
Removes all occurrences with a category not in the specified list. This part could definitely be improved, since it's
probable that there are some entries such that the dosage is correct but the categorical value found is incorrect.
'''
# Exact-match whitelist. A set is used because `word in 'IC50 IC50s ...'` is a
# substring test on a str, which also lets fragments such as "I", "C", "50" or
# "K" through.
valid_categories = {'IC50', 'IC50s', 'EC50', 'EC50s', 'Ki', 'Kis', 'Kd', 'Kds'}
# Each item is a (category, number, unit) tuple; keep it only when the category
# is one of the accepted names. Ids with no surviving items map to [].
cleaned = {
    key: [item for item in value if item[0] in valid_categories]
    for key, value in total_.items()
}

In [11]:
'''
Collects the gene names per id from the output file produced in deep-learn-bio-nlp-master/BC2 Gene Mention Example.ipynb.
'''
d = collections.defaultdict(list)
for line in gene_output:
    quote_parts = line.split('"')
    # The text before the first double quote is used as the id.
    record_id = quote_parts[0].strip()
    # The third "|"-separated field of the quoted part is used as the gene name.
    gene_name = quote_parts[1].split("|")[2].strip()
    d[record_id].append(gene_name)

In [12]:
'''
This step could also be improved. Here we basically correspond each gene name to the IC50/Ki value by going in order
until one or the other has been exhausted. It's likely that some repeated values are being included and others are
being excluded.
'''
rows_list = []
for id_, genes in d.items():
    # Looked up once per id rather than once per gene. An id missing from
    # `cleaned` still raises KeyError, as before.
    amounts = cleaned[id_]
    # zip stops at the shorter of the two lists, which replaces the previous
    # catch-IndexError-and-continue loop with the same pairing.
    for gene, (category, number, unit) in zip(genes, amounts):
        rows_list.append({
            'id': id_,
            'gene': gene,
            'category': category,
            # number and unit are token strings; they are joined without a separator.
            'value': number + unit,
        })

In [13]:
# One row per dict in rows_list (keys: id, gene, category, value).
dataframe = pd.DataFrame(data=rows_list)

In [14]:
# Write the result as a tab-separated file without the row index.
dataframe.to_csv(path_or_buf="output.tsv", sep="\t", index=False)