# Data Preparation

becareful editing this file since this will be the notebook for initial data preparation for all the models

# Setup

In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv

import tensorflow as tf
import gensim

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

random_state_number = 967898

Using TensorFlow backend.


In [2]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
get_available_gpus()

['/gpu:0', '/gpu:1']

In [3]:
%pylab
%matplotlib inline
%load_ext autoreload

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
color = sns.color_palette()

# Data

In [5]:
train_variants_df = pd.read_csv("dataset/stage1/training_variants")
test_variants_df = pd.read_csv("dataset/stage1/test_variants")

train_text_df = pd.read_csv("dataset/stage1/training_text", sep="\|\|", engine='python', 
                            header=None, skiprows=1, names=["ID","Text"])

test_text_df = pd.read_csv("dataset/stage1/test_text", sep="\|\|", engine='python', 
                           header=None, skiprows=1, names=["ID","Text"])
print("Train and Test variants shape : ",train_variants_df.shape, test_variants_df.shape)
print("Train and Test text shape : ",train_text_df.shape, test_text_df.shape)

Train and Test variants shape :  (3321, 4) (5668, 3)
Train and Test text shape :  (3321, 2) (5668, 2)


In [6]:
train_df = pd.merge(train_variants_df, train_text_df, how='left', on='ID')
train_df.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [7]:
test_df = pd.merge(test_variants_df, test_text_df, how='left', on='ID')
test_df.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [8]:
import missingno as msno
missing_val_cols = train_variants_df.columns[train_variants_df.isnull().any()].tolist()
sorted(missing_val_cols)
if len(missing_val_cols) != 0:
    msno.bar(train_variants_df[missing_val_cols],figsize=(20,8),fontsize=12,labels=True,)
else:
    print("NO MISSING DATA ..... dream come true :)")

NO MISSING DATA ..... dream come true :)


# Text Processing

## pre processing data frames

all processing is done by fasttext, you just sit and enjoy

create a text file with all the text data

In [9]:
%autoreload

# used to identify the words inside braces for identifying some unwanted strings
# from nltk.tokenize import SExprTokenizer

# used to make sentences
from nltk.tokenize import PunktSentenceTokenizer

#used to collect words in the sentences
from utils import custom_word_tokenizer, apply_custom_regx
from nltk import sent_tokenize, word_tokenize

## cleaning characters

![ascii-cheat-sheet](imgs/ascii-cheat-sheet.png)

In [10]:
#using just printable characters
undesirable_ascii_characters = list(range(32))
undesirable_ascii_characters.remove(10) #keep new line since this might be used for sentence tokenizer
undesirable_charmap = dict.fromkeys(undesirable_ascii_characters)

perform unicode transformation

In [11]:
train_df.Text = train_df.Text.apply(lambda s: s.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore'))
test_df.Text = test_df.Text.apply(lambda s: s.lower().encode('ascii', 'ignore').decode('utf-8', 'ignore'))

In [12]:
train_df.Text = train_df.Text.apply(lambda s: str(s).translate(undesirable_charmap))
test_df.Text = test_df.Text.apply(lambda s: str(s).translate(undesirable_charmap))

In [13]:
train_df.Gene = train_df.Gene.apply(lambda s: s.lower().encode('utf-8', 'ignore').decode('utf-8', 'ignore'))
test_df.Gene = test_df.Gene.apply(lambda s: s.lower().encode('utf-8', 'ignore').decode('utf-8', 'ignore'))

In [14]:
train_df.Gene = train_df.Gene.apply(lambda s: str(s).translate(undesirable_charmap))
test_df.Gene = test_df.Gene.apply(lambda s: str(s).translate(undesirable_charmap))

In [15]:
train_df.Variation = train_df.Variation.apply(lambda s: s.lower().encode('utf-8', 'ignore').decode('utf-8', 'ignore'))
test_df.Variation = test_df.Variation.apply(lambda s: s.lower().encode('utf-8', 'ignore').decode('utf-8', 'ignore'))

In [16]:
train_df.Variation = train_df.Variation.apply(lambda s: str(s).translate(undesirable_charmap))
test_df.Variation = test_df.Variation.apply(lambda s: str(s).translate(undesirable_charmap))

## custom cleaning

In [17]:
train_df.Text = train_df.Text.apply(lambda s: apply_custom_regx(s))
test_df.Text = test_df.Text.apply(lambda s: apply_custom_regx(s))

based on manual reveiw on data

In [18]:
train_df.Text = train_df.Text.apply(lambda s: s.replace('\\t', ''))
test_df.Text = test_df.Text.apply(lambda s: s.replace('\\t', ''))

## tokening

In [19]:
train_df['Sentences'] = train_df.Text.apply(lambda s: sent_tokenize(s))
test_df['Sentences'] = test_df.Text.apply(lambda s: sent_tokenize(s))

In [20]:
train_df.Sentences = train_df.Sentences.apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])
test_df.Sentences = test_df.Sentences.apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])

In [21]:
train_df.Gene = train_df.Gene.apply(lambda s: word_tokenize(s))
test_df.Gene = test_df.Gene.apply(lambda s: word_tokenize(s))

In [22]:
train_df.Variation = train_df.Variation.apply(lambda s: word_tokenize(s))
test_df.Variation = test_df.Variation.apply(lambda s: word_tokenize(s))

## cleaning in word level

removing special characters from the beginning and the end
eg.words:

'.black-color',
 '0.0136*',
 '-c-kit',
 '..4',
 '.01this',
 

In [23]:
clean_start_end = lambda doc: [[re.sub('^[^a-zA-z0-9]*|[^a-zA-Z0-9]*$','',w) for w in sent] for sent in doc]
train_df.Sentences = train_df.Sentences.apply(clean_start_end)
test_df.Sentences = test_df.Sentences.apply(clean_start_end)

## saving data frame

In [24]:
train_df.drop(["Text"], axis=1, inplace=True)
test_df.drop(["Text"], axis=1, inplace=True)

save the pandas processed frame

In [25]:
train_df.head()

Unnamed: 0,ID,Gene,Variation,Class,Sentences
0,0,[fam58a],"[truncating, mutations]",1,"[[cyclin-dependent, kinases, , cdks, , regulat..."
1,1,[cbl],[w802*],2,"[[abstract, background, non-small, cell, lung,..."
2,2,[cbl],[q249e],2,"[[abstract, background, non-small, cell, lung,..."
3,3,[cbl],[n454d],3,"[[recent, evidence, has, demonstrated, that, a..."
4,4,[cbl],[l399v],4,"[[oncogenic, mutations, in, the, monomeric, ca..."


In [26]:
test_df.head()

Unnamed: 0,ID,Gene,Variation,Sentences
0,0,[acsl4],[r570s],"[[2, this, mutation, resulted, in, a, myelopro..."
1,1,[naglu],[p521l],"[[abstract, the, large, tumor, suppressor, 1, ..."
2,2,[pah],[l333f],"[[vascular, endothelial, growth, factor, recep..."
3,3,[ing1],[a148d],"[[inflammatory, myofibroblastic, tumor, , imt,..."
4,4,[tmem216],[g77a],"[[abstract, retinoblastoma, is, a, pediatric, ..."


In [27]:
store = pd.HDFStore('processed/stage1/data_frames.h5')
store['train_df'] = train_df
store['test_df'] = test_df
store.close()

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['Gene', 'Variation', 'Sentences']]

  exec(code_obj, self.user_global_ns, self.user_ns)


## load data frames

In [28]:
store = pd.HDFStore('processed/stage1/data_frames.h5')
train_df = store['train_df']
test_df = store['test_df']
store.close()

In [29]:
display(train_df.head())
display(test_df.head())

Unnamed: 0,ID,Gene,Variation,Class,Sentences
0,0,[fam58a],"[truncating, mutations]",1,"[[cyclin-dependent, kinases, , cdks, , regulat..."
1,1,[cbl],[w802*],2,"[[abstract, background, non-small, cell, lung,..."
2,2,[cbl],[q249e],2,"[[abstract, background, non-small, cell, lung,..."
3,3,[cbl],[n454d],3,"[[recent, evidence, has, demonstrated, that, a..."
4,4,[cbl],[l399v],4,"[[oncogenic, mutations, in, the, monomeric, ca..."


Unnamed: 0,ID,Gene,Variation,Sentences
0,0,[acsl4],[r570s],"[[2, this, mutation, resulted, in, a, myelopro..."
1,1,[naglu],[p521l],"[[abstract, the, large, tumor, suppressor, 1, ..."
2,2,[pah],[l333f],"[[vascular, endothelial, growth, factor, recep..."
3,3,[ing1],[a148d],"[[inflammatory, myofibroblastic, tumor, , imt,..."
4,4,[tmem216],[g77a],"[[abstract, retinoblastoma, is, a, pediatric, ..."


## generate wordidx, vocab_list

### words

In [30]:
train_words = train_df.Sentences.apply(lambda ll: list(itertools.chain.from_iterable(ll)))
train_words = list(itertools.chain.from_iterable(train_words))
len(set(train_words))

286977

In [31]:
test_words = test_df.Sentences.apply(lambda ll: list(itertools.chain.from_iterable(ll)))
test_words = list(itertools.chain.from_iterable(test_words))
len(set(test_words))

343861

In [32]:
train_variations = set(list(itertools.chain.from_iterable(train_df.Variation)))
len(train_variations)

3018

In [33]:
test_variations = set(list(itertools.chain.from_iterable(test_df.Variation)))
len(test_variations)

5634

In [34]:
train_genes = set(list(itertools.chain.from_iterable(train_df.Gene)))
len(train_genes)

264

In [35]:
test_genes = set(list(itertools.chain.from_iterable(test_df.Gene)))
len(test_genes)

1397

vocab_words and vocab_wordidx

In [36]:
vocab_words = list(set(train_words) | set(test_words) | train_variations | test_variations | train_genes | test_genes)
len(vocab_words)

352215

add extra words such as start/end of sentence

In [37]:
vocab_words.append("<UNK>")
vocab_words.append("<SOSent>")
vocab_words.append("<EOSent>")
vocab_words.append("<SODoc>")
vocab_words.append("<EODoc>")
len(vocab_words)

352220

In [38]:
vocab_wordidx = {w:i for i,w in enumerate(vocab_words)}
len(vocab_wordidx)

352220

In [39]:
with open('processed/stage1/vocab_words_wordidx.pkl', 'wb') as f:
    pickle.dump((vocab_words, vocab_wordidx), f, protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
with open('processed/stage1/vocab_words_wordidx.pkl', 'rb') as f:
    (vocab_words1, vocab_wordidx1) = pickle.load(f)
len(vocab_words1), len(vocab_wordidx1)

(352220, 352220)

In [45]:
len(train_words), len(test_words)

(36141974, 54273869)

In [42]:
type(test_words[0])

str

In [46]:
with open('processed/stage1/all_text.txt', 'w') as f:
    f.write(" ".join(train_words))
    f.write(" ")
    f.write(" ".join(test_words))
    

## verifying words and text

In [66]:
set(vocab_words1) - set(vocab_words)

{'.hepatic',
 '.h3k27ac',
 'ln2/',
 "'.",
 'powerpointxrcc2/',
 '10p12.33p12.2',
 '~500x',
 'p.gln76x',
 'anchorage-',
 '.references',
 'bm/',
 '.setd2/set2',
 '.single-nucleus',
 'pylori_',
 '.polyclonal',
 'cspv',
 '.4575dela',
 '.tsq-vantage',
 '1.3-',
 'ma-',
 '.677',
 '.large',
 '.078',
 '.cofilin',
 'u-',
 '.constitutional',
 'repoch+',
 '84.',
 '-gcaatatcagccttaggtgcggctc-3',
 '.fletcher',
 'pan-',
 '.yoshimoto',
 '4.3425',
 '//www.uib.no/aasland/chrab/',
 '-cd4',
 "'intermediate",
 '/nf-b',
 '3inst',
 '//exac.broadinstitute.org/',
 '4001600',
 '3538.',
 'g0-',
 '-d910a1/-d910a',
 'mcf10atp53+/+',
 '55+a171',
 'anti-ib-',
 "'kinome",
 '|il',
 'q572*',
 '.anti-bach1',
 'tcga-az-6598-01',
 '.mzb1',
 '81.24',
 '.sozzi',
 '.termination',
 '.very',
 '-tropomyosin',
 '1.0.',
 '4552045880',
 '.cullins-2',
 '.km12',
 'osterix+',
 '.engelman',
 '90.',
 '//www.ncbi.nlm.nih.gov/ncicgap',
 'mmnr~.~',
 'abcg2/',
 '//www.broadinstitute.org/gatk/',
 'jsl~~~~~~~~',
 '.n-t',
 'chk2-',
 'p.y261*'

In [57]:
len(set(vocab_words) - set(vocab_words1))

18580

In [38]:
len(set(vocab_words) & set(vocab_words1))

362302

In [59]:
list(set(vocab_words) & set(vocab_words1))[:100]

['stromal-epithelial',
 'campen',
 'cmv-p300-cha',
 'futility',
 'midthigh',
 'ip-prepared',
 'therapyresistant',
 'l2654',
 'ucp3',
 'c57bl6/129',
 'alsoa',
 'pi3k-akt-mtor',
 'signiflcantiy',
 'ighv-mutated',
 'puhd15-1',
 'n74',
 'low-risk6',
 't241m',
 'p21rasgap',
 'talk',
 '314j18',
 '0.0182',
 'leukemias.5,14',
 '580d',
 'synergistic',
 'coloectomy',
 'c49y',
 'slc2a5',
 'plcepsilon',
 '54.256.0',
 'characterizes',
 'amplificationcodon',
 'ligands.the',
 'h-100',
 'en-v80e',
 'slide7',
 'visconti',
 'psmd3',
 'suppressorthat',
 'd24e4',
 'arg-974',
 'anti-p15/cdkn2b',
 'side-effect,3',
 'analysis7b',
 '2008a',
 'misclassifying',
 '41a',
 'hnf4fl/fl',
 'biopsy/plasma',
 'nfkbie',
 '42',
 'rotate',
 'hypouricemia',
 'rhodamine-coupled',
 '3h2eb',
 'p596',
 'organizationthe',
 '10,14,15',
 'recapitulates',
 'proline/arginine',
 '235k',
 'circumvented',
 'thenbe',
 'minutes.13',
 'e343k',
 'lines2,2',
 'hd048502',
 'chictr-trc-00000397',
 'asns.luc',
 'tle',
 '035188',
 '2f/r',
 'ye

In [41]:
"ig" in vocab_words1

True

In [40]:
vocab_wordidx["ig-"]

283034

In [58]:
s ='_cdk4'
re.sub('^[^a-zA-z0-9]*|[^a-zA-Z0-9]*$','',s)

'_cdk4'

In [65]:
word_tokenize('_cd''k4')

['_cdk4']