# Data Exploration

# Setup

In [1]:
import sys
import os

import re
import collections
import itertools
import bcolz
import pickle
sys.path.append('../lib')

import numpy as np
import pandas as pd
import gc
import random
import smart_open
import h5py
import csv

import tensorflow as tf
import gensim
import string

import datetime as dt
from tqdm import tqdm_notebook as tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Fixed seed value intended for reproducible runs.
# NOTE(review): no RNG (random / numpy / tensorflow) is seeded with it in the
# cells visible here — confirm it is applied wherever randomness is used.
random_state_number = 967898

Using TensorFlow backend.


In [2]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    """Return the names of all local devices whose device type is 'GPU'.

    Devices are enumerated through ``device_lib.list_local_devices()``; the
    result is a list of device-name strings, empty when no GPU is reported.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

# Build the session config with allow_growth enabled so TensorFlow claims GPU
# memory incrementally instead of reserving it all when the session starts.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
# Echo the GPUs visible to this process as the cell output.
get_available_gpus()

['/gpu:0', '/gpu:1']

In [3]:
%pylab
%matplotlib inline
%load_ext autoreload

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
# Silence pandas' chained-assignment warning and let wide frames render up to
# 999 columns.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 999)
# Current seaborn colour palette, kept for plotting cells.
color = sns.color_palette()

# Data

## load data frames

In [5]:
# Open the store read-only inside a context manager:
#  * the file handle is closed even if one of the keys is missing, and
#  * a wrong path raises immediately instead of silently creating an empty
#    HDF5 file (which the default append mode 'a' would do).
with pd.HDFStore('processed/stage2/data_frames.h5', mode='r') as store:
    train_df = store['train_df']
    test_df = store['test_df']

In [6]:
# Rich-render the first rows of both frames, train first.
display(train_df.head(), test_df.head())

Unnamed: 0,index,Class,Gene,Variation,Sentences
0,0,3,[acsl4],[r570s],"[[2, this, mutation, resulted, in, a, myelopro..."
1,1,9,[naglu],[p521l],"[[abstract, the, large, tumor, suppressor, 1, ..."
2,2,7,[pah],[l333f],"[[vascular, endothelial, growth, factor, recep..."
3,3,2,[ing1],[a148d],"[[inflammatory, myofibroblastic, tumor, imt, i..."
4,4,9,[tmem216],[g77a],"[[abstract, retinoblastoma, is, a, pediatric, ..."


Unnamed: 0,Gene,Variation,Sentences
0,[chek2],[h371y],"[[the, incidence, of, breast, cancer, is, incr..."
1,[axin2],"[truncating, mutations]","[[an, unselected, series, of, 310, colorectal,..."
2,[wnt4],[e216g],"[[mycosis, fungoides, and, szary, syndrome, ar..."
3,[sucla2],[g118r],"[[regulated, progression, through, the, cell, ..."
4,[braf],[t599instt],"[[pilocytic, astrocytoma, pa, is, emerging, as..."


In [8]:
# Row counts: train on the first line, test on the second.
print(len(train_df), len(test_df), sep='\n')

8989
986


In [9]:
# Load the (vocab_words, vocab_wordidx) pair saved as a single pickled tuple.
# NOTE(review): pickle executes arbitrary code on load — only use files
# produced by this project's own preprocessing.
vocab_words = vocab_wordidx = None
with open('processed/stage2/vocab_words_wordidx.pkl', 'rb') as f:
    vocab_words, vocab_wordidx = pickle.load(f)
tuple(map(len, (vocab_words, vocab_wordidx)))

(364610, 364610)

# Exploration

## Words, variations and genes

In [10]:
# Distinct tokens across every sentence of every training document.
train_words = {
    word
    for document in train_df.Sentences
    for sentence in document
    for word in sentence
}
len(train_words)

350604

In [11]:
# Distinct tokens across every sentence of every test document.
test_words = {
    word
    for document in test_df.Sentences
    for sentence in document
    for word in sentence
}
len(test_words)

173470

In [13]:
# Distinct variation tokens over all training rows (each row holds a list).
train_variations = set(itertools.chain.from_iterable(train_df.Variation))
len(train_variations)

8632

In [14]:
# Distinct variation tokens over all test rows (each row holds a list).
test_variations = set(itertools.chain.from_iterable(test_df.Variation))
len(test_variations)

951

In [15]:
# Distinct gene tokens over all training rows (each row holds a list).
train_genes = set(itertools.chain.from_iterable(train_df.Gene))
len(train_genes)

1507

In [16]:
# Distinct gene tokens over all test rows (each row holds a list).
test_genes = set(itertools.chain.from_iterable(test_df.Gene))
len(test_genes)

279

vocab_words and vocab_wordidx

In [17]:
# Number of gene tokens that occur in both train and test.
len(train_genes.intersection(test_genes))

265

In [19]:
# Number of variation tokens that occur in both train and test.
len(train_variations.intersection(test_variations))

814

In [20]:
# Number of text tokens that occur in both train and test.
len(train_words.intersection(test_words))

161141

In [22]:
# Per line: total distinct variations (train ∪ test), then how many train /
# test variations appear as tokens in the train text (line 1) and in the
# test text (line 2).
print(
    len(train_variations.union(test_variations)),
    len(train_words.intersection(train_variations)),
    len(train_words.intersection(test_variations)),
)
print(
    len(train_variations.union(test_variations)),
    len(test_words.intersection(train_variations)),
    len(test_words.intersection(test_variations)),
)

8769 7760 768
8769 2665 758


In [23]:
# Per line: total distinct genes (train ∪ test), then how many train / test
# genes appear as tokens in the train text (line 1) and in the test text
# (line 2).
print(
    len(train_genes.union(test_genes)),
    len(train_words.intersection(train_genes)),
    len(train_words.intersection(test_genes)),
)
print(
    len(train_genes.union(test_genes)),
    len(test_words.intersection(train_genes)),
    len(test_words.intersection(test_genes)),
)

1521 769 199
1521 616 188


## Sentences

In [24]:
# First sentence (a list of tokens) of the training row labelled 0.
train_df['Sentences'][0][0]

['2',
 'this',
 'mutation',
 'resulted',
 'in',
 'a',
 'myeloproliferative',
 'phenotype',
 'including',
 'erythrocytosis',
 'in',
 'a',
 'murine',
 'model',
 'of',
 'retroviral',
 'bone',
 'marrow',
 'transplantation']

In [25]:
# Show the ASCII punctuation characters defined by the standard library.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [28]:
# `w not in string.punctuation` is a *substring* test against the punctuation
# string, so multi-character punctuation tokens such as '...' or '--' were
# kept. Compare against the set of punctuation characters instead: a token is
# dropped when every one of its characters is punctuation (an empty token is
# dropped as well, exactly as before).
punctuation_chars = set(string.punctuation)
no_punctuations = [
    w for w in train_df.Sentences[0][0]
    if not punctuation_chars.issuperset(w)
]
no_punctuations

['2',
 'this',
 'mutation',
 'resulted',
 'in',
 'a',
 'myeloproliferative',
 'phenotype',
 'including',
 'erythrocytosis',
 'in',
 'a',
 'murine',
 'model',
 'of',
 'retroviral',
 'bone',
 'marrow',
 'transplantation']

In [29]:
# Sentences per training document, then summary statistics.
train_sentence_counts = train_df.Sentences.map(len)
train_sentence_counts.describe()

count    8989.000000
mean      324.372344
std       217.407948
min         1.000000
25%       204.000000
50%       283.000000
75%       380.000000
max      3119.000000
Name: Sentences, dtype: float64

In [31]:
# Mean number of tokens per sentence, computed separately for each training
# document, then summary statistics over documents.
train_words_in_sentences = train_df.Sentences.map(
    lambda sentences: np.mean(list(map(len, sentences)))
)
train_words_in_sentences.describe()

count    8989.000000
mean       27.435461
std         3.914051
min         1.000000
25%        24.972112
50%        27.125000
75%        29.482022
max        52.466667
Name: Sentences, dtype: float64

In [32]:
# Sentences per training document (same quantity as train_sentence_counts),
# then summary statistics.
train_sentences = train_df['Sentences'].map(len)
train_sentences.describe()

count    8989.000000
mean      324.372344
std       217.407948
min         1.000000
25%       204.000000
50%       283.000000
75%       380.000000
max      3119.000000
Name: Sentences, dtype: float64

In [33]:
# Sentences per test document, then summary statistics.
test_sentences = test_df['Sentences'].map(len)
test_sentences.describe()

count     986.000000
mean      337.166329
std       244.728149
min         5.000000
25%       195.000000
50%       281.000000
75%       401.000000
max      2964.000000
Name: Sentences, dtype: float64

## Characters

In [34]:
# For each training document: total token characters per sentence (token
# lengths summed, separators not counted), averaged over the document's
# sentences. Then summary statistics over documents.
train_chars_in_sentences = train_df.Sentences.map(
    lambda sentences: np.mean(
        [np.sum(list(map(len, sentence))) for sentence in sentences]
    )
)
train_chars_in_sentences.describe()

count    8989.000000
mean      151.113143
std        22.861147
min         4.000000
25%       136.980435
50%       149.089552
75%       162.356401
max       288.433333
Name: Sentences, dtype: float64

## encoding issues

In [None]:
# Echoing `train_words` directly would dump every distinct training token into
# the cell output. For an encoding check, show only a bounded, sorted preview
# of the tokens that contain at least one non-ASCII character, plus their count.
non_ascii_train_words = sorted(
    w for w in train_words if any(ord(ch) > 127 for ch in w)
)
print(len(non_ascii_train_words))
non_ascii_train_words[:50]