Skip to content

Commit

Permalink
capstone update 2
Browse files Browse the repository at this point in the history
  • Loading branch information
bicepjai committed Nov 7, 2017
1 parent d92c973 commit 6a29a52
Show file tree
Hide file tree
Showing 5 changed files with 3,726 additions and 610 deletions.
Binary file modified capstone.pdf
Binary file not shown.
220 changes: 182 additions & 38 deletions data_prep/data_expl.ipynb
Expand Up @@ -21,22 +21,14 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {
"ExecuteTime": {
"end_time": "2017-09-26T06:04:20.402752Z",
"start_time": "2017-09-26T06:04:04.604699Z"
"end_time": "2017-11-07T02:18:36.438988Z",
"start_time": "2017-11-07T02:18:35.220392Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"outputs": [],
"source": [
"import sys\n",
"import os\n",
Expand Down Expand Up @@ -74,11 +66,11 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"ExecuteTime": {
"end_time": "2017-09-26T06:04:20.839060Z",
"start_time": "2017-09-26T06:04:20.404127Z"
"end_time": "2017-11-07T02:18:36.796440Z",
"start_time": "2017-11-07T02:18:36.440252Z"
}
},
"outputs": [
Expand All @@ -88,7 +80,7 @@
"['/gpu:0', '/gpu:1']"
]
},
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -107,19 +99,19 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2017-09-26T06:04:20.878069Z",
"start_time": "2017-09-26T06:04:20.840322Z"
"end_time": "2017-11-07T02:18:36.937485Z",
"start_time": "2017-11-07T02:18:36.797605Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using matplotlib backend: TkAgg\n",
"Using matplotlib backend: Qt5Agg\n",
"Populating the interactive namespace from numpy and matplotlib\n"
]
},
Expand All @@ -141,11 +133,11 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2017-09-26T06:04:20.882548Z",
"start_time": "2017-09-26T06:04:20.879515Z"
"end_time": "2017-11-07T02:18:36.941552Z",
"start_time": "2017-11-07T02:18:36.938835Z"
}
},
"outputs": [],
Expand All @@ -171,16 +163,28 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2017-09-26T06:04:41.279868Z",
"start_time": "2017-09-26T06:04:20.883650Z"
"end_time": "2017-11-07T02:18:29.936865Z",
"start_time": "2017-11-07T02:18:29.851948Z"
}
},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'pd' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-720aca14c334>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstore\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHDFStore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'processed/stage1/data_frames.h5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mtrain_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstore\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'train_df'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtest_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstore\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'test_df'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mstore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
]
}
],
"source": [
"store = pd.HDFStore('processed/stage2/data_frames.h5')\n",
"store = pd.HDFStore('processed/stage1/data_frames.h5')\n",
"train_df = store['train_df']\n",
"test_df = store['test_df']\n",
"store.close()"
Expand All @@ -191,8 +195,8 @@
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2017-09-26T06:04:41.626682Z",
"start_time": "2017-09-26T06:04:41.281097Z"
"end_time": "2017-11-07T01:53:37.888075Z",
"start_time": "2017-11-07T01:53:37.584607Z"
}
},
"outputs": [
Expand Down Expand Up @@ -376,11 +380,11 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2017-09-26T03:39:15.466310Z",
"start_time": "2017-09-26T03:39:15.459553Z"
"end_time": "2017-11-07T01:53:37.891641Z",
"start_time": "2017-11-07T01:53:37.889105Z"
}
},
"outputs": [
Expand All @@ -400,11 +404,11 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2017-09-26T03:39:16.686322Z",
"start_time": "2017-09-26T03:39:16.495808Z"
"end_time": "2017-11-07T01:53:38.009548Z",
"start_time": "2017-11-07T01:53:37.892945Z"
}
},
"outputs": [
Expand All @@ -414,7 +418,7 @@
"(364610, 364610)"
]
},
"execution_count": 9,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1045,6 +1049,140 @@
"source": [
"train_words"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Text Data and classes exploration"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-07T01:54:49.050141Z",
"start_time": "2017-11-07T01:54:49.040657Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"7 1692\n",
"4 1397\n",
"2 1193\n",
"1 990\n",
"6 983\n",
"5 930\n",
"3 739\n",
"8 688\n",
"9 377\n",
"Name: Class, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.Class.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2017-11-07T01:56:00.652668Z",
"start_time": "2017-11-07T01:55:59.761603Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"2 [[vascular, endothelial, growth, factor, recep...\n",
"16 [[janus, jak, tyrosine, kinases, contain, a, t...\n",
"25 [[the, ten-eleven, translocation, 1, tet1, gen...\n",
"27 [[myc, expression, is, deregulated, in, a, wid...\n",
"32 [[individuals, with, neurofibromatosis, type, ...\n",
"48 [[this, 12-week, clinical, study, evaluated, t...\n",
"87 [[quality, control, mechanisms, promote, aggre...\n",
"93 [[c-kit, is, a, member, of, the, type, 3, subc...\n",
"96 [[the, were, premature, closely, derived, to, ...\n",
"101 [[kabuki, syndrome, ks, is, a, multiple, conge...\n",
"108 [[eml4alk, fusions, define, a, subset, of, lun...\n",
"111 [[e26, transformation-specific, ets, transcrip...\n",
"113 [[activating, mutations, in, jak1, and, jak2, ...\n",
"114 [[a, p53, hot-spot, mutation, found, frequentl...\n",
"117 [[establishedmsh6-null, mice, present, afreque...\n",
"133 [[recent, evidence, identified, a, genetic, an...\n",
"155 [[ligand-induced, phosphorylation, of, the, re...\n",
"157 [[introduction, anaplastic, lymphoma, kinase, ...\n",
"165 [[kinase, domain, kd, mutations, of, bcr-abl, ...\n",
"174 [[acquired, somatic, mutations, in, _atrx, an,...\n",
"186 [[the, maintenance, of, genomic, integrity, du...\n",
"187 [[inactivation, of, ras, gtpase, activating, p...\n",
"191 [[the, oncogenic, property, of, anaplastic, ly...\n",
"194 [[ezh2, enhancer, of, zeste, homolog, 2, is, a...\n",
"203 [[acetylation, of, multiple, lysine, residues,...\n",
"210 [[the, thyroid, trk-t3, oncogene, results, fro...\n",
"212 [[interaction, of, tcf7l2, with, translocated,...\n",
"225 [[personalized, therapy, provides, the, best, ...\n",
"251 [[endometrial, stromal, sarcomas, ess, are, ge...\n",
"254 [[extracellular, signal-regulated, kinase-1, a...\n",
" ... \n",
"8898 [[we, identified, novel, gene, fusions, in, pa...\n",
"8902 [[the, congenital, fibrosarcoma, t, 12, 15, p1...\n",
"8903 [[pediatric, high-grade, glioma, hgg, is, a, d...\n",
"8905 [[pediatric, high-grade, glioma, hgg, is, a, d...\n",
"8908 [[lung, cancer, is, the, leading, cause, of, c...\n",
"8910 [[lung, cancer, is, the, leading, cause, of, c...\n",
"8911 [[collagen, is, an, important, extracellular, ...\n",
"8912 [[collagen, is, an, important, extracellular, ...\n",
"8913 [[lung, cancer, is, the, leading, cause, of, c...\n",
"8919 [[head, and, neck, squamous, cell, carcinoma, ...\n",
"8925 [[rearrangements, of, the, proto-oncogene, ret...\n",
"8926 [[the, ret, protooncogene, mutations, responsi...\n",
"8927 [[we, investigatedthe, transformingactivityoft...\n",
"8929 [[we, investigatedthe, transformingactivityoft...\n",
"8930 [[ret, is, a, single-pass, transmembrane, rece...\n",
"8931 [[somatic, ret, mutations, have, been, identie...\n",
"8932 [[we, investigatedthe, transformingactivityoft...\n",
"8933 [[we, investigatedthe, transformingactivityoft...\n",
"8939 [[mutations, of, the, ret, receptor, tyrosine,...\n",
"8946 [[ret, is, a, single-pass, transmembrane, rece...\n",
"8947 [[activating, germ-line, point, mutations, in,...\n",
"8952 [[a, considerable, proportion, of, ladcs, the,...\n",
"8954 [[constitutive, activation, of, the, ret, rece...\n",
"8956 [[we, investigatedthe, transformingactivityoft...\n",
"8959 [[we, investigatedthe, transformingactivityoft...\n",
"8960 [[we, investigatedthe, transformingactivityoft...\n",
"8962 [[introduction, inherited, germ, line, activat...\n",
"8964 [[the, ret, proto-oncogene, encodes, a, recept...\n",
"8976 [[familial, platelet, disorder, with, propensi...\n",
"8978 [[runx, proteins, belong, to, a, family, of, m...\n",
"Name: Sentences, Length: 1692, dtype: object"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.Sentences[train_df.Class == 7]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down Expand Up @@ -1072,7 +1210,13 @@
"sideBar": true,
"skip_h1_title": false,
"toc_cell": false,
"toc_position": {},
"toc_position": {
"height": "827px",
"left": "0px",
"right": "1350px",
"top": "52px",
"width": "212px"
},
"toc_section_display": "block",
"toc_window_display": true
}
Expand Down

0 comments on commit 6a29a52

Please sign in to comment.