In [3]:
import re
import os
import requests
import zipfile
import math
import pandas as pd
import numpy as np
from tqdm import tqdm

In [4]:
root_url = 'http://www.kdnuggets.com/data_mining_course/data/'
root_data = 'data'
train_data_file = 'ALL_AML_gr.thr.train.csv'
data_file = 'ALL_AML_train_processed.zip'

In [5]:
def unzip_file(file_addr,output_dir,remove=False) :
    with zipfile.ZipFile(file_addr,"r") as zip_ref:
        zip_ref.extractall(output_dir)
    if remove :  
        os.remove(file_addr)
    return os.listdir(output_dir)

def download_file(url,output_file,unzip=False,output_dir='') :
    if not os.path.exists(data_file) :
        # Streaming, so we can iterate over the response.
        r = requests.get(url, stream=True)

        # Total size in bytes.
        total_size = int(r.headers.get('content-length', 0)); 
        block_size = 1024
        wrote = 0 
        with open(output_file, 'wb') as f:
            for data in tqdm(r.iter_content(block_size), total=math.ceil(total_size//block_size) , unit='KB', unit_scale=True):
                wrote = wrote  + len(data)
                f.write(data)
        if total_size != 0 and wrote != total_size:
            print("ERROR, something went wrong")
    if unzip :
        return unzip_file(output_file,output_dir,True)
    return output_file

files = download_file(root_url + data_file,data_file,True,root_data)

381KB [00:03, 105KB/s]                                                                                                 


In [6]:
pd.read_csv(os.path.join(root_data,files[0])) \
            .to_csv(os.path.join(root_data,train_data_file),index=False)

In [7]:
data = pd.read_csv(os.path.join(root_data,train_data_file))

In [8]:
data['fold variation'] = data.apply(lambda row: max(row[1:])/min(row[1:]),axis=1)

In [9]:
# data.sort_values('fold variation', axis=0,ascending=False)
data['fold variation'][data['fold variation'].idxmax()]

800.0

In [10]:
rows_with_large_fold = data.loc[data['fold variation'] == max(data['fold variation'])]
print('Largest fold diffrence is {} and {} rows have it'.format(max(rows_with_large_fold['fold variation']),len(rows_with_large_fold)))

Largest fold diffrence is 800.0 and 17 rows have it


In [11]:
rows_with_low_fold = data.loc[data['fold variation'] == min(data['fold variation'])]
print('Lowest fold diffrence is {} and {} rows have it'.format(min(rows_with_low_fold['fold variation']),len(rows_with_low_fold)))

Lowest fold diffrence is 1.0 and 476 rows have it


In [53]:
from IPython.display import display, HTML
def and_op(x,y) :
    return [a and b for a, b in zip(x, y)]

val = np.array(data['fold variation'])
sum(list(2 < val) and list(val<= 4))

count_table = pd.DataFrame({ 
    'val <= 2': [sum(list(val <= 2))],
    '2 < val <= 4': [sum(and_op(list(2 < val),list(val<= 4)))],
    '4 < val <= 8': [sum(and_op(list(4 < val),list(val<= 8)))],
    '8 < val <= 16': [sum(and_op(list(8 < val),list(val<= 16)))],
    '16 < val <= 32': [sum(and_op(list(16 < val),list(val<= 32)))],
    '32 < val <= 64': [sum(and_op(list(32 < val),list(val<= 64)))],
    '64 < val <= 128': [sum(and_op(list(64 < val),list(val<= 128)))],
    '128 < val <= 256': [sum(and_op(list(128 < val),list(val<= 256)))],
    '256 < val <= 512': [sum(and_op(list(256 < val),list(val<= 512)))],
    '512 < val': [sum(list(512 < val))]
}).transpose()
count_table.columns = ['count']
count_table.index.name = 'range'
count_table.reset_index(level=0, inplace=True)
count_table

Unnamed: 0,range,count
0,128 < val <= 256,183
1,16 < val <= 32,1387
2,2 < val <= 4,469
3,256 < val <= 512,88
4,32 < val <= 64,840
5,4 < val <= 8,1363
6,512 < val,46
7,64 < val <= 128,407
8,8 < val <= 16,1643
9,val <= 2,644


In [84]:
def calculate_attributes(row) :
    classes = [row[:27],row[28:]]
    avg = []
    std = []
    n = []
    for i,attr in enumerate((row[1:28],row[28:])) :
        n.append(len(attr))
        avg.append(np.mean(attr))
        sum_val = np.sum(attr)
        sum_sq = np.sum(np.square(attr))
        std.append(np.sqrt((n[i]*sum_sq - sum_val*sum_val)/(n[i]*(n[i]-1))))
    row['s2n'] = s2n = (avg[0] - avg[1]) / (std[0] + std[1])
    row['t_value'] = (avg[0] - avg[1]) / np.sqrt(std[0]*std[0]/n[0] + std[1]*std[1]/n[1])
    return row
p_data = data.iloc[:,:-1].apply(calculate_attributes,axis=1)

  if sys.path[0] == '':
  del sys.path[0]


In [85]:
p_data

Unnamed: 0,ID,1,2,3,4,5,6,7,8,9,...,37,38,28,29,30,31,32,33,s2n,t_value
0,A28102_at,151,263,88,484,118,270,458,872,62,...,249,357,190,318,382,486,388,260,-0.174741,-1.054940
1,AB000114_at,72,21,20,61,20,85,20,25,20,...,36,20,39,20,20,20,56,20,0.468299,2.862434
2,AB000115_at,281,250,358,118,197,71,168,296,198,...,328,74,214,103,239,221,405,1306,-0.008948,-0.052813
3,AB000220_at,36,43,42,39,39,32,20,59,27,...,20,51,71,20,72,39,192,32,-0.172622,-0.836041
4,AB000409_at,20,20,142,20,237,20,87,20,148,...,20,88,20,39,377,20,20,20,-0.167576,-0.798534
5,AB000449_at,57,169,359,274,311,232,131,70,313,...,133,111,178,181,96,120,36,255,0.608903,3.708939
6,AB000450_at,186,219,237,245,186,30,199,556,259,...,131,96,48,47,173,285,113,115,0.091514,0.538497
7,AB000460_at,1647,2043,1997,2128,1608,1354,1784,2911,2117,...,2044,1825,2175,976,1767,1939,1462,2818,-0.025964,-0.145657
8,AB000462_at,137,188,91,20,204,20,52,20,20,...,162,73,340,62,367,34,72,232,-0.053742,-0.309895
9,AB000464_at,803,756,2514,1489,322,750,409,1074,1495,...,867,280,799,379,843,1000,1320,1167,0.103062,0.617827


In [87]:
p_data.nlargest(50,['s2n'])[['ID','s2n']]

Unnamed: 0,ID,s2n
5712,U22376_cds2_s_at,1.339308
4268,X59417_at,1.124637
4170,X52142_at,1.122589
6914,M28170_at,1.116756
2582,U05259_rna1_at,1.103966
2294,M92287_at,1.043056
1246,L13278_at,1.042032
6411,U09087_s_at,1.036257
4475,X74262_at,1.027773
6221,M31211_s_at,1.022881


In [88]:
p_data.nlargest(50,['t_value'])[['ID','t_value']]

Unnamed: 0,ID,t_value
5712,U22376_cds2_s_at,7.9043
4268,X59417_at,6.803106
6221,M31211_s_at,6.294207
6914,M28170_at,6.253971
2294,M92287_at,6.217365
6411,U09087_s_at,6.182425
2582,U05259_rna1_at,6.175126
5533,D26156_s_at,6.097113
1246,L13278_at,6.021342
4475,X74262_at,6.016389
