In [137]:
import re
import os
import requests
import zipfile
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
import arff

In [59]:
root_url = 'http://www.kdnuggets.com/data_mining_course/data/'
root_data = 'data'
data_file = 'final_project_data.zip'
train_data_class = 'pp5i_train_class.txt'
train_data_file = 'pp5i_train.gr.csv'
test_data_file = 'pp5i_test.gr.csv'
norm_train_file = 'pp5i.train.norm.tmp'
norm_test_file = 'pp5i.test.norm.tmp'
top_t_value_file = 'pp5i_train.top'

In [23]:
def unzip_file(file_addr,output_dir,remove=False) :
    with zipfile.ZipFile(file_addr,"r") as zip_ref:
        zip_ref.extractall(output_dir)
    if remove :  
        os.remove(file_addr)
    return os.listdir(output_dir)

def download_file(url,output_file,unzip=False,output_dir='') :
    if not os.path.exists(os.path.join(output_dir,output_file)) :
        # Streaming, so we can iterate over the response.
        r = requests.get(url, stream=True)

        # Total size in bytes.
        total_size = int(r.headers.get('content-length', 0)); 
        block_size = 1024
        wrote = 0 
        with open(os.path.join(output_dir,output_file), 'wb') as f:
            for data in tqdm(r.iter_content(block_size), total=math.ceil(total_size//block_size) , unit='KB', unit_scale=True):
                wrote = wrote  + len(data)
                f.write(data)
        if total_size != 0 and wrote != total_size:
            print("ERROR, something went wrong")
    if unzip :
        return unzip_file(os.path.join(output_dir,output_file),output_dir,True)
    return os.path.join(output_dir,output_file)

file = download_file(root_url + data_file,data_file,True,root_data)

933KB [00:06, 140KB/s]                           


In [24]:
train_file = './data/' + train_data_file
train_data = pd.read_csv(train_file)
test_file = './data/' + test_data_file
test_data = pd.read_csv(test_file)
train_data.head(5)

Unnamed: 0,SNO,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,A28102_at,30,46,31,31,26,28,35,29,21,...,35,49,31,51,71,68,77,56,41,38
1,AB000114_at,22,31,19,16,26,24,29,20,23,...,38,30,22,24,19,21,22,25,21,17
2,AB000115_at,29,70,12,11,14,13,14,18,10,...,20,205,16,61,62,35,30,65,32,25
3,AB000220_at,76,208,244,39,85,23,634,159,50,...,55,203,152,53,28,30,31,31,27,16
4,AB000409_at,167,211,179,119,161,166,228,267,260,...,137,180,107,147,170,131,132,158,164,172


## Step 1 . Data Cleaning
Threshold both train and test data to a minimum value of 20, maximum of 16,000

In [64]:
def normalize_data(data,save_path) :
    def normalize_row(x) :
        if type(x) == str or (x > 20 and x < 16000) :
            return x
        else :
            if np.abs(x - 16000) > np.abs(x - 20) :
                return 20
            else :
                return 16000
    data = data.applymap(normalize_row)
    data.to_csv(save_path,index=False)
normalize_data(train_data,os.path.join(root_data,norm_train_file))
normalize_data(test_data,os.path.join(root_data,norm_test_file))
norm_train_data = pd.read_csv(os.path.join(root_data,norm_train_file))

## Step 2. Selecting top genes by class
- remove from train data genes with fold differences across samples less than 2
- for each class, generate subsets with top 2,4,6,8,10,12,15,20,25, and 30 top genes with the highest T-value (Optional: for each class, select top genes using highest absolute T-value (i.e. also include genes with high negative T-value)
- for each N=2,4,6,8,10,12,15,20,25,30 combine top genes for each class into one file (removing duplicates, if any) and call the resulting file pp5i_train.topN.gr.csv
- Add the class as the last column, remove sample no, transpose each file to "genes-in-columns" format and convert it to arff.

In [53]:
norm_train_data['fold variation'] = norm_train_data.apply(lambda row: max(row[1:])/min(row[1:]),axis=1)
norm_train_data = norm_train_data[norm_train_data['fold variation'] > 2]
norm_train_data

Unnamed: 0,SNO,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,fold variation
0,A28102_at,30,46,31,31,26,28,35,29,21,...,49,31,51,71,68,77,56,41,38,20.000000
1,AB000114_at,22,31,20,20,26,24,29,20,23,...,30,22,24,20,21,22,25,21,20,20.000000
2,AB000115_at,29,70,20,20,20,20,20,20,20,...,205,20,61,62,35,30,65,32,25,20.000000
3,AB000220_at,76,208,244,39,85,23,634,159,50,...,203,152,53,28,30,31,31,27,20,31.700000
4,AB000409_at,167,211,179,119,161,166,228,267,260,...,180,107,147,170,131,132,158,164,172,39.000000
5,AB000449_at,32,20,229,20,34,43,56,20,20,...,25,20,20,20,20,20,20,20,20,20.000000
6,AB000450_at,20,20,20,20,20,20,21,20,20,...,20,20,20,21,22,20,27,20,20,20.000000
7,AB000460_at,165,184,219,156,206,197,249,138,147,...,184,140,148,154,140,150,173,165,145,137.000000
8,AB000462_at,47,36,41,26,34,40,45,133,40,...,26,55,54,57,37,43,54,74,48,20.000000
9,AB000464_at,101,130,131,86,140,113,121,250,264,...,115,103,105,117,81,128,108,121,118,71.000000


In [54]:
def calculate_attributes(row) :
#     classes = [row[:27],row[28:]]
    avg = []
    std = []
    n = []
    for i,attr in enumerate((row[1:28],row[28:])) :
        n.append(len(attr))
        avg.append(np.mean(attr))
        sum_val = np.sum(attr)
        sum_sq = np.sum(np.square(attr))
        std.append(np.sqrt((n[i]*sum_sq - sum_val*sum_val)/(n[i]*(n[i]-1))))
    row['t_value'] = (avg[0] - avg[1]) / np.sqrt(std[0]*std[0]/n[0] + std[1]*std[1]/n[1])
    return row
p_data = norm_train_data.iloc[:,:].apply(calculate_attributes,axis=1)

In [55]:
p_data

Unnamed: 0,SNO,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,fold variation,t_value
0,A28102_at,30,46,31,31,26,28,35,29,21,...,31,51,71,68,77,56,41,38,20.000000,-4.978734
1,AB000114_at,22,31,20,20,26,24,29,20,23,...,22,24,20,21,22,25,21,20,20.000000,-1.354521
2,AB000115_at,29,70,20,20,20,20,20,20,20,...,20,61,62,35,30,65,32,25,20.000000,-2.846253
3,AB000220_at,76,208,244,39,85,23,634,159,50,...,152,53,28,30,31,31,27,20,31.700000,1.313400
4,AB000409_at,167,211,179,119,161,166,228,267,260,...,107,147,170,131,132,158,164,172,39.000000,1.455224
5,AB000449_at,32,20,229,20,34,43,56,20,20,...,20,20,20,20,20,20,20,20,20.000000,1.293356
6,AB000450_at,20,20,20,20,20,20,21,20,20,...,20,20,21,22,20,27,20,20,20.000000,-2.182591
7,AB000460_at,165,184,219,156,206,197,249,138,147,...,140,148,154,140,150,173,165,145,137.000000,0.583287
8,AB000462_at,47,36,41,26,34,40,45,133,40,...,55,54,57,37,43,54,74,48,20.000000,0.867059
9,AB000464_at,101,130,131,86,140,113,121,250,264,...,103,105,117,81,128,108,121,118,71.000000,1.522580


In [84]:
t_value_largest = p_data.nlargest(30,['t_value'])[['SNO','t_value']]
t_value_largest

Unnamed: 0,SNO,t_value
2773,U16954_at,6.981668
5869,J00219_s_at,6.918566
3704,U79262_at,6.542442
6411,U09087_s_at,6.482098
2016,M60165_cds1_at,5.916275
492,D78012_at,5.848918
1929,M36067_at,5.830931
6121,M14483_rna1_s_at,5.784261
5609,U50822_rna1_s_at,5.629255
1868,M31303_rna1_at,5.628966


In [82]:
def save_nlargest(number,input_data,path):
    data = input_data.nlargest(number,['t_value']).iloc[:,1:-2]
    data.drop_duplicates
    data.to_csv(path+str(number)+'.gr.csv',index=False)
path = os.path.join(root_data,top_t_value_file)
save_nlargest(2,p_data,path)
save_nlargest(4,p_data,path)
save_nlargest(6,p_data,path)
save_nlargest(8,p_data,path)
save_nlargest(10,p_data,path)
save_nlargest(12,p_data,path)
save_nlargest(15,p_data,path)
save_nlargest(20,p_data,path)
save_nlargest(25,p_data,path)
save_nlargest(30,p_data,path)

In [135]:
def transpose_addclass(input):
    return input.transpose().reset_index().assign(Class=pd.read_csv(os.path.join(root_data,train_data_class)).Class)
top2 = transpose_addclass(pd.read_csv(path+'2.gr.csv'))
top4 = transpose_addclass(pd.read_csv(path+'4.gr.csv'))
top6 = transpose_addclass(pd.read_csv(path+'6.gr.csv'))
top8 = transpose_addclass(pd.read_csv(path+'8.gr.csv'))
top10 = transpose_addclass(pd.read_csv(path+'10.gr.csv'))
top12 = transpose_addclass(pd.read_csv(path+'12.gr.csv'))
top15 = transpose_addclass(pd.read_csv(path+'15.gr.csv'))
top20 = transpose_addclass(pd.read_csv(path+'20.gr.csv'))
top25 = transpose_addclass(pd.read_csv(path+'25.gr.csv'))
top30 = transpose_addclass(pd.read_csv(path+'30.gr.csv'))
top30

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,21,22,23,24,25,26,27,28,29,Class
0,1,931,83,179,75,59,688,71,1425,216,...,1883,116,592,717,757,550,149,20,236,MED
1,2,845,74,192,59,125,647,108,959,719,...,1441,299,317,536,537,605,166,68,206,MED
2,3,1107,76,215,62,116,558,106,1175,993,...,1474,230,387,627,940,622,163,116,222,MED
3,4,1187,80,187,88,81,750,117,1180,785,...,1872,143,312,551,683,499,96,48,219,MED
4,5,969,73,263,65,74,507,154,1186,492,...,1391,224,370,540,1571,756,153,79,269,MED
5,6,1172,68,233,84,140,599,116,1283,374,...,1572,135,278,549,1662,664,258,45,216,MED
6,7,1010,87,252,67,120,438,140,1208,205,...,1337,277,281,433,775,748,89,62,211,MED
7,8,588,95,173,105,103,683,55,1644,221,...,2605,77,197,551,905,634,167,120,382,MED
8,9,928,91,200,66,212,1126,76,2175,381,...,1964,104,199,616,2421,865,179,142,445,MED
9,10,469,85,154,64,78,611,102,1635,97,...,2363,82,291,728,454,323,64,83,270,MED


In [140]:
arff.dump(path+'30.gr.arff', top30.values, relation="default", names=df.columns)