## This notebook performs feature normalization: it scales all feature values to the range [0, 1]

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler

### Set file path

In [2]:
## Data directory. Set the GO_DATA_DIR environment variable to run this notebook
## somewhere other than the original environment; if it is not set, the original
## location below is used unchanged.
data_dir = os.environ.get(
    "GO_DATA_DIR",
    "/global/homes/z/zimingy/KE-Catboost/ziming/GO/data/go_aggregated_4.1",
)

## Input pickle and the two normalized outputs all live inside data_dir.
data_file = os.path.join(data_dir, 'go_aggregated_4.1_mixed_updated.pkl')
output_pickle = os.path.join(data_dir, 'go_aggregated_4.1_mixed_updated_normalized.pkl')
output_tsv = os.path.join(data_dir, 'go_aggregated_4.1_mixed_updated_normalized.tsv')

### Load original dataset

In [3]:
%%time
df = pd.read_pickle(data_file)
null_vals = df.isnull().sum(axis=0)
assert len(null_vals[null_vals != 0]) == 0
df

CPU times: user 589 ms, sys: 1.34 s, total: 1.93 s
Wall time: 2.06 s


Unnamed: 0,id,study_id,sample_id,biome,exptype,version,GO:0043130,GO:0055074,GO:0055117,GO:0046933,...,GO:0019357,GO:0006527,GO:0004114,GO:0046423,GO:0034194,GO:0032183,GO:0007618,GO:0030097,GO:0004520,GO:0033739
19,ERZ650344,MGYS00003358,SRS3210273,root:Environmental:Aquatic:Marine:Brackish,assembly,4.1,3,0,0,342,...,2,79,90,0,20,0,0,0,87,38
27,ERZ747221,MGYS00004737,SRS1791943,root:Engineered:Biogas plant,assembly,4.1,49,0,0,450,...,2,110,0,0,15,0,0,0,204,34
29,ERZ761615,MGYS00003390,SRS1984850,root:Engineered:Bioreactor,assembly,4.1,2,0,0,270,...,33,102,2,0,43,0,0,0,93,49
30,ERR2193276,MGYS00005065,ERS2001073,root:Host-associated:Mammals,metagenomic,4.1,0,0,0,26,...,0,12,1,0,15,0,0,0,8,5
31,ERR2193277,MGYS00005065,ERS2001074,root:Host-associated:Mammals,metagenomic,4.1,0,0,0,31,...,0,13,1,0,20,0,0,0,10,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73785,ERZ807429,MGYS00005443,ERS1960449,root:Host-associated:Human:Digestive system:Oral,assembly,4.1,0,0,0,3,...,0,0,0,0,0,0,0,0,0,1
74255,ERZ795020,MGYS00004901,SRS785549,root:Engineered:Wastewater,assembly,4.1,0,0,0,115,...,1,10,0,0,1,0,0,0,41,8
74256,ERZ505245,MGYS00003223,SRS1589217,root:Environmental:Aquatic:Marine,assembly,4.1,1,0,0,336,...,3,54,1,0,7,0,0,0,88,56
74261,ERZ505291,MGYS00003237,SRS1589396,root:Environmental:Aquatic:Marine,assembly,4.1,2,0,0,299,...,2,58,0,0,11,0,0,0,74,53


In [4]:
# %%time 
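## Using a pandas DataFrame directly (as below) generates NaN values: the new
## frame gets a default 0..n-1 index, and the assignment aligns it against df's
## non-contiguous row index. Passing index=df.index would avoid the NaNs.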
## ~33min 43s
# scaler = MinMaxScaler()
# cols = df.columns[6:]
# df[cols] = pd.DataFrame(scaler.fit_transform(df[cols]), columns=cols)
# df

### Normalization (scaling)

In [5]:
## The first six columns (id, study_id, sample_id, biome, exptype, version) are
## left out; every column from position 6 onward goes into the numeric matrix.
feature_cols = df.columns[6:]
x = df[feature_cols].to_numpy()
x

array([[  3,   0,   0, ...,   0,  87,  38],
       [ 49,   0,   0, ...,   0, 204,  34],
       [  2,   0,   0, ...,   0,  93,  49],
       ...,
       [  1,   0,   0, ...,   0,  88,  56],
       [  2,   0,   0, ...,   0,  74,  53],
       [  0,   0,   0, ...,   0, 107,  39]])

In [6]:
## check the maximum value over the whole matrix (all rows, all feature columns)
## NOTE(review): the minimum is stated to be 0 but is not checked by this cell —
## confirm with x.min() if it matters.
x.max()

15834150

In [7]:
## Fit a MinMaxScaler on x and rescale it in one step; each feature (column)
## is scaled independently using its own minimum and maximum.
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[0.0012837 , 0.        , 0.        , ..., 0.        , 0.00393665,
        0.00355439],
       [0.02096705, 0.        , 0.        , ..., 0.        , 0.00923077,
        0.00318025],
       [0.0008558 , 0.        , 0.        , ..., 0.        , 0.00420814,
        0.00458329],
       ...,
       [0.0004279 , 0.        , 0.        , ..., 0.        , 0.0039819 ,
        0.00523805],
       [0.0008558 , 0.        , 0.        , ..., 0.        , 0.00334842,
        0.00495744],
       [0.        , 0.        , 0.        , ..., 0.        , 0.00484163,
        0.00364793]])

In [8]:
## (number of rows, number of scaled feature columns)
x_scaled.shape

(31939, 4402)

In [9]:
## check max value again
## Expected to be 1.0 after scaling: MinMaxScaler maps the maximum of every
## non-constant column to 1.
x_scaled.max()

1.0

In [10]:
## update df numerical field with scaled values
## x_scaled is a plain NumPy array derived from these same columns, so the
## values are written back positionally, with no index alignment.
## NOTE: this overwrites the raw counts held in df; reload the pickle to get
## the unscaled values back.
df[df.columns[6:]] = x_scaled
df

Unnamed: 0,id,study_id,sample_id,biome,exptype,version,GO:0043130,GO:0055074,GO:0055117,GO:0046933,...,GO:0019357,GO:0006527,GO:0004114,GO:0046423,GO:0034194,GO:0032183,GO:0007618,GO:0030097,GO:0004520,GO:0033739
19,ERZ650344,MGYS00003358,SRS3210273,root:Environmental:Aquatic:Marine:Brackish,assembly,4.1,0.001284,0.0,0.0,0.001645,...,0.037736,0.002090,0.008326,0.0,0.000443,0.0,0.0,0.0,0.003937,0.003554
27,ERZ747221,MGYS00004737,SRS1791943,root:Engineered:Biogas plant,assembly,4.1,0.020967,0.0,0.0,0.002165,...,0.037736,0.002910,0.000000,0.0,0.000332,0.0,0.0,0.0,0.009231,0.003180
29,ERZ761615,MGYS00003390,SRS1984850,root:Engineered:Bioreactor,assembly,4.1,0.000856,0.0,0.0,0.001299,...,0.622642,0.002699,0.000185,0.0,0.000952,0.0,0.0,0.0,0.004208,0.004583
30,ERR2193276,MGYS00005065,ERS2001073,root:Host-associated:Mammals,metagenomic,4.1,0.000000,0.0,0.0,0.000125,...,0.000000,0.000318,0.000093,0.0,0.000332,0.0,0.0,0.0,0.000362,0.000468
31,ERR2193277,MGYS00005065,ERS2001074,root:Host-associated:Mammals,metagenomic,4.1,0.000000,0.0,0.0,0.000149,...,0.000000,0.000344,0.000093,0.0,0.000443,0.0,0.0,0.0,0.000452,0.000748
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73785,ERZ807429,MGYS00005443,ERS1960449,root:Host-associated:Human:Digestive system:Oral,assembly,4.1,0.000000,0.0,0.0,0.000014,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000094
74255,ERZ795020,MGYS00004901,SRS785549,root:Engineered:Wastewater,assembly,4.1,0.000000,0.0,0.0,0.000553,...,0.018868,0.000265,0.000000,0.0,0.000022,0.0,0.0,0.0,0.001855,0.000748
74256,ERZ505245,MGYS00003223,SRS1589217,root:Environmental:Aquatic:Marine,assembly,4.1,0.000428,0.0,0.0,0.001617,...,0.056604,0.001429,0.000093,0.0,0.000155,0.0,0.0,0.0,0.003982,0.005238
74261,ERZ505291,MGYS00003237,SRS1589396,root:Environmental:Aquatic:Marine,assembly,4.1,0.000856,0.0,0.0,0.001439,...,0.037736,0.001535,0.000000,0.0,0.000244,0.0,0.0,0.0,0.003348,0.004957


In [11]:
## save new df to file
## Written twice: once as a pickle and once as a tab-separated text file.
## to_csv keeps its default index=True, so the row index is written as the
## first column of the TSV.
df.to_pickle(output_pickle)
df.to_csv(output_tsv, sep="\t")