# Preprocessing txt

Notebook to preprocess the training dataset with the unormalized features. <br/>
File the comes in the *.txt* format, separated with tabulations.

## Imports

In [1]:
import pandas as pd
import numpy as np

## Constants

In [2]:
PATH_DATA = "../data/Matrix_notnormalized_removedlowlyexpressedgenes.txt"

# Loading data

Reading txt as a csv.

In [3]:
df = pd.read_csv(PATH_DATA, sep="\t")
print("Df shape:", df.shape)
df.head()

Df shape: (15104, 29)


Unnamed: 0,369_M01_CTRL,369_M02_DHT,369_M03_DHT,369_M04_DHT,369_M05_CTRL,369_M07_4L_P4,369_M07_4R_P4,369_M09_P4,371_M01_DHT,371_M02_CTRL,...,318_P4,320_CTRL,320_DHT,320_P4,328_CTRL,328_DHT,328_P4,330_CTRL,330_DHT,330_P4
TSPAN6,390,1594,1029,1061,346,293,361,848,400,67,...,496,88,1121,1012,617,861,765,1115,986,679
TNMD,21,34,11,28,30,2,18,10,6,0,...,135,15,70,113,32,16,16,33,5,14
DPM1,154,474,255,333,175,111,142,220,110,26,...,464,146,495,670,395,389,566,640,555,442
SCYL3,268,1138,685,690,267,244,288,546,231,61,...,298,138,514,247,224,251,232,230,282,248
C1orf112,43,122,70,98,38,26,31,72,45,8,...,79,45,98,111,55,80,110,113,94,87


The shape is **wrong**. The data are **transposed**. We have actually 29 samples and 15104 features.

In [4]:
df = df.transpose()
df = df.sort_index()
print("Df shape:", df.shape)
df.head()

Df shape: (29, 15104)


Unnamed: 0,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,CFH,FUCA2,GCLC,NFYA,STPG1,...,AL117339.5,AC069544.2,AC006460.2,AL356275.1,AC092329.4,MIR205,AC066615.1,AC105233.5,AC073111.5,AC079781.5
318_CTRL,575,362,450,310,119,94,357,313,248,81,...,38,28,14,70,71,42,25,17,29,26
318_DHT,1559,15,555,293,126,27,316,308,449,69,...,31,75,24,42,173,32,36,31,19,16
318_P4,496,135,464,298,79,80,252,258,315,95,...,31,61,27,38,81,12,33,31,33,27
320_CTRL,88,15,146,138,45,0,81,54,123,41,...,4,36,0,12,14,19,0,16,16,13
320_DHT,1121,70,495,514,98,17,320,238,209,60,...,99,99,14,51,76,52,18,70,34,35


# Keeping relevant features

In [5]:
relevant_features = pd.read_csv("../data/important_genes.csv", sep=",")
relevant_features.head()

Unnamed: 0,value
0,KLK3
1,ITIH2
2,SULT1C3
3,SCGB2A1
4,KLK2


In [6]:
relevant_features_array = relevant_features.values.flatten()
relevant_features_array

array(['KLK3', 'ITIH2', 'SULT1C3', 'SCGB2A1', 'KLK2', 'LRP2', 'KLK3',
       'CELSR1', 'SLC26A3', 'GRIK1', 'CXCL13', 'SCGB1D2', 'CUX2', 'GRIK3',
       'KLK2', 'SCGB2A2', 'UGT2B28', 'CAPN8', 'SLC26A3', 'PGR', 'GPR88',
       'CPA3', 'TENM1', 'GPC3', 'PGR', 'DCHS2', 'GP2', 'ELOVL2', 'SOX11',
       'ALOX15B', 'SUSD3', 'SLCO1A2', 'UGT2B11', 'CYP4Z1', 'RNF150',
       'CYP4F8', 'CITED1', 'SYNPO2', 'ATP1A2', 'PDZK1', 'TMEM158',
       'RANBP3L', 'PIWIL4', 'SULT1C2P1', 'FGFR2', 'ROBO2', 'KIAA1549L',
       'RAB3B', 'CLEC7A', 'ADCY2', 'SPHKAP', 'CNMD', 'SERHL2', 'HPGD',
       'GLYATL1', 'FGFR4'], dtype=object)

In [7]:
df = df[relevant_features_array]
print("Df shape:", df.shape)
df.head()

Df shape: (29, 56)


Unnamed: 0,KLK3,ITIH2,SULT1C3,SCGB2A1,KLK2,LRP2,KLK3.1,CELSR1,SLC26A3,GRIK1,...,KIAA1549L,RAB3B,CLEC7A,ADCY2,SPHKAP,CNMD,SERHL2,HPGD,GLYATL1,FGFR4
318_CTRL,1,92,0,172,6,139,1,641,0,33,...,15,66,36,6,103,13,46,169,3,16
318_DHT,53,7,0,84,239,47,53,439,315,0,...,7,461,112,1058,179,95,273,69,44,177
318_P4,10,41,0,54,41,70,10,230,8,8,...,62,45,118,32,320,24,53,146,4,8
320_CTRL,0,60,0,144,0,345,0,1423,1,43,...,17,17,31,12,47,7,54,4,0,23
320_DHT,576,4,6,188,511,50,576,165,1121,2,...,12,107,110,258,222,69,2428,165,35,474


# Add labels

In [8]:
def convert_symbol_to_label(sequence):
    label_type = sequence.split("_")[-1]
    if label_type == "CTRL":
        return 0    
    elif label_type == "DHT":
        return 1    
    elif label_type == "P4":
        return 2
    raise Exception("Wrong label:", sequence)

In [9]:
df = df.reset_index()
df = df.rename(columns={"index":"label"})
df.head()

Unnamed: 0,label,KLK3,ITIH2,SULT1C3,SCGB2A1,KLK2,LRP2,KLK3.1,CELSR1,SLC26A3,...,KIAA1549L,RAB3B,CLEC7A,ADCY2,SPHKAP,CNMD,SERHL2,HPGD,GLYATL1,FGFR4
0,318_CTRL,1,92,0,172,6,139,1,641,0,...,15,66,36,6,103,13,46,169,3,16
1,318_DHT,53,7,0,84,239,47,53,439,315,...,7,461,112,1058,179,95,273,69,44,177
2,318_P4,10,41,0,54,41,70,10,230,8,...,62,45,118,32,320,24,53,146,4,8
3,320_CTRL,0,60,0,144,0,345,0,1423,1,...,17,17,31,12,47,7,54,4,0,23
4,320_DHT,576,4,6,188,511,50,576,165,1121,...,12,107,110,258,222,69,2428,165,35,474


In [10]:
df["label"] = df["label"].apply(convert_symbol_to_label)
df.head()

Unnamed: 0,label,KLK3,ITIH2,SULT1C3,SCGB2A1,KLK2,LRP2,KLK3.1,CELSR1,SLC26A3,...,KIAA1549L,RAB3B,CLEC7A,ADCY2,SPHKAP,CNMD,SERHL2,HPGD,GLYATL1,FGFR4
0,0,1,92,0,172,6,139,1,641,0,...,15,66,36,6,103,13,46,169,3,16
1,1,53,7,0,84,239,47,53,439,315,...,7,461,112,1058,179,95,273,69,44,177
2,2,10,41,0,54,41,70,10,230,8,...,62,45,118,32,320,24,53,146,4,8
3,0,0,60,0,144,0,345,0,1423,1,...,17,17,31,12,47,7,54,4,0,23
4,1,576,4,6,188,511,50,576,165,1121,...,12,107,110,258,222,69,2428,165,35,474


# Save Dataframe

In [11]:
df.to_csv("../data/train_not_normalized.csv", index=False)