### Data splitting to meta data and numericals

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import bz2

In [2]:
INIT_INPUT_DIR = '../metaml/data/'
DATA_ORIGINAL_DIR = '../data-original/'
DATA_DIR = '../data/'
LOG_DIR = '../logs/'

ABUNDANCE_FILE = 'abundance.txt'

In [3]:
def exists(folder, file):
    """
    Check if a file exists in a folder
    """
    return os.path.exists(os.path.join(folder, file))

In [4]:
def convert_bz2_txt(file_name):
    """
    Opens bz2 file, reads the zipped data and writes it into the data directory
    Returns - the path to the unzipped file
    """
    input_file_path = os.path.join(INIT_INPUT_DIR, file_name)

    with bz2.BZ2File(input_file_path, 'rb') as f:
        data = f.read()
    
    output_file_path = os.path.join(DATA_ORIGINAL_DIR, file_name)

    with open(output_file_path, 'wb') as f:
        f.write(data)

    return output_file_path

In [5]:
abundance_file_path_og = None

if(exists(DATA_ORIGINAL_DIR, ABUNDANCE_FILE)):
    abundance_file_path_og = os.path.join(DATA_ORIGINAL_DIR, ABUNDANCE_FILE)
    print(f"File {abundance_file_path_og} already exists, skipping conversion...")
else:
    print("Converting file...")
    abundance_file_path_og = convert_bz2_txt(ABUNDANCE_FILE + '.bz2')
    print("Stored in ", abundance_file_path_og)

assert abundance_file_path_og is not None

File ../data-original/abundance.txt already exists, skipping conversion...


In [6]:
def read_txt(folder_name, file_name):
    """
    Reads the data from the txt file
    Returns - the data as a pandas dataframe
    """
    input_file_path = os.path.join(folder_name, file_name)
    data = pd.read_csv(input_file_path, sep='\t', index_col=0, header=None)
    return data

def store_csv(data: pd.DataFrame, folder_name, file_name):
    """
    Stores the data into a csv file in the data directory
    Returns - the path to the csv file
    """
    output_file_path = os.path.join(folder_name, file_name)
    data.to_csv(output_file_path, index=False)
    return output_file_path

In [7]:
abundance_file_path = None

if(exists(DATA_DIR, ABUNDANCE_FILE)):
    abundance_file_path = os.path.join(DATA_DIR + ABUNDANCE_FILE)
    print(f"File {abundance_file_path} already exists, skipping conversion...")
else:
    print("Converting file...")
    data = read_txt(DATA_ORIGINAL_DIR, ABUNDANCE_FILE)
    data.describe()
    data_transposed = data.transpose()
    abundance_file_path = store_csv(data_transposed, DATA_DIR, ABUNDANCE_FILE)
    print("Stored in ", abundance_file_path)

File ../data/abundance.txt already exists, skipping conversion...


In [8]:
data = pd.read_csv(abundance_file_path)

if(not exists(LOG_DIR, 'abundance_columns.txt')):
    with open(os.path.join(LOG_DIR, 'abundance_columns.txt'), 'w') as f:
        for col in data.columns:
            f.write(col + '\n')

  data = pd.read_csv(abundance_file_path)


In [9]:
# First 211 columns are metadata
metadata = data.iloc[:, :211]

# The rest are abundance values
data_only_abundance_values = data.iloc[:, 211:]

In [10]:
if(not exists(DATA_DIR, 'abundance_values.csv')):
    print("Storing abundance values...")
    data_only_abundance_values.to_csv(os.path.join(DATA_DIR, 'abundance_values.csv'), index=False)

if(not exists(DATA_DIR, 'metadata.csv')):
    print("Storing metadata...")
    metadata.to_csv(os.path.join(DATA_DIR, 'metadata.csv'), index=False)

if(not exists(LOG_DIR, "metadata_columns.txt")):
    print("Storing metadata columns...")
    with open(os.path.join(LOG_DIR, "metadata_columns.txt"), 'w') as f:
        for col in metadata.columns:
            f.write(col + ", " + metadata[col].dtype.name + '\n')

if(not exists(LOG_DIR, 'abundance_values_columns.txt')):
    print("Storing abundance values columns...")
    with open(os.path.join(LOG_DIR, 'abundance_values_columns.txt'), 'w') as f:
        for col in data_only_abundance_values.columns:
            f.write(col + ", " + data_only_abundance_values[col].dtype.name + '\n')

In [11]:
# check for NAN values

nan_values = data_only_abundance_values.isna().sum().sum()
print(f"Number of NAN values: {nan_values}")

Number of NAN values: 0


In [12]:
metadata.head()

Unnamed: 0,dataset_name,sampleID,subjectID,bodysite,disease,age,gender,country,sequencing_technology,pubmedid,...,statins,insulin,oral_anti-diabetic_medication,years_in_sweden,tnm_stage,ajcc_stage,localization,fobt,wif-1_gene_methylation_test,group
0,Candela_Africa,H10,h10,stool,n,40,female,tanzania,Illumina,25981789,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd
1,Candela_Africa,H11,h11,stool,n,29,female,tanzania,Illumina,25981789,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd
2,Candela_Africa,H12,h12,stool,n,8,female,tanzania,Illumina,25981789,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd
3,Candela_Africa,H13,h13,stool,n,34,male,tanzania,Illumina,25981789,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd
4,Candela_Africa,H14,h14,stool,n,30,male,tanzania,Illumina,25981789,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd


In [13]:
# Store the describe content in a file

if(not exists(LOG_DIR, 'metadata_describe.txt')):
    with open(os.path.join(LOG_DIR, 'metadata_describe.txt'), 'w') as f:
        # write properties of each column of metadata in a file
        for i, col in enumerate(metadata.columns):
            f.write(f"Number: {i}, Column: {col}\n")
            f.write(str(metadata[col].describe()) + '\n\n')

In [14]:
# is there any unique column in metadata?

unique_columns = []
for col in metadata.columns:
    if len(metadata[col].unique()) == metadata.shape[0]:
        unique_columns.append(col)

print(f"Unique columns: {unique_columns}")
print(f"Number of unique columns: {len(unique_columns)}")

Unique columns: []
Number of unique columns: 0


In [15]:
# add a new column in a new dataframe called UniqueID which is a combination of dataset_name, sampleID and subjectID
metadata_new = metadata.copy()

# Convert all columns to string
metadata_new['dataset_name'] = metadata_new['dataset_name'].astype(str)
metadata_new['sampleID'] = metadata_new['sampleID'].astype(str)
metadata_new['subjectID'] = metadata_new['subjectID'].astype(str)

metadata_new['UniqueID'] = metadata_new['dataset_name'] + "_" +  metadata_new['sampleID'] + "_" + metadata_new['subjectID']

# show UniqueID column in the beginning
metadata_new = metadata_new[['UniqueID'] + [col for col in metadata_new.columns if col != 'UniqueID']]

# Store new metadata in a csv file
if(not exists(DATA_DIR, 'metadata_new.csv')):
    metadata_new.to_csv(os.path.join(DATA_DIR, 'metadata_new.csv'), index=False)

In [16]:
data_only_abundance_values.head()

Unnamed: 0,k__Archaea,k__Archaea|p__Euryarchaeota,k__Archaea|p__Euryarchaeota|c__Methanobacteria,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_smithii|t__Methanobrevibacter_smithii_unclassified,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter|s__Methanobrevibacter_unclassified,k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanosphaera,...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Enterococcaceae|g__Enterococcus|s__Enterococcus_gilvus|t__Enterococcus_gilvus_unclassified,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_otakiensis,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Lactobacillaceae|g__Lactobacillus|s__Lactobacillus_otakiensis|t__GCF_000415925,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Peptococcaceae,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Peptococcaceae|g__Desulfotomaculum,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Peptococcaceae|g__Desulfotomaculum|s__Desulfotomaculum_ruminis,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Peptococcaceae|g__Desulfotomaculum|s__Desulfotomaculum_ruminis|t__GCF_000215085,k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Ruminococcaceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__GCF_000209855,k__Bacteria|p__Firmicutes|c__Negativicutes|o__Selenomonadales|f__Veillonellaceae|g__Megasphaera|s__Megasphaera_sp_BV3C16_1,k__Bacteria|p__Firmicutes|c__Negativicutes|o__Selenomonadales|f__Veillonellaceae|g__Megasphaera|s__Megasphaera_sp_BV3C16_1|t__GCF_000478965
0,0.24169,0.24169,0.24169,0.24169,0.24169,0.24169,0.24169,0.24169,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.50621,0.50621,0.50621,0.50621,0.50621,0.50621,0.50621,0.50621,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.30522,0.30522,0.30522,0.30522,0.30522,0.30522,0.30522,0.30522,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.40133,0.40133,0.40133,0.40133,0.40133,0.35994,0.35994,0.35994,0.0,0.04139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.17479,0.17479,0.17479,0.17479,0.17479,0.16331,0.1528,0.1528,0.01051,0.01148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
data_only_abundance_values_new = data_only_abundance_values.copy()
data_only_abundance_values_new['UniqueID'] = metadata_new['UniqueID']

# Show UniqueID column in the beginning
data_only_abundance_values_new = data_only_abundance_values_new[['UniqueID'] + [col for col in data_only_abundance_values_new.columns if col != 'UniqueID']]

# Store new data in a csv file
if(not exists(DATA_DIR, 'abundance_values_new.csv')):
    data_only_abundance_values_new.to_csv(os.path.join(DATA_DIR, 'abundance_values_new.csv'), index=False)

In [18]:
data_new = data.copy()
data_new['UniqueID'] = metadata_new['UniqueID']

# Show UniqueID column in the beginning
data_new = data_new[['UniqueID'] + [col for col in data_new.columns if col != 'UniqueID']]

# Store new data in a csv file
if(not exists(DATA_DIR, 'abundance_with_unique.csv')):
    data_new.to_csv(os.path.join(DATA_DIR, 'abundance_with_unique.csv'), index=False)

In [19]:
# Clean every data in memory except metadata_new and data_only_abundance_values_new
del data
del metadata
del data_only_abundance_values

In [20]:
# total number of items in abundance_values_new excluding the UniqueID column

total_items = data_only_abundance_values_new.shape[0] * data_only_abundance_values_new.shape[1]
total_items -= data_only_abundance_values_new.shape[0] # remove the UniqueID column

print(f"Total number of items in abundance_values_new excluding the UniqueID column: {total_items}")

# zeros in the data

zeros = (data_only_abundance_values_new == 0).sum().sum()
print(f"Number of zeros in the data: {zeros}")

print("Percentage of zeros in the data: ", zeros / total_items * 100)

Total number of items in abundance_values_new excluding the UniqueID column: 11920220
Number of zeros in the data: 11070260
Percentage of zeros in the data:  92.86959468868862


In [21]:
metadata_new.head()

Unnamed: 0,UniqueID,dataset_name,sampleID,subjectID,bodysite,disease,age,gender,country,sequencing_technology,...,statins,insulin,oral_anti-diabetic_medication,years_in_sweden,tnm_stage,ajcc_stage,localization,fobt,wif-1_gene_methylation_test,group
0,Candela_Africa_H10_h10,Candela_Africa,H10,h10,stool,n,40,female,tanzania,Illumina,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd
1,Candela_Africa_H11_h11,Candela_Africa,H11,h11,stool,n,29,female,tanzania,Illumina,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd
2,Candela_Africa_H12_h12,Candela_Africa,H12,h12,stool,n,8,female,tanzania,Illumina,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd
3,Candela_Africa_H13_h13,Candela_Africa,H13,h13,stool,n,34,male,tanzania,Illumina,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd
4,Candela_Africa_H14_h14,Candela_Africa,H14,h14,stool,n,30,male,tanzania,Illumina,...,nd,nd,nd,nd,nd,nd,nd,nd,nd,nd


In [22]:
metadata_new['bodysite'].unique()

array(['stool', 'nd', 'tongue_dorsum', 'buccal_mucosa',
       'supragingival_plaque', 'anterior_nares', 'posterior_fornix',
       'l_retroauricular_crease', 'r_retroauricular_crease', 'saliva',
       'keratinized_gingiva', 'palatine_tonsils', 'throat',
       'subgingival_plaque', 'vaginal_introitus', 'mid_vagina',
       'hard_palate', 'gomito_dx', 'gomito_sx',
       'solco_retroauricolare_dx', 'solco_retroauricolare_sx', 'skin',
       'mother_faeces', 'infant_faeces', 'milk'], dtype=object)

In [23]:
# Properties of the dataset

print(f"Total number of samples: {metadata_new.shape[0]}")
print(f"Total number of metagenomic numberical coloumns: {data_only_abundance_values_new.shape[1] - 1}")
print(f"Total number of metadata columns: {metadata_new.shape[1] - 1}")

Total number of samples: 3610
Total number of metagenomic numberical coloumns: 3302
Total number of metadata columns: 211


In [24]:
if(not exists(LOG_DIR, 'metadata_cols_unique.txt')):
    with open(os.path.join(LOG_DIR, 'metadata_cols_unique.txt'), 'w') as f:
        for col in metadata_new.columns:
            f.write(f"Column: {col}, Unique Count: {metadata_new[col].nunique()}\n")
            f.write(str(metadata_new[col].unique()) + '\n\n')