In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os

from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Put the parent folder of this notebook on the import path so the `src`
# package imported below can be found.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.dataset import make_dataset
from src.preprocessing import build_features
from src.preprocessing import data_cleaning
from src.preprocessing import preprocessing
from src.models import train_model
from src.visualizations import visualize

In [3]:
# set random state
# Seed reused by the stochastic steps below (passed to train_test_split as
# random_state) so results are repeatable between runs.
rand_state = 1

# Load files into memory

In [None]:
# Raw input tables. The notebook imports `src` from its parent folder and the
# `run.py test` section further down reads the data folder from the parent
# folder as well, so the paths are anchored at os.pardir instead of the
# notebook's own working directory.
counts_filename = os.path.join(
    os.pardir,
    'data/count_data_species_raw_WIS_overlapping_fungi_bacteria_12773samples.tsv',
)
metadata_filename = os.path.join(
    os.pardir,
    'data/metadata_species_WIS_overlapping_fungi_bacteria_12773samples.tsv',
)

In [None]:
# Read the raw count and metadata tables with the project's dataset loader.
# The loader module is imported at the top of the notebook as `make_dataset`;
# the `md` alias used here before was never defined, so the cell failed with a
# NameError on a fresh kernel.
counts = make_dataset.read_fungi_data(counts_filename)
metadata = make_dataset.read_fungi_data(metadata_filename)

# Turn the 'Not available' placeholder strings into real NaNs so pandas'
# missing-value helpers (isna / notna / dropna) work on them.
metadata = metadata.replace('Not available', np.nan)

#TODO Merge this later?
#combined = pd.merge(metadata, counts, on="sampleid", how="inner")

print(f'Metadata Shape:\t{metadata.shape}')
print(f'Counts Shape:\t{counts.shape}')
#print('Combined Shape:\t' + str(combined.shape))

In [None]:
# # note: 76% of samples don't have a days_to_death (DTD) value
# print("Days to Death - NA: " + str(round(combined["days_to_death"].isna().mean(), 3) * 100) + '%')

# # 3036 data points to work with
# print("Rows remaining: " + str(combined["days_to_death"].notna().sum()))

# plt.boxplot(combined['days_to_death'], vert=False)
# plt.show()

In [None]:
# Display the pathologic_stage_label column of the metadata.
metadata["pathologic_stage_label"]

In [None]:
# Join metadata and counts on the sample id; the inner join keeps only samples
# that appear in both tables.
combined = pd.merge(metadata, counts, on="sampleid", how="inner")

# The cleaning helpers live in the module imported at the top of the notebook
# as `data_cleaning`; the `dc` alias used here before was never defined, so the
# cell failed with a NameError on a fresh kernel.
combined = data_cleaning.filter_metadata(combined)

# Apply the same stage reduction to each of the staging label columns.
for stage_col in ('pathologic_t_label', 'pathologic_n_label', 'pathologic_stage_label'):
    combined[stage_col] = data_cleaning.reduce_stages(combined[stage_col])

combined.shape

## Metadata Regression

In [None]:
# TODO Create OrdinalEncoder for ordinal features in preprocessing.py

In [None]:
# Preview the first rows of the preprocessed metadata. The module is imported
# at the top of the notebook as `preprocessing`; the `pp` alias used here
# before was never defined, so the cell failed with a NameError.
preprocessing.preprocess_metadata(metadata).head()

## Regression Model

In [None]:
# Keep only the samples for which days_to_death is known.
reg_data = combined.dropna(subset=["days_to_death"])

In [None]:
# Shape of the regression subset (rows with a known days_to_death).
reg_data.shape

In [None]:
# Separate features (X) from the target (Y) and generate the train/test split.

test_prop = 0.1  # fraction of rows held out for testing

# Target is the days_to_death column; features are every column that is not a
# metadata column.
reg_Y = reg_data["days_to_death"]
reg_X = reg_data.drop(columns=metadata.columns)

reg_Xtrain, reg_Xtest, reg_Ytrain, reg_Ytest = train_test_split(
    reg_X,
    reg_Y,
    test_size=test_prop,
    random_state=rand_state,
)

In [None]:
# Fit a linear regression on the training split, then score it on the held-out
# split with mean squared error.
reg_model = LinearRegression().fit(reg_Xtrain, reg_Ytrain)

# Negative predictions are replaced with 0 before scoring.
preds = [p if p > 0 else 0 for p in reg_model.predict(reg_Xtest)]

scores = mean_squared_error(reg_Ytest, preds)
scores

In [None]:
# Residuals (actual minus predicted days_to_death) for the first few test rows.
(reg_Ytest - preds).head()

## run.py test

In [4]:
# Argument list checked by the next cell (it looks for "test" in it);
# presumably mirrors the command-line arguments of run.py — TODO confirm.
args = ["cs"]

In [6]:
# Choose the input files: the small test fixtures when "test" was requested,
# otherwise the full data set.
metadata_filename, counts_filename = (
    ("data/test/test_metadata.tsv", "data/test/test_fungi.tsv")
    if "test" in args
    else (
        'data/metadata_species_WIS_overlapping_fungi_bacteria_12773samples.tsv',
        'data/count_data_species_raw_WIS_overlapping_fungi_bacteria_12773samples.tsv',
    )
)

# The data folder is addressed relative to the parent directory.
metadata_filename = os.path.join(os.pardir, metadata_filename)
counts_filename = os.path.join(os.pardir, counts_filename)

# Load fungi counts and metadata into memory.
counts = make_dataset.read_fungi_data(counts_filename)
raw_metadata = make_dataset.read_fungi_data(metadata_filename)

# Replace the 'Not available' placeholder strings with NaN; the unmodified
# table stays available as raw_metadata.
metadata = raw_metadata.replace('Not available', np.nan)
metadata.shape

(12773, 41)

In [7]:
# Name of the target column.
cancer_stage = "pathologic_stage_label"

# Clean the cancer stage column, then keep only the rows whose stage is one of
# Stage I, II, III or IV.
metadata[cancer_stage] = data_cleaning.reduce_stages(metadata[cancer_stage])
metadata = metadata.loc[
    metadata[cancer_stage].isin(["Stage I", "Stage II", "Stage III", "Stage IV"])
]

# Restrict the counts table to the samples that survived the filter.
counts = counts.loc[metadata.index]

# One-hot encode the stage labels as the prediction target.
Y = build_features.OHE_col(metadata[cancer_stage])
metadata.shape

(8643, 41)

In [8]:
# Build the feature matrix: every metadata column except the target stage
# label, with missing values swapped for the string "NAN" before the project's
# preprocessing step runs.
# NOTE(review): the replacement is applied to the whole frame, so any numeric
# columns receive the "NAN" string too — confirm preprocess_metadata expects that.
X = preprocessing.preprocess_metadata(
    metadata.drop(columns=cancer_stage).replace(np.nan, "NAN")
)
#X = pd.merge(X, counts, on="sampleid", how="inner")
X



Unnamed: 0_level_0,analyte_amount,analyte_A260A280Ratio,aliquot_concentration,age_at_diagnosis,days_to_death,x0_RNA-Seq,x0_WGS,x1_FEMALE,x1_MALE,x2_AMERICAN INDIAN OR ALASKA NATIVE,...,x15_Stomach Adenocarcinoma Diffuse Type,x15_Stomach Adenocarcinoma Not Otherwise Specified (NOS),x15_Stomach Intestinal Adenocarcinoma Not Otherwise Specified (NOS),x15_Stomach Intestinal Adenocarcinoma Tubular Type,x15_Stomach Adenocarcinoma Signet Ring Type,x15_Stomach Intestinal Adenocarcinoma Mucinous Type,x15_Stomach Intestinal Adenocarcinoma Papillary Type,x15_Thyroid Papillary Carcinoma - Classical/usual,x15_Thyroid Papillary Carcinoma - Follicular (>= 99% follicular patterned),x15_Thyroid Papillary Carcinoma - Tall Cell (>= 50% tall cell features)
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13722.58cfa82de4b0c9d6adf6a4c2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.494404,,-1.708543,73.0,2361.0
13722.58cfa82de4b0c9d6adf6a4ce,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.268262,2.40654,-1.708543,34.0,NAN
13722.58cfa82de4b0c9d6adf6a46d,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.531953,0.909373,-1.708543,50.0,1142.0
13722.58cfa82de4b0c9d6adf6a52c,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.390489,0.510128,-1.968077,69.0,NAN
13722.58cfa82de4b0c9d6adf6a59b,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.492859,,-1.708543,49.0,NAN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13767.58cfa83ce4b0c9d6adf72fc3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.250956,0.310505,0.108192,73.0,NAN
13767.58cfa832e4b0c9d6adf6d50e,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,-0.423324,1.208806,0.886792,63.0,NAN
13767.58cfa832e4b0c9d6adf6d64a,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.515265,-0.18855,0.367725,50.0,NAN
13767.58cfa830e4b0c9d6adf6c2fc,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,-0.120076,,0.108192,59.0,NAN


In [None]:
# Execute run.py with the arguments "test" and "cs".
%run run.py test cs

In [None]:
# Execute run.py with the arguments "test" and "dtd".
%run run.py test dtd

In [None]:
# List the column names of the metadata table.
metadata.columns

In [None]:
# Display the days_to_death column of the metadata table.
metadata["days_to_death"]