In [58]:
import os, sys
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
import glob 
import pickle 


from datetime import datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 
from IPython.core.display import display, HTML    
display(HTML("<style>.container {width:98% !important; }</style>"))
%matplotlib inline
np.set_printoptions(precision=5, suppress=True) 

DATE = datetime.now().strftime('%Y-%m-%d')

In [189]:
import xgboost as xgb

In [180]:
# Make the manuscript helper module importable (hard-coded absolute path on the cluster).
sys.path.append("/dors/capra_lab/users/abraha1/projects/PTB_phenotyping/scripts/rand_forest_ptb_classification/manuscript/0_helper_func")
from manip_trained_models_funcs import unpack_input_data, upickle_xgbmodel, extract_train_df

# Make the project-level hyperparam_tune module importable (hard-coded absolute path).
# Of the names imported in this cell, only unpack_input_data and create_held_out
# are called in the cells shown in this notebook.
sys.path.append('/dors/capra_lab/users/abraha1/projects/PTB_phenotyping/scripts/rand_forest_ptb_classification')
from hyperparam_tune import initialize, validate_best_model, create_held_out

In [134]:
# PATHS
# Results directory that holds the `input_data/` and `best_model/` sub-folders read below.
ROOT_DATA_DIR = "/dors/capra_lab/users/abraha1/projects/PTB_phenotyping/results/ptb_predict_machine_learning/2019-01-16_xgboost_hyperopt_icd_cpt_raw_counts" 
# Directory where the random feature matrix, feather file and DMatrix binaries are written.
OUTPUT_DIR = "/dors/capra_lab/users/abraha1/projects/PTB_phenotyping/data/ptb_predict_machine_learning/feature_matrices/rand_matrix"
# Input table and pickled model; both are handed to unpack_input_data() further down.
icd_input = os.path.join(ROOT_DATA_DIR, 'input_data', 'input_data_all_icd9_count_subset-2019-01-25.tsv')
icd_model = os.path.join(ROOT_DATA_DIR, 'best_model','best_xgb_model_all_icd9_count_subset-2019-01-25.pickle')

# functions

In [98]:
def strat_rand_sample(X_input, y_input, n_samples=100, seed=0):
    # DEPRECATED~
    """
    Draw a stratified random sample of rows from a binary-labelled dataset.

    Parameters
    ----------
    X_input : pandas.DataFrame
        Feature matrix, one row per GRID and one column per feature.
    y_input : array-like
        Labels for the rows of X_input (same length, same order). Must
        contain exactly two distinct values.
    n_samples : int
        Total number of rows to draw.
    seed : int
        Random state passed to DataFrame.sample for both classes.

    Returns
    -------
    pandas.DataFrame
        n_samples rows of X_input plus a 'y_label' column. Majority-class
        rows come first, then minority-class rows. The share of minority rows
        matches (after rounding) the minority share in y_input.

    Raises
    ------
    ValueError
        If y_input does not contain exactly two classes, or (raised by
        DataFrame.sample) if more rows are requested from a class than exist.
    """
    # np.unique returns the sorted distinct labels and their counts
    yvals, ycounts = np.unique(y_input, return_counts=True)
    if len(yvals) != 2:
        raise ValueError(
            'expected exactly 2 label classes, found {}'.format(len(yvals)))

    # Pick majority/minority by count instead of assuming a label order.
    # Ties resolve to the second label being "majority".
    maj_idx = 0 if ycounts[0] > ycounts[1] else 1
    min_idx = 1 - maj_idx
    ylabel_maj, ylabel_min = yvals[maj_idx], yvals[min_idx]

    # Number of rows per class: the minority share of the *total*, so the
    # sample keeps the class proportion of the input.
    n_minority = int(np.round(n_samples * ycounts[min_idx] / np.sum(ycounts)))
    n_majority = n_samples - n_minority

    # attach labels to a copy so the caller's frame is left untouched
    X_copy = X_input.copy()
    X_copy['y_label'] = y_input

    Xymaj_df = X_copy.loc[X_copy['y_label'] == ylabel_maj, :]
    Xymin_df = X_copy.loc[X_copy['y_label'] == ylabel_min, :]

    # randomly draw the required number of rows from each class
    sampled_Xmaj = Xymaj_df.sample(n=n_majority, random_state=seed)
    sampled_Xmin = Xymin_df.sample(n=n_minority, random_state=seed)

    return pd.concat([sampled_Xmaj, sampled_Xmin], axis=0)

In [129]:
def create_rand_data(X_train, y_train, n_samples=1000, seed=0):
    """
    Build a random feature matrix and matching random binary labels.

    The feature matrix has n_samples rows and as many columns as X_train.
    Entries are integers drawn uniformly from [0, max(X_train)), i.e. the
    largest value of X_train itself is never produced. The labels contain
    the same fraction of 1s as y_train (rounded) in shuffled order.

    Note: this seeds NumPy's global random number generator with `seed`.

    Returns
    -------
    (X_rand, y_vals) : tuple of numpy.ndarray
        X_rand has shape (n_samples, n_features); y_vals has shape (n_samples,).
    """
    np.random.seed(seed)

    # random integer features, same width as the training matrix
    n_features = X_train.shape[1]
    upper_bound = np.max(np.max(X_train))  # exclusive upper limit for randint
    X_rand = np.random.randint(0, upper_bound, size=(n_samples, n_features))

    # number of positive labels that reproduces the training-set positive rate
    n_pos = int(np.round(n_samples * np.sum(y_train) / len(y_train)))
    n_neg = n_samples - n_pos

    positives = np.tile(np.array([1]), n_pos)
    negatives = np.tile(np.array([0]), n_neg)
    y_vals = np.concatenate((positives, negatives))

    # shuffle happens in place, after the feature draw
    np.random.shuffle(y_vals)

    return X_rand, y_vals


# main: make random data


In [38]:
# Load the ICD-9 input data and the pickled model via the project helper.
# Only X_train and y_train are used by the visible cells below (as inputs to
# create_rand_data). Note that X_train/y_train/X_test/y_test are re-bound
# later by the create_held_out() call.
X_train, y_train, X_test, y_test, xgb_model, this_input_data = unpack_input_data(icd_input, icd_model)

loading input_data_all_icd9_count_subset-2019-01-25.tsv ...
loading best_xgb_model_all_icd9_count_subset-2019-01-25.pickle ...
done loading. took 2.65 minutes




In [130]:
# Create the random dataset: 1000 rows of random integer features with the
# same number of columns as X_train, plus shuffled 0/1 labels whose positive
# rate matches y_train. seed=0 makes the draw reproducible.
X_rand, y_rand = create_rand_data(X_train, y_train, n_samples=1000, seed=0)

In [133]:
# sanity check: expect (n_samples, number of columns in X_train)
X_rand.shape

(1000, 13496)

# format random data

In [174]:
# Wrap the random matrix in a DataFrame with placeholder feature names dummy_0, dummy_1, ...
rand_df = pd.DataFrame(
    X_rand,
    columns=['dummy_{}'.format(col_idx) for col_idx in range(X_rand.shape[1])],
)
# Synthetic row identifiers RAND0, RAND1, ... appended as the last column.
rand_df['GRID'] = ['RAND{}'.format(row_idx) for row_idx in range(len(rand_df))]

In [175]:
# Column order with GRID first, followed by every column except the last one
# (GRID was appended last when rand_df was built).
reordered_cols = ['GRID'] + list(rand_df.columns[:-1])

In [176]:
# Write the random feature matrix (GRID column first) to disk.
final_rand_df = rand_df.loc[:, reordered_cols].copy()
# to_csv defaults to a comma separator; pass sep='\t' so that the file content
# actually matches its .tsv extension.
final_rand_df.to_csv(os.path.join(OUTPUT_DIR, 'rand1000_feat_mat.tsv'), sep='\t', index=False)

In [178]:
# Attach the random labels to a copy of the feature table; this combined
# frame is what gets split into training and test sets afterwards.
rand_Xy_df = final_rand_df.assign(y_label=y_rand)

In [179]:
# preview: GRID first, dummy_* feature columns, y_label last
rand_Xy_df.head()

Unnamed: 0,GRID,dummy_0,dummy_1,dummy_2,dummy_3,dummy_4,dummy_5,dummy_6,dummy_7,dummy_8,...,dummy_13487,dummy_13488,dummy_13489,dummy_13490,dummy_13491,dummy_13492,dummy_13493,dummy_13494,dummy_13495,y_label
0,RAND0,684,559,629,192,835,763,707,359,9,...,123,36,272,753,541,364,677,176,478,0
1,RAND1,862,362,457,462,783,199,870,214,426,...,372,650,23,597,507,311,427,278,812,0
2,RAND2,492,264,81,367,851,601,177,622,382,...,665,284,204,573,164,53,817,431,510,0
3,RAND3,310,248,225,468,674,710,827,166,665,...,174,337,228,218,780,809,593,232,787,0
4,RAND4,629,390,652,112,798,226,572,612,238,...,335,17,111,104,839,144,526,140,627,1


In [182]:
# create_held_out is imported from hyperparam_tune and is not shown here.
# Presumably it splits the random data into training and held-out test
# partitions and returns an annotated copy of rand_Xy_df -- TODO confirm.
# NOTE: this re-binds X_train / y_train / X_test / y_test, replacing the
# ICD-9 data loaded earlier in the notebook.
X_train, y_train, X_test, y_test, annotated_df = create_held_out(X_rand, y_rand, rand_Xy_df)

In [190]:
# Feature names for both matrices: every column of annotated_df except the
# first one and the last two (presumably GRID, the label and a column added
# by create_held_out -- TODO confirm against that function).
rand_feature_names = annotated_df.columns[1:-2]

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=rand_feature_names)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=rand_feature_names)


In [193]:
# Output locations for the annotated table and the two DMatrix binaries.
feather_path = os.path.join(OUTPUT_DIR, 'rand1000_feat_mat.tsv.feather')
dtrain_path = os.path.join(OUTPUT_DIR, 'rand1000_dtrain.dmatrix')
dtest_path = os.path.join(OUTPUT_DIR, 'rand1000_dtest.dmatrix')

# The index is reset (and dropped) before the feather write.
annotated_df.reset_index(drop=True).to_feather(feather_path)
dtrain.save_binary(dtrain_path)
dtest.save_binary(dtest_path)