## This notebook splits the full dataset into train, validation, and test subsets and saves each subset as a pandas DataFrame in pickle format

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool, cv, CatBoostError
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.metrics import classification_report
from catboost.utils import get_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

## Set up file paths

In [2]:
## input: directory holding the aggregated dataset and the pickled full dataset inside it
data_dir = "/global/homes/z/zimingy/KE-Catboost/ziming/GO/data/go_aggregated_4.1"
data_file = os.path.join(data_dir, 'go_aggregated_4.1_mixed_updated_normalized_balanced.pkl')
## output: one pickle per subset, all written under the 'splited_dataset' sub-directory
split_dir = os.path.join(data_dir, 'splited_dataset')
train_pickle, val_pickle, test_pickle = (
    os.path.join(split_dir, file_name)
    for file_name in ('train_set.pkl', 'val_set.pkl', 'test_set.pkl')
)

### load the full dataset

In [3]:
%%time
df = pd.read_pickle(data_file)
null_vals = df.isnull().sum(axis=0)
assert len(null_vals[null_vals != 0]) == 0
df

CPU times: user 2.17 s, sys: 3.85 s, total: 6.02 s
Wall time: 5.93 s


Unnamed: 0,biome,exptype,GO:0043130,GO:0055074,GO:0055117,GO:0046933,GO:0006302,GO:0008643,GO:0043752,GO:0007026,...,GO:0019357,GO:0006527,GO:0004114,GO:0046423,GO:0034194,GO:0032183,GO:0007618,GO:0030097,GO:0004520,GO:0033739
0,root:Engineered:Biogas plant,0,0.020967,0.0,0.0,0.002165,0.009009,0.000020,0.002570,0.0,...,0.037736,0.002910,0.000000,0.0,0.000332,0.0,0.0,0.0,0.009231,0.003180
1,root:Engineered:Biogas plant,0,0.027386,0.0,0.0,0.002714,0.010431,0.000040,0.003160,0.0,...,0.056604,0.003254,0.000278,0.0,0.000310,0.0,0.0,0.0,0.011538,0.003835
2,root:Engineered:Biogas plant,0,0.001712,0.0,0.0,0.004133,0.031769,0.000759,0.006024,0.0,...,0.245283,0.003810,0.000833,0.0,0.000199,0.0,0.0,0.0,0.025928,0.004583
3,root:Engineered:Biogas plant,0,0.000000,0.0,0.0,0.002146,0.021811,0.000180,0.003370,0.0,...,0.037736,0.001349,0.000000,0.0,0.000089,0.0,0.0,0.0,0.014932,0.001871
4,root:Engineered:Biogas plant,0,0.001712,0.0,0.0,0.005841,0.038407,0.000958,0.006698,0.0,...,0.547170,0.004392,0.000000,0.0,0.000532,0.0,0.0,0.0,0.029367,0.007576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,root:Host-associated:Porifera,1,0.007702,0.0,0.0,0.021603,0.014225,0.002696,0.023044,0.0,...,0.000000,0.008784,0.021926,0.0,0.000089,0.0,0.0,0.0,0.078778,0.035170
89996,root:Host-associated:Porifera,1,0.024818,0.0,0.0,0.004508,0.000000,0.000240,0.004297,0.0,...,0.000000,0.002858,0.002035,0.0,0.001196,0.0,0.0,0.0,0.019095,0.011505
89997,root:Host-associated:Porifera,1,0.028241,0.0,0.0,0.001540,0.009009,0.002536,0.000758,0.0,...,0.000000,0.000106,0.035248,0.0,0.000997,0.0,0.0,0.0,0.003891,0.001216
89998,root:Host-associated:Porifera,1,0.000000,0.0,0.0,0.000019,0.000000,0.000000,0.000042,0.0,...,0.000000,0.000026,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000181,0.000094


## split the dataset

In [4]:
## split the full dataset into train and rest at ratio 7:3
## stratify on 'biome' so both parts keep the biome proportions of the full
## dataset and every biome is present on each side of the split
## (random_state is fixed so the split is reproducible)
df_train, df_rest = train_test_split(
    df,
    train_size=0.7,
    random_state=42,
    stratify=df['biome'],
)

In [5]:
## sanity check: the train part and the remaining part must contain the same set of biome labels
biomes_in_train = set(df_train['biome'])
biomes_in_rest = set(df_rest['biome'])
assert biomes_in_train == biomes_in_rest

In [6]:
## split the remaining 30% into validation and test at ratio 67:33
## (i.e. roughly 20% and 10% of the full dataset)
## stratify on 'biome' so validation and test keep the same biome proportions
## and every biome is present in both subsets
df_val, df_test = train_test_split(
    df_rest,
    train_size=0.67,
    random_state=42,
    stratify=df_rest['biome'],
)

In [7]:
## sanity check: train, validation and test must all contain the same set of biome labels
biomes_in_train = set(df_train['biome'])
biomes_in_val = set(df_val['biome'])
biomes_in_test = set(df_test['biome'])
assert biomes_in_train == biomes_in_val and biomes_in_val == biomes_in_test

## Save the newly generated subsets

In [8]:
## save each subset DataFrame to its pickle file
## the output directory is created first, because to_pickle does not create
## missing parent directories and would fail on a fresh data directory
for subset_df, pickle_path in (
    (df_train, train_pickle),
    (df_val, val_pickle),
    (df_test, test_pickle),
):
    out_dir = os.path.dirname(pickle_path)
    if out_dir:  # a bare file name has no directory component to create
        os.makedirs(out_dir, exist_ok=True)
    subset_df.to_pickle(pickle_path)