In [1]:
import numpy as np
import pandas as pd
import os
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold, train_test_split


from sklearn import preprocessing
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from preprocess_data import preprocess
from load_data import load_data, load_files, load_yang

In [2]:
fudan_filepath = 'data/Yang_PRJNA763023/Yang_PRJNA763023_SE/parsed/normalized_results/'

In [12]:
def load_tsv_files(folder):
    # Initialize an empty dictionary to store the variables
    var_dict = {}
    
    # Iterate over the files in the folder
    for file in os.listdir(folder):
        # Only consider files with the '.tsv' extension
        if file.endswith('.tsv'):
            # Load the data into a pandas DataFrame
            df = pd.read_csv(os.path.join(folder, file), sep='\t') # skiprows=1
            if len(df.columns) == 1:
                df = pd.read_csv(os.path.join(folder, file), sep='\t', skiprows=1)
                df = df.transpose()
                df.columns = df.iloc[0]
                df.drop(index=df.index[0], axis=0, inplace=True)
                #print(file)
            else:
                #print(file)
                columnNames = df.columns.tolist()
                firstColName = columnNames[0]
                df.set_index(firstColName, inplace=True)
            # Get the file name without the '.tsv' extension
            name = os.path.splitext(file)[0]
            
            # Assign the DataFrame to a variable with the file name
            globals()[name] = df
            
            # Add the variable to the dictionary with the file name as the key
            var_dict[name] = globals()[name]
    
    # Return the dictionary of variables
    return var_dict

In [13]:
def load_data(filepath):

    data_files = load_tsv_files(filepath)

    for key, value in data_files.items():
        globals()[key] = value

In [14]:
load_data(fudan_filepath)

In [15]:
global_vars = globals()

In [16]:
file_names = list(("pielou_e_diversity", "simpson_diversity", "phylum_relative", "observed_otus_diversity", "family_relative",
"class_relative", "fb_ratio", "enterotype", "genus_relative", "species_relative", "shannon_diversity", "domain_relative",
"order_relative", "simpson_e_diversity"))

In [17]:
def load_metadata(filepath):
    data = pd.read_csv(filepath, sep=",", usecols=["Run", "disease_stat"])
    data.set_index('Run', inplace=True)
    return data

In [18]:
yang_metadata = load_metadata("data/Yang_PRJNA763023/metadata.csv")

In [24]:
for file_name in file_names:
    dataset = globals()[file_name]
    print(file_name)
    data = dataset.join(yang_metadata)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    
    le = preprocessing.LabelEncoder()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
    le.fit(y_train)
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    
    for col in X_train.columns:
        #find max value of column
        max_value_train = np.nanmax(X_train[col][X_train[col] != np.inf])
        #max_value_train = np.nanmax(X_train['my_column'][X_train['my_column'] != np.inf])
        print(max_value_train)
        #replace inf and -inf in column with max value of column 
        X_train[col].replace([np.inf, -np.inf], max_value_train, inplace=True)
        #drop the inf values from the test set
        X_test = X_test.replace([np.inf, -np.inf], np.nan).dropna()
        #get the respective y when we drop observations from the test set
        

    clf = RandomForestClassifier(n_estimators= 100, max_depth=50, random_state=1234, class_weight={0: 1})
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    nu = np.unique(y_test, return_counts=True)
    print(nu)
    p_r_f1_support = precision_recall_fscore_support(y_test, y_pred, average='macro')
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print('The results for ' + file_name + " are: accuracy " + str(acc) + ", conf.mat.:" + str(cm) + ", roc auc:" + str(roc_auc) + ", precision, recall, f1:" + str(p_r_f1_support))
    

pielou_e_diversity
0.8789763060925544
(array([0, 1]), array([112, 107]))
The results for pielou_e_diversity are: accuracy 0.4840182648401826, conf.mat.:[[59 53]
 [60 47]], roc auc:0.48301902536715624, precision, recall, f1:(0.48289915966386554, 0.48301902536715624, 0.48246439550787373, None)
simpson_diversity
0.98669128
(array([0, 1]), array([112, 107]))
The results for simpson_diversity are: accuracy 0.5251141552511416, conf.mat.:[[59 53]
 [51 56]], roc auc:0.5250751001335114, precision, recall, f1:(0.5250625521267723, 0.5250751001335113, 0.525025025025025, None)
phylum_relative
0.002
0.0132
0.8686
0.8804
0.0002
0.0002
0.0068
0.0042
0.0004
0.2662
0.0004
0.0004
0.005
0.0008
0.068
0.0002
0.9878
0.657
0.0024
0.0004
0.001
0.0
0.0192
0.0002
0.9886
0.0
0.0024
0.0758
0.0002
0.568
0.022000000000000002
0.1728
(array([0, 1]), array([112, 107]))
The results for phylum_relative are: accuracy 0.5981735159817352, conf.mat.:[[76 36]
 [52 55]], roc auc:0.5962950600801069, precision, recall, f1:(0.599

ValueError: Found input variables with inconsistent numbers of samples: [219, 211]

In [None]:
#np.any(np.isnan(fb_ratio))
#np.all(np.isfinite(fb_ratio))
#fb_ratio = fb_ratio.replace([np.inf, -np.inf], np.nan).dropna()
#fb_ratio