In [1]:
#load packages
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit
from sklearn.metrics import make_scorer, f1_score
from xgboost import XGBClassifier
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
#MarPRISM
#xgboost model, xgboost features
#training data without contamination or low sequence abundance 
#updated software

In [3]:
#load the MarPRISM training table from the parent directory
train = pd.read_csv('../trainingDataMarPRISM.csv')

#keep every column from the third onward; per the original note the two
#skipped columns hold the MMETSP entry IDs and trophic mode labels and the
#remaining columns hold TPM values
trainData = train.iloc[:, 2:]

#load the table that lists the feature Pfams used by the model
features = pd.read_csv('../MarPRISM_featurePfams.csv')

In [4]:
#replace the features table with just its 'pfam' column; the result is used
#below to select columns of the training data
#NOTE(review): this rebinds `features` from a DataFrame to a single column, so
#re-running this cell without re-running the load cell repeats the 'pfam'
#lookup on that column instead of on the table
features = features['pfam']

In [5]:
#create the LabelEncoder that will turn the trophic mode labels into integer
#codes (0, 1, 2 per the original note); it is only instantiated here and is
#fitted later, where the target vector y is built
le = LabelEncoder()

In [6]:
#restrict the training table to the columns named in `features` (the feature Pfams)
trainData = trainData[features]

In [7]:
#target vector: fit the LabelEncoder on the 'Trophic mode' column and
#integer-encode the labels
y = le.fit_transform(train['Trophic mode'])

#feature matrix: fit a MinMaxScaler on the selected feature Pfam columns and
#rescale them (default output range is [0, 1]); later cells index the scaled
#X positionally with X[train_index]
scaler = MinMaxScaler()
X = scaler.fit_transform(trainData)

In [8]:
#split the dataset into training and testing subsets (60% / 40%, fixed seed)
#NOTE(review): none of the visible cells read this hold-out split before the
#cross-validation loops rebind X_train/X_test/y_train/y_test — confirm it is
#still needed
X_train, X_test, y_train, y_test = train_test_split(
    X,           # Features (input variables)
    y,           # Target variable (output/labels)
    test_size=0.4, # Proportion of the dataset to include in the test split (40%)
    random_state=0 # Random seed for reproducibility of the split
)


In [9]:
np.random.seed(7)

#model hyperparameters:
#{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 1.0}
clf = XGBClassifier(
    gamma=0.0,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=1000,
    reg_lambda=1.0,
)

#scorer that reports the weighted-average F1 across classes
f1_scorer = make_scorer(f1_score, average='weighted')

#six stratified shuffle splits with a fixed seed
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one weighted F1 score per split
f1_scores = cross_val_score(clf, X, y, cv=kf, scoring=f1_scorer)
data = {'F1_Scores': f1_scores}

#mean and standard error (sample std with ddof=1 over sqrt of the number of splits)
n_scores = len(f1_scores)
mean_f1 = f1_scores.mean()
std_error_f1 = f1_scores.std(ddof=1) / np.sqrt(n_scores)

#single-row summary for this model
df = pd.DataFrame({
    'Model': ['MarPRISM (xgboost model, xgboost feature Pfams, training data without contamination and low sequence abundance, updated software)'],
    'Mean_F1_Score': [mean_f1],
    'Std_Error_F1_Score': [std_error_f1]
})

#shared results file: created with a header if it does not exist yet,
#otherwise the row is appended without a header
csv_file_path = 'model_overall_f1_scores.csv'
is_new_file = not os.path.exists(csv_file_path)
df.to_csv(csv_file_path, mode='w' if is_new_file else 'a', header=is_new_file, index=False)

In [10]:
np.random.seed(7)

#{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 1.0}
clf = XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, n_estimators=1000, reg_lambda=1.0)

#encoded class labels present in y; passed to f1_score so every fold returns
#one score per class in the same order, even if a class is absent from a fold
class_labels = np.unique(y)

#per-class F1 scoring function (average=None returns one score per label)
def f1_scorer(y_true, y_pred):
    """Return an array with one F1 score per entry of class_labels."""
    return f1_score(y_true, y_pred, labels=class_labels, average=None)

#create a StratifiedShuffleSplit cross-validator
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one array of per-class F1 scores per fold
all_f1_scores = []

#fit on each training split and score the matching test split
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    fold_f1_scores = f1_scorer(y_test, y_pred)
    all_f1_scores.append(fold_f1_scores)

#rows are folds, columns are classes
f1_scores = np.array(all_f1_scores)

#save the F1 scores for each trophic mode separately
trophic_names = [f'Trophic_{i}' for i in range(f1_scores.shape[1])]
data = {trophic_name: f1_scores[:, i] for i, trophic_name in enumerate(trophic_names)}

#prepare a list to store results
results = []

#mean and standard error per class; the loop variable is named class_scores so
#the fold-by-class matrix in f1_scores is not overwritten by the last class
for class_name, class_scores in data.items():
    mean_f1 = np.mean(class_scores)
    std_error_f1 = np.std(class_scores, ddof=1) / np.sqrt(len(class_scores))  # Standard error
    results.append({
        'Model': 'MarPRISM (xgboost model, xgboost feature Pfams, training data without contamination and low sequence abundance, updated software)',
        'Trophic mode': class_name,
        'Mean_F1_Score': mean_f1,
        'Std_Error_F1_Score': std_error_f1
    })

#create a DataFrame from the results
df = pd.DataFrame(results)

#replace Trophic values with desired labels
#NOTE(review): assumes the encoded classes 0/1/2 correspond to Het/Mix/Phot — confirm against le.classes_
df['Trophic mode'] = df['Trophic mode'].replace({'Trophic_0': 'Het', 'Trophic_1': 'Mix', 'Trophic_2': 'Phot'})

#define output csv path
csv_file_path = 'models_byClass_f1_scores.csv'

#append to the shared file; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

In [11]:
#define a list of values of k (number of cross-validation splits) to try
k_values = [6]

#fractions of the data used for training: 0.05, 0.10, ..., 0.95
train_sizes = np.arange(0.05, 1, 0.05)

#one summary dict per (k, train size) combination
results = []

#initialize the classifier (XGBoost)
clf = XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, n_estimators=1000, reg_lambda=1.0)

#encoded class labels present in y; passed to f1_score so each per-class score
#is always filed under its own class, even if a class is absent from a fold
class_labels = np.unique(y)

#loop through different values of k
for k in k_values:
    print(f"\nNumber of splits (k): {k}")

    #loop through different train sizes
    for train_size in train_sizes:
        print(f"Train size: {train_size}")

        #np.arange accumulates floating-point error (e.g. 0.15000000000000002);
        #store the rounded fraction so the results table carries clean values
        f1_scores_dict = {'k': k, 'train_size': round(float(train_size), 2)}

        #initialize StratifiedShuffleSplit cross-validator with the current train_size
        kf = StratifiedShuffleSplit(n_splits=k, train_size=train_size, random_state=7)

        #per-class lists of fold scores, keyed class_<label>_f1
        class_f1_scores_all = {f'class_{label}_f1': [] for label in class_labels}

        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            #fit the XGBoost classifier on this training split
            clf.fit(X_train, y_train)

            #predict on the test set
            y_pred = clf.predict(X_test)

            #calculate F1 scores by class, in the fixed order of class_labels
            class_f1_scores = f1_score(y_test, y_pred, labels=class_labels, average=None)

            #store the class F1 scores
            for label, score in zip(class_labels, class_f1_scores):
                class_f1_scores_all[f'class_{label}_f1'].append(score)

        #calculate mean and standard error of the F1 scores for this train size
        for key, scores in class_f1_scores_all.items():
            f1_scores_dict[f'{key}_mean'] = np.mean(scores)
            f1_scores_dict[f'{key}_se'] = stats.sem(scores)

        #append the results for this train size
        results.append(f1_scores_dict)

#convert the results to a DataFrame for easier analysis and visualization
results_df = pd.DataFrame(results)


Number of splits (k): 6
Train size: 0.05
Train size: 0.1
Train size: 0.15000000000000002
Train size: 0.2
Train size: 0.25
Train size: 0.3
Train size: 0.35000000000000003
Train size: 0.4
Train size: 0.45
Train size: 0.5
Train size: 0.55
Train size: 0.6000000000000001
Train size: 0.6500000000000001
Train size: 0.7000000000000001
Train size: 0.7500000000000001
Train size: 0.8
Train size: 0.8500000000000001
Train size: 0.9000000000000001
Train size: 0.9500000000000001


In [12]:
#remove the 'k' (number of folds) column by name; errors='ignore' keeps this
#cell safe to re-run, whereas dropping "the first column" in place would strip
#a different column on every execution
results_df = results_df.drop(columns='k', errors='ignore')

In [13]:
#rename columns positionally: the seven names are assigned in the current
#column order, so this expects exactly seven columns (the training fraction
#followed by a mean/standard-error pair for each of the three classes)
results_df.columns = ['proportion of training data used', 'Het_Mean_F1_Score', 'Het_F1_Std_Error_F1_Score', 
                      'Mix_Mean_F1_Score', 'Mix_F1_Std_Error_F1_Score', 'Phot_Mean_F1_Score', 'Phot_F1_Std_Error_F1_Score']

In [14]:
#save the results of F1 score versus different percentages of 
#training data used to a CSV file
#NOTE(review): index=False is not passed here (unlike the other CSV writes in
#this notebook), so the row index is written as the first column
results_df.to_csv('marPRISM_k_train_size_vs_f1_score_by_class.csv')

In [15]:
#generate confusion matrix summed across folds of cross-validation

#build the model and the splitter explicitly so this cell does not silently
#reuse whatever `clf`/`kf` an earlier cell left behind (the learning-curve loop
#ends with a splitter that trains on ~95% of the data, which is not the split
#used for the reported F1 scores)
np.random.seed(7)
clf = XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, n_estimators=1000, reg_lambda=1.0)
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#row/column names of the matrix
#NOTE(review): assumes the encoded classes 0/1/2 correspond to Het/Mix/Phot — confirm against le.classes_
class_names = ['Het', 'Mix', 'Phot']
class_labels = np.arange(len(class_names))

#running sum of the per-fold confusion matrices
cumulative_cm = np.zeros((len(class_names), len(class_names)), dtype=int)

for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    #labels= pins the matrix to one row/column per class, so a fold that lacks
    #a class still yields a matrix that can be added to the running sum
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    cumulative_cm += cm

#rows are true classes, columns are predicted classes
cm_df = pd.DataFrame(cumulative_cm, index=class_names, columns=class_names)
cm_df.to_csv('marPRISM_cumulative_confusion_matrix.csv')

In [16]:
#xgboost model, xgboost and random forest features
#training data without contamination or low sequence abundance 
#updated software

In [17]:
#start again from the full training table already held in `train`: keep every
#column from the third onward (per the original note the two skipped columns
#are the MMETSP entry IDs and trophic mode labels)
trainData = train.iloc[:, 2:]

In [18]:
#load the table of xgboost and random forest feature Pfams from the working directory
features = pd.read_csv('Extracted_Pfams_contaminationLowSeqsRemoved_xgModel_xgRFFeatures.csv')

In [19]:
#keep only the 'pfam' column of the features table; it is used below to
#select columns of the training data
features = features['pfam']

In [20]:
#fresh LabelEncoder for this model; it is fitted later, where y is built
#(trophic labels become integer codes, 0, 1, 2 per the original note)
le = LabelEncoder()

In [21]:
#restrict the training table to the columns named in `features` (the feature Pfams)
trainData = trainData[features]

In [22]:
#target vector: fit the LabelEncoder on the 'Trophic mode' column and
#integer-encode the labels
y = le.fit_transform(train['Trophic mode'])

#feature matrix: fit a MinMaxScaler on the selected feature Pfam columns and
#rescale them (default output range is [0, 1]); later cells index the scaled
#X positionally with X[train_index]
scaler = MinMaxScaler()
X = scaler.fit_transform(trainData)

In [23]:
#split the dataset into training and testing subsets (60% / 40%, fixed seed)
#NOTE(review): none of the visible cells read this hold-out split before the
#cross-validation loops rebind X_train/X_test/y_train/y_test — confirm it is
#still needed
X_train, X_test, y_train, y_test = train_test_split(
    X,           # Features (input variables)
    y,           # Target variable (output/labels)
    test_size=0.4, # Proportion of the dataset to include in the test split (40%)
    random_state=0 # Random seed for reproducibility of the split
)

In [24]:
np.random.seed(7)

#model hyperparameters:
#{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 1.0}
clf = XGBClassifier(
    gamma=0.0,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=1000,
    reg_lambda=1.0,
)

#scorer that reports the weighted-average F1 across classes
f1_scorer = make_scorer(f1_score, average='weighted')

#six stratified shuffle splits with a fixed seed
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one weighted F1 score per split
f1_scores = cross_val_score(clf, X, y, cv=kf, scoring=f1_scorer)
data = {'F1_Scores': f1_scores}

#mean and standard error (sample std with ddof=1 over sqrt of the number of splits)
n_scores = len(f1_scores)
mean_f1 = f1_scores.mean()
std_error_f1 = f1_scores.std(ddof=1) / np.sqrt(n_scores)

#single-row summary for this model
df = pd.DataFrame({
    'Model': ['xgboost model, xgboost and random forest feature Pfams, training data without contamination and low sequence abundance, updated software'],
    'Mean_F1_Score': [mean_f1],
    'Std_Error_F1_Score': [std_error_f1]
})

#shared results file: created with a header if it does not exist yet,
#otherwise the row is appended without a header
csv_file_path = 'model_overall_f1_scores.csv'
is_new_file = not os.path.exists(csv_file_path)
df.to_csv(csv_file_path, mode='w' if is_new_file else 'a', header=is_new_file, index=False)

In [25]:
np.random.seed(7)

#{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 1.0}
clf = XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=3, n_estimators=1000, reg_lambda=1.0)

#encoded class labels present in y; passed to f1_score so every fold returns
#one score per class in the same order, even if a class is absent from a fold
class_labels = np.unique(y)

#per-class F1 scoring function (average=None returns one score per label)
def f1_scorer(y_true, y_pred):
    """Return an array with one F1 score per entry of class_labels."""
    return f1_score(y_true, y_pred, labels=class_labels, average=None)

#create a StratifiedShuffleSplit cross-validator
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one array of per-class F1 scores per fold
all_f1_scores = []

#fit on each training split and score the matching test split
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    fold_f1_scores = f1_scorer(y_test, y_pred)
    all_f1_scores.append(fold_f1_scores)

#rows are folds, columns are classes
f1_scores = np.array(all_f1_scores)

#save the F1 scores for each trophic mode separately
trophic_names = [f'Trophic_{i}' for i in range(f1_scores.shape[1])]
data = {trophic_name: f1_scores[:, i] for i, trophic_name in enumerate(trophic_names)}

#prepare a list to store results
results = []

#mean and standard error per class; the loop variable is named class_scores so
#the fold-by-class matrix in f1_scores is not overwritten by the last class
for class_name, class_scores in data.items():
    mean_f1 = np.mean(class_scores)
    std_error_f1 = np.std(class_scores, ddof=1) / np.sqrt(len(class_scores))  # Standard error
    results.append({
        'Model': 'xgboost model, xgboost and random forest feature Pfams, training data without contamination and low sequence abundance, updated software',
        'Trophic mode': class_name,
        'Mean_F1_Score': mean_f1,
        'Std_Error_F1_Score': std_error_f1
    })

#create a DataFrame from the results
df = pd.DataFrame(results)

#replace Trophic values with desired labels
#NOTE(review): assumes the encoded classes 0/1/2 correspond to Het/Mix/Phot — confirm against le.classes_
df['Trophic mode'] = df['Trophic mode'].replace({'Trophic_0': 'Het', 'Trophic_1': 'Mix', 'Trophic_2': 'Phot'})

#define output csv path
csv_file_path = 'models_byClass_f1_scores.csv'

#append to the shared file; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

In [26]:
#random forest model, random forest model feature pfams
#training data without contamination or low sequence abundance 
#updated software

In [27]:
#start again from the full training table already held in `train`: keep every
#column from the third onward (per the original note the two skipped columns
#are the MMETSP entry IDs and trophic mode labels)
trainData = train.iloc[:, 2:]

In [28]:
#load the table of random forest feature Pfams from the working directory
features = pd.read_csv('Extracted_Pfams_contaminationLowSeqsRemoved_rfModel_rfFeatures.csv')

In [29]:
#keep only the 'pfam' column of the features table; it is used below to
#select columns of the training data
features = features['pfam']

In [30]:
#fresh LabelEncoder for this model; it is fitted later, where y is built
#(trophic labels become integer codes, 0, 1, 2 per the original note)
le = LabelEncoder()

In [31]:
#restrict the training table to the columns named in `features` (the feature Pfams)
trainData = trainData[features]

In [32]:
#target vector: fit the LabelEncoder on the 'Trophic mode' column and
#integer-encode the labels
y = le.fit_transform(train['Trophic mode'])

#feature matrix: fit a MinMaxScaler on the selected feature Pfam columns and
#rescale them (default output range is [0, 1]); later cells index the scaled
#X positionally with X[train_index]
scaler = MinMaxScaler()
X = scaler.fit_transform(trainData)

In [33]:
#split the dataset into training and testing subsets (60% / 40%, fixed seed)
#NOTE(review): none of the visible cells read this hold-out split before the
#cross-validation loops rebind X_train/X_test/y_train/y_test — confirm it is
#still needed
X_train, X_test, y_train, y_test = train_test_split(
    X,           # Features (input variables)
    y,           # Target variable (output/labels)
    test_size=0.4, # Proportion of the dataset to include in the test split (40%)
    random_state=0 # Random seed for reproducibility of the split
)

In [34]:
#the forest below is created without random_state, so this global numpy seed
#is the only seed set for it in this cell
np.random.seed(7)

#model hyperparameters:
#{'max_depth': 1000, 'min_samples_leaf': 5, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10}
clf = RandomForestClassifier(
    max_depth=1000,
    min_samples_leaf=5,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=10,
)

#scorer that reports the weighted-average F1 across classes
f1_scorer = make_scorer(f1_score, average='weighted')

#six stratified shuffle splits with a fixed seed
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one weighted F1 score per split
f1_scores = cross_val_score(clf, X, y, cv=kf, scoring=f1_scorer)
data = {'F1_Scores': f1_scores}

#mean and standard error (sample std with ddof=1 over sqrt of the number of splits)
n_scores = len(f1_scores)
mean_f1 = f1_scores.mean()
std_error_f1 = f1_scores.std(ddof=1) / np.sqrt(n_scores)

#single-row summary for this model
df = pd.DataFrame({
    'Model': ['random forest model, random forest feature Pfams, training data without contamination and low sequence abundance, updated software'],
    'Mean_F1_Score': [mean_f1],
    'Std_Error_F1_Score': [std_error_f1]
})

#shared results file: created with a header if it does not exist yet,
#otherwise the row is appended without a header
csv_file_path = 'model_overall_f1_scores.csv'
is_new_file = not os.path.exists(csv_file_path)
df.to_csv(csv_file_path, mode='w' if is_new_file else 'a', header=is_new_file, index=False)

In [35]:
#the forest below is created without random_state, so this global numpy seed
#is the only seed set for it in this cell
np.random.seed(7)

#{'max_depth': 1000, 'min_samples_leaf': 5, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10}
clf = RandomForestClassifier(max_depth=1000,min_samples_leaf=5,min_samples_split=2,min_weight_fraction_leaf=0.0,n_estimators=10)

#encoded class labels present in y; passed to f1_score so every fold returns
#one score per class in the same order, even if a class is absent from a fold
class_labels = np.unique(y)

#per-class F1 scoring function (average=None returns one score per label)
def f1_scorer(y_true, y_pred):
    """Return an array with one F1 score per entry of class_labels."""
    return f1_score(y_true, y_pred, labels=class_labels, average=None)

#create a StratifiedShuffleSplit cross-validator
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one array of per-class F1 scores per fold
all_f1_scores = []

#fit on each training split and score the matching test split
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    fold_f1_scores = f1_scorer(y_test, y_pred)
    all_f1_scores.append(fold_f1_scores)

#rows are folds, columns are classes
f1_scores = np.array(all_f1_scores)

#save the F1 scores for each trophic mode separately
trophic_names = [f'Trophic_{i}' for i in range(f1_scores.shape[1])]
data = {trophic_name: f1_scores[:, i] for i, trophic_name in enumerate(trophic_names)}

#prepare a list to store results
results = []

#mean and standard error per class; the loop variable is named class_scores so
#the fold-by-class matrix in f1_scores is not overwritten by the last class
for class_name, class_scores in data.items():
    mean_f1 = np.mean(class_scores)
    std_error_f1 = np.std(class_scores, ddof=1) / np.sqrt(len(class_scores))  # Standard error
    results.append({
        'Model': 'random forest model, random forest feature Pfams, training data without contamination and low sequence abundance, updated software',
        'Trophic mode': class_name,
        'Mean_F1_Score': mean_f1,
        'Std_Error_F1_Score': std_error_f1
    })

#create a DataFrame from the results
df = pd.DataFrame(results)

#replace Trophic values with desired labels
#NOTE(review): assumes the encoded classes 0/1/2 correspond to Het/Mix/Phot — confirm against le.classes_
df['Trophic mode'] = df['Trophic mode'].replace({'Trophic_0': 'Het', 'Trophic_1': 'Mix', 'Trophic_2': 'Phot'})

#define output csv path
csv_file_path = 'models_byClass_f1_scores.csv'

#append to the shared file; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

In [36]:
#xgboost model, xgboost features
#training data with contamination and low sequence abundance 
#updated software

In [37]:
#load the training table that, per the original note, also includes MMETSP
#transcriptomes with high contamination and low sequence abundance;
#this rebinds `train`, replacing the table loaded earlier in the notebook
train = pd.read_csv('trainingData_withContam_withLowSeqs.csv')

#keep every column from the third onward (per the original note the two
#skipped columns are the MMETSP entry IDs and trophic mode labels)
trainData = train.iloc[:, 2:]

#load the table of xgboost feature Pfams selected for this training data
#(contamination and low sequence transcriptomes included)
features = pd.read_csv('Extracted_Pfams_contaminationLowSeqsIncluded_xgModel_xgFeatures.csv')

In [38]:
#keep only the 'pfam' column of the features table; it is used below to
#select columns of the training data
features = features['pfam']

In [39]:
#fresh LabelEncoder for this model; it is fitted later, where y is built
#(trophic labels become integer codes, 0, 1, 2 per the original note)
le = LabelEncoder()

In [40]:
#restrict the training table to the columns named in `features` (the feature Pfams)
trainData = trainData[features]

In [41]:
#target vector: fit the LabelEncoder on the 'Trophic mode' column and
#integer-encode the labels
y = le.fit_transform(train['Trophic mode'])

#feature matrix: fit a MinMaxScaler on the selected feature Pfam columns and
#rescale them (default output range is [0, 1]); later cells index the scaled
#X positionally with X[train_index]
scaler = MinMaxScaler()
X = scaler.fit_transform(trainData)

In [42]:
#split the dataset into training and testing subsets (60% / 40%, fixed seed)
#NOTE(review): none of the visible cells read this hold-out split before the
#cross-validation loops rebind X_train/X_test/y_train/y_test — confirm it is
#still needed
X_train, X_test, y_train, y_test = train_test_split(
    X,           # Features (input variables)
    y,           # Target variable (output/labels)
    test_size=0.4, # Proportion of the dataset to include in the test split (40%)
    random_state=0 # Random seed for reproducibility of the split
)

In [43]:
np.random.seed(7)

#model hyperparameters:
#{'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_lambda': 0.5}
clf = XGBClassifier(
    gamma=0.5,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100,
    reg_lambda=0.5,
)

#scorer that reports the weighted-average F1 across classes
f1_scorer = make_scorer(f1_score, average='weighted')

#six stratified shuffle splits with a fixed seed
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one weighted F1 score per split
f1_scores = cross_val_score(clf, X, y, cv=kf, scoring=f1_scorer)
data = {'F1_Scores': f1_scores}

#mean and standard error (sample std with ddof=1 over sqrt of the number of splits)
n_scores = len(f1_scores)
mean_f1 = f1_scores.mean()
std_error_f1 = f1_scores.std(ddof=1) / np.sqrt(n_scores)

#single-row summary for this model
df = pd.DataFrame({
    'Model': ['xgboost model, xgboost feature Pfams, training data with contamination and low sequence abundance, updated software'],
    'Mean_F1_Score': [mean_f1],
    'Std_Error_F1_Score': [std_error_f1]
})

#shared results file: created with a header if it does not exist yet,
#otherwise the row is appended without a header
csv_file_path = 'model_overall_f1_scores.csv'
is_new_file = not os.path.exists(csv_file_path)
df.to_csv(csv_file_path, mode='w' if is_new_file else 'a', header=is_new_file, index=False)

In [44]:
np.random.seed(7)

#{'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_lambda': 0.5}
clf = XGBClassifier(gamma=0.5, learning_rate=0.1, max_depth=3, n_estimators=100, reg_lambda=0.5)

#encoded class labels present in y; passed to f1_score so every fold returns
#one score per class in the same order, even if a class is absent from a fold
class_labels = np.unique(y)

#per-class F1 scoring function (average=None returns one score per label)
def f1_scorer(y_true, y_pred):
    """Return an array with one F1 score per entry of class_labels."""
    return f1_score(y_true, y_pred, labels=class_labels, average=None)

#create a StratifiedShuffleSplit cross-validator
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one array of per-class F1 scores per fold
all_f1_scores = []

#fit on each training split and score the matching test split
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    fold_f1_scores = f1_scorer(y_test, y_pred)
    all_f1_scores.append(fold_f1_scores)

#rows are folds, columns are classes
f1_scores = np.array(all_f1_scores)

#save the F1 scores for each trophic mode separately
trophic_names = [f'Trophic_{i}' for i in range(f1_scores.shape[1])]
data = {trophic_name: f1_scores[:, i] for i, trophic_name in enumerate(trophic_names)}

#prepare a list to store results
results = []

#mean and standard error per class; the loop variable is named class_scores so
#the fold-by-class matrix in f1_scores is not overwritten by the last class
for class_name, class_scores in data.items():
    mean_f1 = np.mean(class_scores)
    std_error_f1 = np.std(class_scores, ddof=1) / np.sqrt(len(class_scores))  # Standard error
    results.append({
        'Model': 'xgboost model, xgboost feature Pfams, training data with contamination and low sequence abundance, updated software',
        'Trophic mode': class_name,
        'Mean_F1_Score': mean_f1,
        'Std_Error_F1_Score': std_error_f1
    })

#create a DataFrame from the results
df = pd.DataFrame(results)

#replace Trophic values with desired labels
#NOTE(review): assumes the encoded classes 0/1/2 correspond to Het/Mix/Phot — confirm against le.classes_
df['Trophic mode'] = df['Trophic mode'].replace({'Trophic_0': 'Het', 'Trophic_1': 'Mix', 'Trophic_2': 'Phot'})

#define output csv path
csv_file_path = 'models_byClass_f1_scores.csv'

#append to the shared file; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

In [45]:
#random forest model, random forest features
#training data with contamination or low sequence abundance 
#updated software

In [46]:
#load the table of random forest feature Pfams selected for the training data
#that includes contamination and low sequence transcriptomes
features = pd.read_csv('Extracted_Pfams_contaminationLowSeqsIncluded_rfModel_rfFeatures.csv')

In [47]:
#keep only the 'pfam' column of the features table; it is used below to
#select columns of the training data
features = features['pfam']

In [48]:
#start again from the full training table already held in `train`: keep every
#column from the third onward (per the original note the two skipped columns
#are the MMETSP entry IDs and trophic mode labels)
trainData = train.iloc[:, 2:]

In [49]:
#fresh LabelEncoder for this model; it is fitted later, where y is built
#(trophic labels become integer codes, 0, 1, 2 per the original note)
le = LabelEncoder()

In [50]:
#restrict the training table to the columns named in `features` (the feature Pfams)
trainData = trainData[features]

In [51]:
#target vector: fit the LabelEncoder on the 'Trophic mode' column and
#integer-encode the labels
y = le.fit_transform(train['Trophic mode'])

#feature matrix: fit a MinMaxScaler on the selected feature Pfam columns and
#rescale them (default output range is [0, 1]); later cells index the scaled
#X positionally with X[train_index]
scaler = MinMaxScaler()
X = scaler.fit_transform(trainData)

In [52]:
#split the dataset into training and testing subsets (60% / 40%, fixed seed)
#NOTE(review): none of the visible cells read this hold-out split before the
#cross-validation loops rebind X_train/X_test/y_train/y_test — confirm it is
#still needed
X_train, X_test, y_train, y_test = train_test_split(
    X,           # Features (input variables)
    y,           # Target variable (output/labels)
    test_size=0.4, # Proportion of the dataset to include in the test split (40%)
    random_state=0 # Random seed for reproducibility of the split
)

In [53]:
#the forest below is created without random_state, so this global numpy seed
#is the only seed set for it in this cell
np.random.seed(7)

#model hyperparameters:
#{'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1000}
clf = RandomForestClassifier(
    max_depth=None,
    min_samples_leaf=3,
    min_samples_split=5,
    min_weight_fraction_leaf=0.0,
    n_estimators=1000,
)

#scorer that reports the weighted-average F1 across classes
f1_scorer = make_scorer(f1_score, average='weighted')

#six stratified shuffle splits with a fixed seed
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one weighted F1 score per split
f1_scores = cross_val_score(clf, X, y, cv=kf, scoring=f1_scorer)
data = {'F1_Scores': f1_scores}

#mean and standard error (sample std with ddof=1 over sqrt of the number of splits)
n_scores = len(f1_scores)
mean_f1 = f1_scores.mean()
std_error_f1 = f1_scores.std(ddof=1) / np.sqrt(n_scores)

#single-row summary for this model
df = pd.DataFrame({
    'Model': ['random forest model, random forest feature Pfams, training data with contamination and low sequence abundance, updated software'],
    'Mean_F1_Score': [mean_f1],
    'Std_Error_F1_Score': [std_error_f1]
})

#shared results file: created with a header if it does not exist yet,
#otherwise the row is appended without a header
csv_file_path = 'model_overall_f1_scores.csv'
is_new_file = not os.path.exists(csv_file_path)
df.to_csv(csv_file_path, mode='w' if is_new_file else 'a', header=is_new_file, index=False)

In [54]:
#the forest below is created without random_state, so this global numpy seed
#is the only seed set for it in this cell
np.random.seed(7)

#{'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1000}
clf = RandomForestClassifier(max_depth=None,min_samples_leaf=3,min_samples_split=5,min_weight_fraction_leaf=0.0,n_estimators=1000)

#encoded class labels present in y; passed to f1_score so every fold returns
#one score per class in the same order, even if a class is absent from a fold
class_labels = np.unique(y)

#per-class F1 scoring function (average=None returns one score per label)
def f1_scorer(y_true, y_pred):
    """Return an array with one F1 score per entry of class_labels."""
    return f1_score(y_true, y_pred, labels=class_labels, average=None)

#create a StratifiedShuffleSplit cross-validator
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#one array of per-class F1 scores per fold
all_f1_scores = []

#fit on each training split and score the matching test split
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    fold_f1_scores = f1_scorer(y_test, y_pred)
    all_f1_scores.append(fold_f1_scores)

#rows are folds, columns are classes
f1_scores = np.array(all_f1_scores)

#save the F1 scores for each trophic mode separately
trophic_names = [f'Trophic_{i}' for i in range(f1_scores.shape[1])]
data = {trophic_name: f1_scores[:, i] for i, trophic_name in enumerate(trophic_names)}

#prepare a list to store results
results = []

#mean and standard error per class; the loop variable is named class_scores so
#the fold-by-class matrix in f1_scores is not overwritten by the last class
for class_name, class_scores in data.items():
    mean_f1 = np.mean(class_scores)
    std_error_f1 = np.std(class_scores, ddof=1) / np.sqrt(len(class_scores))  # Standard error
    results.append({
        'Model': 'random forest model, random forest feature Pfams, training data with contamination and low sequence abundance, updated software',
        'Trophic mode': class_name,
        'Mean_F1_Score': mean_f1,
        'Std_Error_F1_Score': std_error_f1
    })

#create a DataFrame from the results
df = pd.DataFrame(results)

#replace Trophic values with desired labels
#NOTE(review): assumes the encoded classes 0/1/2 correspond to Het/Mix/Phot — confirm against le.classes_
df['Trophic mode'] = df['Trophic mode'].replace({'Trophic_0': 'Het', 'Trophic_1': 'Mix', 'Trophic_2': 'Phot'})

#define output csv path
csv_file_path = 'models_byClass_f1_scores.csv'

#append to the shared file; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

In [55]:
#xgboost model, xgboost features
#training data without contamination or low sequence abundance
#updated software
#binary data rather than TPM

In [56]:
#reload the MarPRISM training table from the parent directory; this rebinds
#`train`, which an earlier cell pointed at a different training table
train = pd.read_csv('../trainingDataMarPRISM.csv')

#keep every column from the third onward (per the original note the two
#skipped columns are the MMETSP entry IDs and trophic mode labels); the values
#are still TPM at this point and are converted to binary in a later cell
trainData = train.iloc[:, 2:]

#load the table of feature Pfams for the binary-data model
features = pd.read_csv('Extracted_Pfams_contaminationLowSeqsRemoved_xgModel_xgFeatures_binary.csv')

In [57]:
#convert training data TPM to binary presence/absence: 1 where TPM > 0, otherwise 0
trainData = (trainData > 0).astype(int)

In [58]:
#extract pfam column from features dataframe
#NOTE: this rebinds `features` from the loaded DataFrame to its 'pfam' column,
#so re-running this cell without re-running the load cell indexes the Series instead
features = features['pfam']

In [59]:
#need to encode trophic labels as numbers (0,1,2)
#the encoder is only created here; it is fitted when the target vector is built
le = LabelEncoder()

In [60]:
#get just data for the feature Pfams from training data
#(selects the columns named in `features`; a Pfam absent from the training columns raises KeyError)
trainData = trainData[features]

In [61]:
#target vector: the 'Trophic mode' column label-encoded to integers (this also fits `le`)
y = le.fit_transform(train['Trophic mode'])

#scaler that maps each feature column onto MinMaxScaler's default [0, 1] range
scaler = MinMaxScaler()

#feature matrix: the selected Pfam columns of trainData, scaled
#(fit and transform are both done on the full data set)
X = scaler.fit_transform(trainData)

In [62]:
#hold out 40% of the samples as a test subset (the remaining 60% form the training subset);
#random_state=0 fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)


In [63]:
#seed NumPy's global random number generator
np.random.seed(7)

#hyperparameters: {'gamma': 0.0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 0.5}
clf = XGBClassifier(
    gamma=0.0,
    learning_rate=0.05,
    max_depth=3,
    n_estimators=1000,
    reg_lambda=0.5,
)

#scorer computing the support-weighted average F1 across classes
f1_scorer = make_scorer(f1_score, average='weighted')

#six stratified shuffle splits with a fixed seed
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#cross-validate: one weighted F1 score per split
f1_scores = cross_val_score(clf, X, y, cv=kf, scoring=f1_scorer)

#keep the per-split scores keyed by name
data = {'F1_Scores': f1_scores}

#mean and standard error (sample standard deviation / sqrt(number of splits))
mean_f1 = np.mean(f1_scores)
std_error_f1 = np.std(f1_scores, ddof=1) / np.sqrt(f1_scores.size)

#single-row summary table for this model configuration
df = pd.DataFrame([{
    'Model': 'xgboost model, xgboost feature Pfams, training data without contamination and low sequence abundance, updated software, binary',
    'Mean_F1_Score': mean_f1,
    'Std_Error_F1_Score': std_error_f1,
}])

#output csv shared by all model configurations
csv_file_path = 'model_overall_f1_scores.csv'

#append the row; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

In [64]:
#seed NumPy's global random number generator
np.random.seed(7)

#{'gamma': 0.0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 1000, 'reg_lambda': 0.5} 
clf = XGBClassifier(gamma=0.0, learning_rate=0.05, max_depth=3, n_estimators=1000, reg_lambda=0.5)

#sorted encoded class labels; used to pin the length and order of the per-class F1 vector
class_labels = np.unique(y)

#define a custom scoring function for per-class F1 scores
def f1_scorer(y_true, y_pred):
    """Return one F1 score per class, ordered like `class_labels`.

    `labels` is passed explicitly so every fold yields a vector of the same
    length and order, even when a class is absent from both the fold's test
    labels and its predictions (without it, f1_score would drop that class and
    shift the remaining columns).
    """
    return f1_score(y_true, y_pred, labels=class_labels, average=None)

#create a StratifiedShuffleSplit cross-validator
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#initialize an empty list to store F1 scores for each class
all_f1_scores = []

#perform cross-validation and store the F1 scores for each fold
#(this overwrites X_train/X_test/y_train/y_test from the earlier train_test_split cell)
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    fold_f1_scores = f1_scorer(y_test, y_pred)
    all_f1_scores.append(fold_f1_scores)

#convert the list of arrays to a (folds x classes) numpy array
f1_scores = np.array(all_f1_scores)

#save the F1 scores for each trophic mode separately (column i -> 'Trophic_i')
trophic_names = [f'Trophic_{i}' for i in range(f1_scores.shape[1])]
data = {trophic_name: f1_scores[:, i] for i, trophic_name in enumerate(trophic_names)}

#prepare a list to store results
results = []

#loop through the data dictionary and calculate mean and standard error for each class
#(the loop variable is deliberately not named f1_scores so the fold x class matrix is not clobbered)
for class_name, class_scores in data.items():
    mean_f1 = np.mean(class_scores)
    std_error_f1 = np.std(class_scores, ddof=1) / np.sqrt(len(class_scores))  # Standard error
    results.append({
        'Model': 'xgboost model, xgboost feature Pfams, training data without contamination and low sequence abundance, updated software, binary',
        'Trophic mode': class_name,
        'Mean_F1_Score': mean_f1,
        'Std_Error_F1_Score': std_error_f1
    })

#create a DataFrame from the results
df = pd.DataFrame(results)

#replace Trophic values with desired labels
#NOTE(review): assumes the label encoder's sorted classes are Het, Mix, Phot in that order - confirm against le.classes_
df['Trophic mode'] = df['Trophic mode'].replace({'Trophic_0': 'Het', 'Trophic_1': 'Mix', 'Trophic_2': 'Phot'})

#define output csv path
csv_file_path = 'models_byClass_f1_scores.csv'

#append the rows; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

In [65]:
#xgboost model, xgboost features
#training data without contamination or low sequence abundance
#updated software
#micromonas mix labels in training data converted to phot labels

In [66]:
#load training data and labels from the micromonas mix-to-phot relabelled training CSV
train = pd.read_csv('trainingData_micromonasMixToPhot.csv')

#keep every column from the third onward: the TPM values
#(the first two columns are expected to be the MMETSP entry IDs and trophic mode labels)
trainData = train.iloc[:, 2:]

#load the feature Pfams selected for the micromonas mix-to-phot xgboost model
features = pd.read_csv('Extracted_Pfams_contaminationLowSeqsRemoved_xgModel_xgFeatures_micromonasMixToPhot.csv')

In [67]:
#extract pfam column from features dataframe
#NOTE: this rebinds `features` from the loaded DataFrame to its 'pfam' column,
#so re-running this cell without re-running the load cell indexes the Series instead
features = features['pfam']

In [68]:
#need to encode trophic labels as numbers (0,1,2)
#a fresh encoder is created here; it is fitted when the target vector is built
le = LabelEncoder()

In [69]:
#get just data for the feature Pfams from training data
#(selects the columns named in `features`; a Pfam absent from the training columns raises KeyError)
trainData = trainData[features]

In [70]:
#target vector: the 'Trophic mode' column label-encoded to integers (this also fits `le`)
y = le.fit_transform(train['Trophic mode'])

#scaler that maps each feature column onto MinMaxScaler's default [0, 1] range
scaler = MinMaxScaler()

#feature matrix: the selected Pfam columns of trainData, scaled
#(fit and transform are both done on the full data set)
X = scaler.fit_transform(trainData)

In [71]:
#hold out 40% of the samples as a test subset (the remaining 60% form the training subset);
#random_state=0 fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [72]:
#seed NumPy's global random number generator
np.random.seed(7)

#hyperparameters: {'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 1000, 'reg_lambda': 1.0}
clf = XGBClassifier(
    gamma=0.0,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=1000,
    reg_lambda=1.0,
)

#scorer computing the support-weighted average F1 across classes
f1_scorer = make_scorer(f1_score, average='weighted')

#six stratified shuffle splits with a fixed seed
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#cross-validate: one weighted F1 score per split
f1_scores = cross_val_score(clf, X, y, cv=kf, scoring=f1_scorer)

#keep the per-split scores keyed by name
data = {'F1_Scores': f1_scores}

#mean and standard error (sample standard deviation / sqrt(number of splits))
mean_f1 = np.mean(f1_scores)
std_error_f1 = np.std(f1_scores, ddof=1) / np.sqrt(f1_scores.size)

#single-row summary table for this model configuration
df = pd.DataFrame([{
    'Model': 'xgboost model, xgboost feature Pfams, training data without contamination and low sequence abundance, updated software, micromonas mix to phot',
    'Mean_F1_Score': mean_f1,
    'Std_Error_F1_Score': std_error_f1,
}])

#output csv shared by all model configurations
csv_file_path = 'model_overall_f1_scores.csv'

#append the row; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

In [73]:
#seed NumPy's global random number generator
np.random.seed(7)

#{'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 1000, 'reg_lambda': 1.0} 
clf = XGBClassifier(gamma=0.0, learning_rate=0.1, max_depth=10, n_estimators=1000, reg_lambda=1.0)

#sorted encoded class labels; used to pin the length and order of the per-class F1 vector
class_labels = np.unique(y)

#define a custom scoring function for per-class F1 scores
def f1_scorer(y_true, y_pred):
    """Return one F1 score per class, ordered like `class_labels`.

    `labels` is passed explicitly so every fold yields a vector of the same
    length and order, even when a class is absent from both the fold's test
    labels and its predictions (without it, f1_score would drop that class and
    shift the remaining columns).
    """
    return f1_score(y_true, y_pred, labels=class_labels, average=None)

#create a StratifiedShuffleSplit cross-validator
kf = StratifiedShuffleSplit(n_splits=6, random_state=7)

#initialize an empty list to store F1 scores for each class
all_f1_scores = []

#perform cross-validation and store the F1 scores for each fold
#(this overwrites X_train/X_test/y_train/y_test from the earlier train_test_split cell)
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    fold_f1_scores = f1_scorer(y_test, y_pred)
    all_f1_scores.append(fold_f1_scores)

#convert the list of arrays to a (folds x classes) numpy array
f1_scores = np.array(all_f1_scores)

#save the F1 scores for each trophic mode separately (column i -> 'Trophic_i')
trophic_names = [f'Trophic_{i}' for i in range(f1_scores.shape[1])]
data = {trophic_name: f1_scores[:, i] for i, trophic_name in enumerate(trophic_names)}

#prepare a list to store results
results = []

#loop through the data dictionary and calculate mean and standard error for each class
#(the loop variable is deliberately not named f1_scores so the fold x class matrix is not clobbered)
for class_name, class_scores in data.items():
    mean_f1 = np.mean(class_scores)
    std_error_f1 = np.std(class_scores, ddof=1) / np.sqrt(len(class_scores))  # Standard error
    results.append({
        #plain string (not a one-element list) so the CSV 'Model' column matches the other model sections
        'Model': 'xgboost model, xgboost feature Pfams, training data without contamination and low sequence abundance, updated software, micromonas mix to phot',
        'Trophic mode': class_name,
        'Mean_F1_Score': mean_f1,
        'Std_Error_F1_Score': std_error_f1
    })

#create a DataFrame from the results
df = pd.DataFrame(results)

#replace Trophic values with desired labels
#NOTE(review): assumes the label encoder's sorted classes are Het, Mix, Phot in that order - confirm against le.classes_,
#especially since this training set has had some Mix labels converted to Phot
df['Trophic mode'] = df['Trophic mode'].replace({'Trophic_0': 'Het', 'Trophic_1': 'Mix', 'Trophic_2': 'Phot'})

#define output csv path
csv_file_path = 'models_byClass_f1_scores.csv'

#append the rows; the header is written only when the file does not exist yet
df.to_csv(csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)