In [6]:
#load packages
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier

In [7]:
#read the MarPRISM training table
train = pd.read_csv('trainingDataMarPRISM.csv')

#skip the two leading columns (MMETSP entry IDs and trophic mode labels)
#so that only the TPM values remain
first_tpm_column = 2
trainData = train.iloc[:, first_tpm_column:]

#read the table listing the feature Pfams used by the model
features = pd.read_csv('MarPRISM_featurePfams.csv')

#read your own dataset: Pfam transcripts per million (TPM),
#one row per species bin and sample pair
data = pd.read_csv('exampleDataset.csv')

In [8]:
#strip the version suffix from Pfam IDs: keep only the text before the first period
#column names without a period are left unchanged
data.columns = [name.partition('.')[0] for name in data.columns]

In [9]:
#read the table of core transcribed genes (CTGs) for eukaryotes
ctg_table_path = 'MarFERReT.v1.core_genes_eukaryota.csv'
coreCTGs = pd.read_csv(ctg_table_path)

In [10]:
#strip the version suffix from the CTG Pfam IDs: keep only the text before the first period
versioned_ids = coreCTGs['pfam_id']
coreCTGs['pfam_id'] = versioned_ids.str.split('.', n=1).str.get(0)

In [11]:
##print warning if any of the species bin sample pairs have less than 70%
##of the CTGs expressed, and drop those pairs so no prediction is made for them

#minimum proportion of CTGs with TPM > 0 needed to keep a species bin sample pair
MIN_CTG_PROPORTION = 0.7

#unique, non-missing CTG Pfam IDs; a repeated or empty ID must not inflate the denominator
ctg_ids = coreCTGs['pfam_id'].dropna().drop_duplicates().tolist()

#filter columns in `data` based on `pfam_id` values in `coreCTGs`
#.copy() makes this an independent frame, so adding a column below never writes into a slice of `data`
filtered_columns = ['sample'] + ctg_ids
filtered_data = data.loc[:, data.columns.intersection(filtered_columns)].copy()

#count expressed CTGs per row; the CTG columns are selected by name (everything except 'sample')
#rather than by position, so the result does not depend on where 'sample' sits
ctg_expressed = (filtered_data.drop(columns=['sample']) > 0).sum(axis=1)

#proportion of all CTGs that are expressed; CTGs absent from `data` count as not expressed
filtered_data['non_zero_percentage'] = ctg_expressed / len(ctg_ids)

#identify rows below the threshold
result = filtered_data[filtered_data['non_zero_percentage'] < MIN_CTG_PROPORTION]

#print warning and the sample names and corresponding proportions for the rows being removed
if not result.empty:
    print("Warning: Some species bin sample pairs have less than 70% of CTGs expressed. These species bin sample pairs will be removed from dataframe.")
    print(result[['sample', 'non_zero_percentage']])

#remove samples with less than 70% of CTGs expressed from the original dataframe
#.copy() so later cells can add columns to `data` without a SettingWithCopyWarning
data = data[~data['sample'].isin(result['sample'])].copy()

In [12]:
#reduce the features table to its single column of Pfam IDs
#NOTE: `features` is rebound from a DataFrame to a Series here, so this cell cannot be run twice
pfam_column = 'pfam'
features = features[pfam_column]

In [13]:
#work out which feature Pfams have no column in your dataframe
original_features = set(data.columns)
missing_features = {pfam for pfam in features if pfam not in original_features}

In [14]:
#fill in the TPM as 0 for the Pfams missing from your dataframe
#all missing columns are built as one block and attached with a single concat;
#inserting them one by one fragments the frame (pandas PerformanceWarning) and
#writes into the filtered `data` slice
#columns already present are skipped, so re-running the cell adds nothing twice
new_columns = [pfam for pfam in missing_features if pfam not in data.columns]
if new_columns:
    zero_block = pd.DataFrame(0, index=data.index, columns=new_columns)
    data = pd.concat([data, zero_block], axis=1)

In [15]:
#keep the sample IDs aside so they can be attached to the predictions later
#the column is taken by its name 'sample', the same name the CTG filtering step uses,
#instead of by position, so a 'sample' column that is not first cannot be confused
#with a TPM column
samples = data['sample']

#exclude column with sample IDs
#NOTE: `data` is rebound here; running the cell a second time raises a KeyError
#rather than silently dropping another column
data = data.drop(columns='sample')

In [16]:
#restrict your dataset to the feature Pfams, in the order given by `features`
feature_order = list(features)
data = data[feature_order]

In [17]:
#need to encode trophic labels as numbers (0,1,2)
#the encoder is only created here; it is fitted further down on the 'Trophic mode' column of `train`
le = LabelEncoder()

In [18]:
#restrict the training TPM table to the feature Pfams, in the order given by `features`
training_feature_order = list(features)
trainData = trainData[training_feature_order]

In [78]:
#target vector: the 'Trophic mode' column of the training table, label-encoded to integers
y = le.fit_transform(train['Trophic mode'])

#MinMaxScaler rescales every feature to a fixed range ([0, 1] with the default settings)
scaler = MinMaxScaler()

#feature matrix: the training TPM values for the feature Pfams, scaled;
#fitting here also stores the per-feature ranges that are reused on your own dataset
X = scaler.fit_transform(trainData)

In [79]:
#apply fitted MinMaxScaler to your dataset
#the scaler was fitted on the training features only, so your data is rescaled with the
#ranges learned from the training set rather than its own
#NOTE: `data` is overwritten with the scaled values; running this cell again would scale
#the already-scaled values a second time
data = scaler.transform(data)

In [80]:
#hyperparameters determined from gridsearch
xgb_params = {
    'gamma': 0.0,
    'learning_rate': 0.1,
    'max_depth': 3,
    'n_estimators': 1000,
    'reg_lambda': 1.0,
}

#build the classifier from those settings
model = XGBClassifier(**xgb_params)

#fit it on the scaled training features and the encoded trophic labels
model.fit(X, y)

In [81]:
#make trophic predictions with trained model
#the model was trained on label-encoded targets, so the predictions are numeric class codes
#at this point; they are converted to names further down
predictions = model.predict(data)

In [82]:
#pair every sample ID with its predicted class code in a two-column dataframe
#(`predictions` is rebound from the raw model output to this dataframe)
predictions = samples.to_frame(name='sample').assign(trophic_prediction=predictions)

In [83]:
#replace the numeric class codes with trophic mode names
#NOTE(review): this hard-codes 0 -> Het and 1 -> Mix, with every other value becoming Phot;
#it presumably mirrors the order the label encoder assigned - confirm against le.classes_
code_to_name = {0: "Het", 1: "Mix"}
predictions['trophic_prediction'] = predictions['trophic_prediction'].apply(
    lambda code: code_to_name.get(code, "Phot")
)

In [33]:
#write the sample IDs and their predicted trophic modes to CSV, leaving out the row index
output_path = 'exampleDataset_trophicPredictions.csv'
predictions.to_csv(output_path, index=False)